Merge pull request #9 from b-cubed-eu/dataset-cv

wlangera · web-flow · commit a82ef20185cd · 2026-01-20T09:44:15.000+01:00
Dataset cross-validation analysis
diff --git a/README.md b/README.md
@@ -2,6 +2,7 @@
 ![GitHub](https://img.shields.io/github/license/b-cubed-eu/comp-unstructured-data)
 [![repo status](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip)
 ![GitHub repo size](https://img.shields.io/github/repo-size/b-cubed-eu/comp-unstructured-data)
+[![funder](https://badgen.net/static/funder/European%20Union/f2a)](https://doi.org/10.3030/101059592)
 <!-- badges: end -->
 
 # Compare unstructured data
diff --git a/checklist.yml b/checklist.yml
@@ -20,3 +20,6 @@ spelling:
   - .github
   - LICENSE.md
   - data/raw/utm_grid/utm1_vl.qmd
+pak:
+- b-cubed-eu/b3gbi
+- b-cubed-eu/dubicube
diff --git a/data/raw/utm_grid/utm1_vl.cpg b/data/raw/utm_grid/utm1_vl.cpg
diff --git a/data/raw/utm_grid/utm1_vl.prj b/data/raw/utm_grid/utm1_vl.prj
diff --git a/data/raw/utm_grid/utm1_vl.qmd b/data/raw/utm_grid/utm1_vl.qmd
diff --git a/inst/en_gb.dic b/inst/en_gb.dic
@@ -16,18 +16,21 @@ EOD
 ESAS
 Generis
 Havenlaan
+Inf
 Instituut
 LBBG
 Langeraert
 Laridae
 Larus
 Luscinia
 MGRS
+MRE
 Motacilla
 Natuur
 OOSTENDE
 Parus
 Pielou
+Pielou’s
 Poecile
 Rmd
 Rmd's
diff --git a/organisation.yml b/organisation.yml
@@ -1,13 +1,40 @@
-community: b3
-email: info@inbo.be
-github: b-cubed-eu
-funder: European Union's Horizon Europe Research and Innovation Programme (ID No 101059592)
-rightsholder: Research Institute for Nature and Forest (INBO)
-organisation:
-  inbo.be:
-    affiliation:
-    - Research Institute for Nature and Forest (INBO)
-    - Instituut voor Natuur- en Bosonderzoek (INBO)
-    - Institut de Recherche sur la Nature et les Forêts (INBO)
-    - Institut für Natur- und Waldforschung (INBO)
-    orcid: yes
+checklist version: 0.5.2
+git: https://github.com/b-cubed-eu
+info@inbo.be:
+  name:
+    nl-BE: Instituut voor Natuur- en Bosonderzoek (INBO)
+    fr-FR: Institut de Recherche sur la Nature et les Forêts (INBO)
+    en-GB: Research Institute for Nature and Forest (INBO)
+    de-DE: Institut für Natur- und Waldforschung (INBO)
+  email: info@inbo.be
+  website: https://www.vlaanderen.be/inbo/en-gb
+  logo: https://inbo.github.io/checklist/reference/figures/logo-en.png
+  ror: https://ror.org/00j54wy13
+  orcid: yes
+  zenodo: inbo
+  rightsholder: optional
+  funder: optional
+  license:
+    package:
+      GPL-3: https://raw.githubusercontent.com/inbo/checklist/refs/heads/main/inst/generic_template/gplv3.md
+      MIT: https://raw.githubusercontent.com/inbo/checklist/refs/heads/main/inst/generic_template/mit.md
+    project:
+      CC BY 4.0: https://raw.githubusercontent.com/inbo/checklist/refs/heads/main/inst/generic_template/cc_by_4_0.md
+    data:
+      CC0: https://raw.githubusercontent.com/inbo/checklist/131fe5829907079795533bfea767bf7df50c3cfd/inst/generic_template/cc0.md
+b-cubedsupport@meisebotanicgarden.be:
+  name:
+    en-GB: European Union (ID 101059592)
+  email: b-cubedsupport@meisebotanicgarden.be
+  website: https://b-cubed.eu/
+  orcid: no
+  zenodo: b3
+  rightsholder: optional
+  funder: single
+  license:
+    package:
+      MIT: https://raw.githubusercontent.com/inbo/checklist/refs/heads/main/inst/generic_template/mit.md
+    project:
+      MIT: https://raw.githubusercontent.com/inbo/checklist/refs/heads/main/inst/generic_template/mit.md
+    data:
+      CC BY 4.0: https://raw.githubusercontent.com/inbo/checklist/refs/heads/main/inst/generic_template/cc_by_4_0.md
diff --git a/source/R/download_occ_cube.R b/source/R/download_occ_cube.R
@@ -6,8 +6,11 @@ download_occ_cube <- function(sql_query, file, path, overwrite = FALSE) {
   # Stop if overwrite = FALSE and file does not exist
   file_path <- file.path(path, file)
   if (file.exists(file_path) && !overwrite) {
-    message(paste("File already exists. Reading existing file.",
-                  "Set `overwrite = TRUE` to overwrite file.", sep = "\n"))
+    message(
+      paste("File already exists. Reading existing file.",
+            "Set `overwrite = TRUE` to overwrite file.",
+            sep = "\n")
+    )
 
     occ_cube <- readr::read_csv(file = file_path, show_col_types = FALSE)
 
diff --git a/source/R/get_dataset_names.R b/source/R/get_dataset_names.R
@@ -0,0 +1,22 @@
+#' Fill in missing dataset names from GBIF
+#'
+#' Completes the `datasetname` column of a data frame. A name that is already
+#' present in the data for a given `datasetkey` is reused; only keys without
+#' any name are looked up with `rgbif::dataset_get()`, so no request is made
+#' when every key already has a name.
+#'
+#' @param df A data frame with columns `datasetkey` and `datasetname`.
+#'
+#' @return `df` with the same rows in the same order. The `datasetname`
+#'   column is moved to the last position and its missing values are filled
+#'   in. If a key occurs with several names, the first non-missing name in
+#'   the data is used for all rows of that key.
+get_dataset_names <- function(df) {
+  missing_cols <- setdiff(c("datasetkey", "datasetname"), names(df))
+  if (length(missing_cols) > 0) {
+    stop(
+      "Columns not found in `df`: ", paste(missing_cols, collapse = ", "),
+      call. = FALSE
+    )
+  }
+
+  dataset_keys <- unique(df$datasetkey)
+
+  # Reuse names already present in the data: first non-missing name per key.
+  # Building exactly one name per key avoids duplicating rows when a key
+  # occurs both with and without a name.
+  has_name <- !is.na(df$datasetname)
+  known_idx <- match(dataset_keys, df$datasetkey[has_name])
+  dataset_names <- as.character(df$datasetname[has_name][known_idx])
+
+  # Query GBIF only for keys that have no name at all. vapply() enforces one
+  # character title per key and stays type-stable for zero keys.
+  unnamed <- is.na(dataset_names)
+  dataset_names[unnamed] <- vapply(
+    as.list(dataset_keys[unnamed]),
+    function(key) rgbif::dataset_get(key)$title,
+    character(1)
+  )
+
+  # Drop the incomplete column and append the completed names as last column
+  df_out <- df[, setdiff(names(df), "datasetname"), drop = FALSE]
+  df_out$datasetname <- dataset_names[match(df$datasetkey, dataset_keys)]
+
+  df_out
+}
diff --git a/source/R/grouped_lm.R b/source/R/grouped_lm.R
@@ -0,0 +1,109 @@
+#' Fit group-wise linear models and extract slope statistics
+#'
+#' Fits a simple linear regression `y ~ x` separately for each group in a
+#' data frame and returns per-group slope statistics.
+#'
+#' Groups are processed in the order of factor levels if the grouping variable
+#' is a factor; otherwise, groups are processed in the order of appearance in
+#' the data. Rows with a missing group value are ignored and unused factor
+#' levels are skipped.
+#'
+#' If the slope of a group cannot be estimated (e.g. all predictor values in
+#' the group are identical), the slope statistics of that group are `NA`
+#' instead of the whole call failing.
+#'
+#' Optionally, the fitted `lm` objects can be returned and a transformation
+#' can be applied to the response variable before model fitting.
+#'
+#' @param data A data frame containing the variables used in the analysis.
+#' @param group_var Character string giving the name of the grouping variable.
+#' @param x_var Character string giving the name of the predictor variable.
+#' @param y_var Character string giving the name of the response variable.
+#' @param conf_level Confidence level for the slope confidence interval.
+#'   Defaults to `0.95`.
+#' @param y_transform Optional function applied to the response variable
+#'   before fitting the model (e.g. `log`, `sqrt`,
+#'   `function(y) log(y + 1)`). Defaults to `NULL`.
+#' @param return_lm Logical; if `TRUE`, the fitted `lm` objects are
+#'   returned in addition to the summary statistics. Defaults to `FALSE`.
+#'
+#' @return
+#' If `return_lm = FALSE`, a data frame with one row per group and columns
+#' `group`, `slope`, `se`, `conf_low`, `conf_high` and `p_value` (`NULL` if
+#' no group contains data).
+#' If `return_lm = TRUE`, a list with components `coefficients` (the data
+#' frame described above) and `models` (a list of fitted `lm` objects named
+#' by group).
+grouped_lm <- function(data,
+                       group_var,
+                       x_var,
+                       y_var,
+                       conf_level = 0.95,
+                       y_transform = NULL,
+                       return_lm = FALSE) {
+
+  # Fail early with a clear message instead of silently returning NULL
+  # (missing group column) or failing inside lm() (missing x/y column)
+  missing_vars <- setdiff(c(group_var, x_var, y_var), names(data))
+  if (length(missing_vars) > 0) {
+    stop(
+      "Variables not found in `data`: ",
+      paste(missing_vars, collapse = ", "),
+      call. = FALSE
+    )
+  }
+
+  group_vec <- data[[group_var]]
+
+  # Determine group order:
+  # - factor: use factor levels
+  # - otherwise: use order of appearance, ignoring missing group values
+  if (is.factor(group_vec)) {
+    groups <- levels(group_vec)
+  } else {
+    groups <- unique(group_vec[!is.na(group_vec)])
+  }
+
+  results <- vector("list", length(groups))
+  names(results) <- groups
+
+  lm_list <- if (return_lm) results else NULL
+
+  for (i in seq_along(groups)) {
+    g <- groups[i]
+
+    # which() drops NA comparisons, so rows with a missing group value are
+    # not pulled in as all-NA rows
+    rows_g <- which(group_vec == g)
+
+    # Skip empty factor levels (can happen with unused levels)
+    if (length(rows_g) == 0) {
+      next
+    }
+
+    df_g <- data[rows_g, , drop = FALSE]
+    y <- df_g[[y_var]]
+
+    # Optional response transformation
+    if (!is.null(y_transform)) {
+      y <- y_transform(y)
+    }
+
+    # Fit linear model
+    fit <- lm(y ~ df_g[[x_var]])
+    coef_table <- coef(summary(fit))
+
+    # summary() drops coefficients that cannot be estimated, so the slope
+    # row is absent when the predictor is constant within the group
+    if (nrow(coef_table) >= 2) {
+      slope_stats <- coef_table[2, c("Estimate", "Std. Error", "Pr(>|t|)")]
+    } else {
+      slope_stats <- c(NA_real_, NA_real_, NA_real_)
+    }
+    # confint() keeps a row (of NA) for non-estimable coefficients
+    ci <- confint(fit, level = conf_level)[2, ]
+
+    results[[i]] <- data.frame(
+      group = g,
+      slope = slope_stats[[1]],
+      se = slope_stats[[2]],
+      conf_low = ci[[1]],
+      conf_high = ci[[2]],
+      p_value = slope_stats[[3]],
+      row.names = NULL
+    )
+
+    if (return_lm) {
+      lm_list[[i]] <- fit
+    }
+  }
+
+  # Remove empty entries (unused factor levels)
+  results <- Filter(Negate(is.null), results)
+  result_df <- do.call(rbind, results)
+
+  if (return_lm) {
+    return(list(
+      coefficients = result_df,
+      models = Filter(Negate(is.null), lm_list)
+    ))
+  }
+
+  result_df
+}
diff --git a/source/R/plot_cross_validation.R b/source/R/plot_cross_validation.R
@@ -0,0 +1,35 @@
+#' Plot species prevalence in two datasets coloured by an error measure
+#'
+#' Draws one point per species with the columns `x` and `y` of
+#' `prevalence_df` on the axes, point shape by `rarity` and colour by the
+#' chosen `measure` column of `cv_df`. Species whose measure exceeds the
+#' `quant` quantile of the measure are labelled with their name and value.
+#'
+#' Packages are called through their namespace, so nothing is attached to
+#' the search path and a missing package gives an immediate error.
+#'
+#' @param cv_df A data frame with columns `species`, `rarity` and the column
+#'   named by `measure`.
+#' @param prevalence_df A data frame with columns `species`, `rarity` and
+#'   the columns named by `x` and `y`.
+#' @param x Name of the column plotted on the x-axis. Defaults to `"abv"`.
+#' @param y Name of the column plotted on the y-axis. Defaults to
+#'   `"birdcube"`.
+#' @param measure Name of the measure column in `cv_df`. Defaults to
+#'   `"rmse"`.
+#' @param quant Probability of the quantile above which species are
+#'   labelled. Missing measure values are ignored when computing it.
+#'   Defaults to `0.9`.
+#' @param max.overlaps Passed on to `ggrepel::geom_text_repel()`.
+#'   Defaults to `20`.
+#'
+#' @return A `ggplot` object.
+plot_cross_validation <- function(
+    cv_df,
+    prevalence_df,
+    x = "abv",
+    y = "birdcube",
+    measure = "rmse",
+    quant = 0.9,
+    max.overlaps = 20) {
+  # One row per species with its measure value
+  measure_df <- dplyr::distinct(
+    cv_df,
+    .data$species, .data$rarity, !!rlang::sym(measure)
+  )
+
+  plot_df <- dplyr::left_join(
+    measure_df,
+    prevalence_df,
+    by = c("species", "rarity")
+  )
+
+  # Labelling threshold; na.rm = TRUE because quantile() errors on NA
+  plot_df <- dplyr::mutate(
+    plot_df,
+    measure_quant = stats::quantile(
+      .data[[measure]], probs = quant, na.rm = TRUE
+    )
+  )
+
+  ggplot2::ggplot(plot_df, ggplot2::aes(x = .data[[x]], y = .data[[y]])) +
+    ggplot2::geom_point(
+      ggplot2::aes(shape = .data$rarity, colour = .data[[measure]]),
+      size = 2
+    ) +
+    ggrepel::geom_text_repel(
+      # Only species above the threshold get a label (NA draws nothing)
+      ggplot2::aes(
+        label = ifelse(.data[[measure]] > .data$measure_quant,
+                       paste0(.data$species, "\n(value: ",
+                              round(.data[[measure]], 3), ")"),
+                       NA)
+      ),
+      size = 2.5, max.overlaps = max.overlaps
+    ) +
+    ggplot2::coord_cartesian(xlim = c(0, 1), ylim = c(0, 1)) +
+    ggplot2::labs(x = "Proportion of occupied grid cells\nin ABV dataset",
+                  y = "Proportion of occupied grid cells\nin cube dataset",
+                  shape = "Rarity",
+                  colour = toupper(measure)) +
+    ggplot2::scale_colour_viridis_c(option = "turbo") +
+    ggplot2::theme_minimal()
+}
diff --git a/source/dataset_bias_cv.Rmd b/source/dataset_bias_cv.Rmd