
Commit 016fe27

Author: EmmaCartuyvels1
Parent: 092d742

Add new pipeline; this will be an expansion/improvement on the exploratory pipeline.

6 files changed (+471, -0 lines)

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
# Helper: group a data frame by any tidyselect specification passed as cols.
my_group_by <- function(data, cols) {
  require("dplyr")

  group_by(data, pick({{ cols }}))
}
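
A minimal usage sketch (the data frame cube and its columns are hypothetical; the real inputs come from the targets pipeline further down): cols accepts a mix of bare column names and tidyselect helpers such as matches(), exactly as the comparison functions below pass it.

# Hypothetical cube with columns species, mgrscode, id_dataset, id_spat_res:
cube |>
  my_group_by(c(species, matches("^id_"))) |>
  summarise(n_gridcells = n_distinct(mgrscode))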

# Compare, per species, how many distinct grid cells each dataset covers,
# restricted to the species present in the dataset with the fewest species.
range_comp <- function(data) {
  require("dplyr")
  require("tidyr")

  dataset_least_species <- data |>
    group_by(.data$id_dataset) |>
    summarize(n_species = n_distinct(.data$species)) |>
    filter(.data$n_species == min(.data$n_species)) |>
    pull(.data$id_dataset)

  species_list <- data |>
    filter(.data$id_dataset == dataset_least_species) |>
    select(.data$species) |>
    distinct() |>
    pull()

  comp_range_data <- data |>
    filter(.data$species %in% species_list) |>
    group_by(pick(matches("^id_"))) |>
    mutate(tot_n_dist_gridcells = n_distinct(.data$mgrscode)) |>
    ungroup() |>
    my_group_by(c(c(.data$species,
                    .data$tot_n_dist_gridcells),
                  matches("^id_"))) |>
    summarise(n_dist_gridcells = n_distinct(.data$mgrscode)) |>
    ungroup() |>
    mutate(percentage = .data$n_dist_gridcells / .data$tot_n_dist_gridcells) |>
    pivot_wider(id_cols = c(.data$id_spat_res,
                            .data$species,
                            matches("^id_filter")),
                names_from = .data$id_dataset,
                values_from = c(.data$n_dist_gridcells, .data$percentage)) |>
    left_join(data |>
                filter(.data$id_dataset == "abv_data") |>
                distinct(.data$species, .data$category),
              by = join_by(.data$species))

  return(comp_range_data)
}

# Correlate, per species, the occurrence totals per time period (year or
# cyclus) between the abv_data and birdflanders datasets (Pearson),
# restricted to the species present in the dataset with the fewest species.
trend_comp <- function(data, time_period) {
  require("dplyr")
  require("tidyr")

  dataset_least_species <- data |>
    group_by(.data$id_dataset) |>
    summarize(n_species = n_distinct(.data$species)) |>
    filter(.data$n_species == min(.data$n_species)) |>
    pull(.data$id_dataset)

  species_list <- data |>
    filter(.data$id_dataset == dataset_least_species) |>
    select(.data$species) |>
    distinct() |>
    pull()

  trend_range_data <- data |>
    filter(.data$species %in% species_list) |>
    my_group_by(c(c(.data$species, !!sym(time_period)), matches("^id_"))) |>
    summarize(occurrence = sum(.data$n)) |>
    ungroup() |>
    pivot_wider(id_cols = c(.data$id_spat_res,
                            .data$species,
                            !!sym(time_period),
                            matches("^id_filter")),
                names_from = .data$id_dataset,
                values_from = .data$occurrence) |>
    drop_na() |>
    my_group_by(c(c(.data$species, .data$id_spat_res),
                  matches("^id_filter"))) |>
    summarise(correlation = cor(.data$abv_data,
                                .data$birdflanders,
                                method = "pearson")) |>
    ungroup() |>
    left_join(data |>
                filter(.data$id_dataset == "abv_data") |>
                distinct(.data$species, .data$category),
              by = join_by(.data$species)) |>
    mutate(time_period = time_period)

  return(trend_range_data)
}
Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
# Build the path to an interim data cube: <dataset>_cube_<spat_res>.csv.
path_to_interim <- function(path_to_data, dataset, spat_res) {
  file <- paste0(dataset, "_cube_", spat_res, ".csv")
  file.path(path_to_data, "interim", file)
}

# Read a data cube and tag it with its dataset and spatial resolution ids.
read_andid <- function(data_file, dataset, spat_res) {
  require("dplyr")

  data <- read.csv(data_file)

  output <- data |>
    mutate(id_dataset = dataset,
           id_spat_res = spat_res)

  return(output)
}

# Add the three-year cycle ("cyclus") each year belongs to
# (2007-2009 = 1, ..., 2022-2024 = 6).
add_cyclus <- function(data) {
  require("dplyr")

  output <- data |>
    mutate(cyclus = case_when(
      year >= 2007 & year <= 2009 ~ 1,
      year >= 2010 & year <= 2012 ~ 2,
      year >= 2013 & year <= 2015 ~ 3,
      year >= 2016 & year <= 2018 ~ 4,
      year >= 2019 & year <= 2021 ~ 5,
      year >= 2022 & year <= 2024 ~ 6
    ))

  return(output)
}

# Classify each species by its total number of observations.
add_category <- function(data) {
  require("dplyr")

  output <- data |>
    group_by(.data$species) |>
    mutate(n_obs = sum(.data$n)) |>
    ungroup() |>
    mutate(category = cut(.data$n_obs,
                          breaks = c(0, 10, 100, 1000, 10000, +Inf),
                          labels = c("Very rare", "Rare", "Common",
                                     "Very common", "Extremely common"),
                          right = FALSE))

  return(output)
}
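
Because right = FALSE, the breaks form left-closed intervals [0, 10), [10, 100), and so on. A small illustrative check (the observation counts are made up):

# Illustrative only: 5 obs -> "Very rare", 50 -> "Rare", 5000 -> "Very common"
cut(c(5, 50, 5000),
    breaks = c(0, 10, 100, 1000, 10000, +Inf),
    labels = c("Very rare", "Rare", "Common",
               "Very common", "Extremely common"),
    right = FALSE)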

# Keep only species that occur in the ABV bird list.
filter_1 <- function(data) {
  require("dplyr")

  abv_birds <- read.csv("./data/interim/abv_birds.csv")

  output <- data |>
    filter(.data$species %in% abv_birds$species)

  return(output)
}

#' Rules (loosely based on ABV):
#' 1) A square is only relevant if the species was observed in
#'    more than one time period
#' 2) A minimum of three relevant squares to include the species
#' 3) A minimum of a hundred observations to include the species

filter_2 <- function(data, time_period = "year") {
  require("dplyr")

  output <- data |>
    group_by(.data$mgrscode, .data$species) |>
    mutate(periods = n_distinct(!!sym(time_period))) |>
    ungroup() |>
    filter(.data$periods > 1) |>
    group_by(.data$species) |>
    mutate(squares = n_distinct(.data$mgrscode)) |>
    ungroup() |>
    filter(.data$squares > 2) |>
    group_by(.data$species) |>
    mutate(obs = n()) |>
    ungroup() |>
    filter(.data$obs > 100) |>
    mutate(id_filter_per = time_period) # record the time period used to filter

  return(output)
}

# Aggregate counts per time period and express them as a share of the
# total observations in that period.
filter_3 <- function(data, time_period = "year") {
  require("dplyr")

  output <- data |>
    group_by(.data$id_dataset,
             .data$id_spat_res,
             .data$species,
             .data$category,
             !!sym(time_period)) |>
    summarise(n = sum(.data$n)) |>
    ungroup() |>
    group_by(!!sym(time_period)) |>
    mutate(total_obs = sum(.data$n)) |>
    ungroup() |>
    mutate(n = .data$n / .data$total_obs)

  if ("id_filter_per" %in% colnames(data)) {
    output$id_filter_per <- data$id_filter_per[1]
    output$id_filter_per2 <- time_period
  } else {
    output$id_filter_per <- time_period
  }

  return(output)
}
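
A minimal sketch of how filter_2 and filter_3 chain, mirroring the filter4 target in the pipeline below (cube is a hypothetical data frame with the columns produced by read_andid, add_cyclus and add_category):

cube |>
  filter_2(time_period = "year") |>   # adds id_filter_per = "year"
  filter_3(time_period = "cyclus")    # keeps "year", adds id_filter_per2 = "cyclus"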
Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
# Load packages required to define the pipeline:
library(targets)


# Set target options:
tar_option_set(
  packages = c("tidyverse"),
  format = "qs" # Optionally set the default storage format. qs is fast.
)

targets_project_dir <- rprojroot::find_root(rprojroot::is_git_root) |>
  file.path("source/pipelines/")
path_to_data <- rprojroot::find_root(rprojroot::is_git_root) |>
  file.path("data")

tar_config_set(
  script = file.path(targets_project_dir, "exploratory_analysis", "_targets.R"),
  store = file.path(targets_project_dir, "exploratory_analysis",
                    "_targets/"),
  config = "_targets.yaml",
  project = "exploratory_analysis",
  use_crew = TRUE
)

# Run the R scripts in the R/ folder with our custom functions:
tar_source(file.path(targets_project_dir, "exploratory_analysis", "R"))

# List of targets:
list(
  tar_target(
    time_period,
    c("year", "cyclus")
  ),
  tar_target(
    spat_res,
    c("1km", "10km")
  ),
  tar_target(
    dataset,
    c("abv_data", "birdflanders")
  ),
  tarchetypes::tar_file(
    data_file,
    path_to_interim(path_to_data = path_to_data,
                    dataset = dataset,
                    spat_res = spat_res),
    pattern = cross(dataset, spat_res)
  ),
  tar_target(
    data_int1,
    read_andid(data_file, dataset, spat_res),
    pattern = map(data_file, cross(dataset, spat_res))
  ),
  tar_target(
    data_int2,
    add_cyclus(data_int1),
    pattern = map(data_int1)
  ),
  tar_target(
    data,
    add_category(data_int2),
    pattern = map(data_int2)
  ),
  tar_target(
    filter1,
    filter_1(data),
    pattern = map(data)
  ),
  tar_target(
    filter2,
    filter_2(data, time_period),
    pattern = cross(data, time_period)
  ),
  tar_target(
    filter3,
    filter_3(data, time_period),
    pattern = cross(data, time_period)
  ),
  tar_target(
    filter4,
    filter_3(filter2, time_period),
    pattern = cross(filter2, time_period)
  ),
  tar_target(
    range_comp_0,
    range_comp(data)
  ),
  tar_target(
    range_comp_1,
    range_comp(filter1)
  ),
  tar_target(
    range_comp_2,
    range_comp(filter2)
  ),
  tar_target(
    trend_comp_0,
    trend_comp(data, time_period),
    pattern = map(time_period)
  ),
  tar_target(
    trend_comp_1,
    trend_comp(filter1, time_period),
    pattern = map(time_period)
  ),
  tar_target(
    trend_comp_2,
    trend_comp(filter2, time_period),
    pattern = map(time_period)
  ),
  tar_target(
    trend_comp_3,
    trend_comp(filter3, time_period),
    pattern = map(time_period)
  ),
  tar_target(
    trend_comp_4,
    trend_comp(filter4, time_period),
    pattern = map(time_period)
  )
)
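
A minimal sketch of how this project would typically be run, assuming the _targets.yaml written by tar_config_set() above lives in the repository root (the standard multi-project setup of the targets package):

# Select the registered project, then build every target in its _targets.R:
Sys.setenv(TAR_PROJECT = "exploratory_analysis")
targets::tar_make()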
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
# CAUTION: do not edit this file by hand!
# _targets/objects/ may have large data files,
# and _targets/meta/process may have sensitive information.
# It is good practice to either commit nothing from _targets/,
# or if your data is not too sensitive,
# commit only _targets/meta/meta.
*
!.gitignore
!meta
meta/*
!meta/meta
