
Commit 016fe27

Author: EmmaCartuyvels1
Parent: 092d742

Add new pipeline; this will be an expansion/improvement on the exploratory pipeline.

6 files changed (+471, -0 lines)

Lines changed: 88 additions & 0 deletions
@@ -0,0 +1,88 @@
# Helper: group a data frame by any tidyselect specification passed as cols.
my_group_by <- function(data, cols) {
  require("dplyr")

  group_by(data, pick({{ cols }}))
}
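
A minimal usage sketch (the data frame cube and its columns are hypothetical; the real inputs come from the targets pipeline further down): cols accepts a mix of bare column names and tidyselect helpers such as matches(), exactly as the comparison functions below pass it.

# Hypothetical cube with columns species, mgrscode, id_dataset, id_spat_res:
cube |>
  my_group_by(c(species, matches("^id_"))) |>
  summarise(n_gridcells = n_distinct(mgrscode))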

# Compare, per species, how many distinct grid cells each dataset covers,
# restricted to the species present in the dataset with the fewest species.
range_comp <- function(data) {
  require("dplyr")
  require("tidyr")

  dataset_least_species <- data |>
    group_by(.data$id_dataset) |>
    summarize(n_species = n_distinct(.data$species)) |>
    filter(.data$n_species == min(.data$n_species)) |>
    pull(.data$id_dataset)

  species_list <- data |>
    filter(.data$id_dataset == dataset_least_species) |>
    select(.data$species) |>
    distinct() |>
    pull()

  comp_range_data <- data |>
    filter(.data$species %in% species_list) |>
    group_by(pick(matches("^id_"))) |>
    mutate(tot_n_dist_gridcells = n_distinct(.data$mgrscode)) |>
    ungroup() |>
    my_group_by(c(c(.data$species,
                    .data$tot_n_dist_gridcells),
                  matches("^id_"))) |>
    summarise(n_dist_gridcells = n_distinct(.data$mgrscode)) |>
    ungroup() |>
    mutate(percentage = .data$n_dist_gridcells / .data$tot_n_dist_gridcells) |>
    pivot_wider(id_cols = c(.data$id_spat_res,
                            .data$species,
                            matches("^id_filter")),
                names_from = .data$id_dataset,
                values_from = c(.data$n_dist_gridcells, .data$percentage)) |>
    left_join(data |>
                filter(.data$id_dataset == "abv_data") |>
                distinct(.data$species, .data$category),
              by = join_by(.data$species))

  return(comp_range_data)
}

# Correlate, per species, the occurrence totals per time period (year or
# cyclus) between the abv_data and birdflanders datasets (Pearson),
# restricted to the species present in the dataset with the fewest species.
trend_comp <- function(data, time_period) {
  require("dplyr")
  require("tidyr")

  dataset_least_species <- data |>
    group_by(.data$id_dataset) |>
    summarize(n_species = n_distinct(.data$species)) |>
    filter(.data$n_species == min(.data$n_species)) |>
    pull(.data$id_dataset)

  species_list <- data |>
    filter(.data$id_dataset == dataset_least_species) |>
    select(.data$species) |>
    distinct() |>
    pull()

  trend_range_data <- data |>
    filter(.data$species %in% species_list) |>
    my_group_by(c(c(.data$species, !!sym(time_period)), matches("^id_"))) |>
    summarize(occurrence = sum(.data$n)) |>
    ungroup() |>
    pivot_wider(id_cols = c(.data$id_spat_res,
                            .data$species,
                            !!sym(time_period),
                            matches("^id_filter")),
                names_from = .data$id_dataset,
                values_from = .data$occurrence) |>
    drop_na() |>
    my_group_by(c(c(.data$species, .data$id_spat_res),
                  matches("^id_filter"))) |>
    summarise(correlation = cor(.data$abv_data,
                                .data$birdflanders,
                                method = "pearson")) |>
    ungroup() |>
    left_join(data |>
                filter(.data$id_dataset == "abv_data") |>
                distinct(.data$species, .data$category),
              by = join_by(.data$species)) |>
    mutate(time_period = time_period)

  return(trend_range_data)
}
Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
# Build the path to an interim data cube: <dataset>_cube_<spat_res>.csv.
path_to_interim <- function(path_to_data, dataset, spat_res) {
  file <- paste0(dataset, "_cube_", spat_res, ".csv")
  file.path(path_to_data, "interim", file)
}

# Read a data cube and tag it with its dataset and spatial resolution ids.
read_andid <- function(data_file, dataset, spat_res) {
  require("dplyr")

  data <- read.csv(data_file)

  output <- data |>
    mutate(id_dataset = dataset,
           id_spat_res = spat_res)

  return(output)
}

# Add the three-year cycle ("cyclus") each year belongs to
# (2007-2009 = 1, ..., 2022-2024 = 6).
add_cyclus <- function(data) {
  require("dplyr")

  output <- data |>
    mutate(cyclus = case_when(
      year >= 2007 & year <= 2009 ~ 1,
      year >= 2010 & year <= 2012 ~ 2,
      year >= 2013 & year <= 2015 ~ 3,
      year >= 2016 & year <= 2018 ~ 4,
      year >= 2019 & year <= 2021 ~ 5,
      year >= 2022 & year <= 2024 ~ 6
    ))

  return(output)
}

# Classify each species by its total number of observations.
add_category <- function(data) {
  require("dplyr")

  output <- data |>
    group_by(.data$species) |>
    mutate(n_obs = sum(.data$n)) |>
    ungroup() |>
    mutate(category = cut(.data$n_obs,
                          breaks = c(0, 10, 100, 1000, 10000, +Inf),
                          labels = c("Very rare", "Rare", "Common",
                                     "Very common", "Extremely common"),
                          right = FALSE))

  return(output)
}
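
Because right = FALSE, the breaks form left-closed intervals [0, 10), [10, 100), and so on. A small illustrative check (the observation counts are made up):

# Illustrative only: 5 obs -> "Very rare", 50 -> "Rare", 5000 -> "Very common"
cut(c(5, 50, 5000),
    breaks = c(0, 10, 100, 1000, 10000, +Inf),
    labels = c("Very rare", "Rare", "Common",
               "Very common", "Extremely common"),
    right = FALSE)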

# Keep only species that occur in the ABV bird list.
filter_1 <- function(data) {
  require("dplyr")

  abv_birds <- read.csv("./data/interim/abv_birds.csv")

  output <- data |>
    filter(.data$species %in% abv_birds$species)

  return(output)
}

#' Rules (loosely based on ABV):
#' 1) A square is only relevant if the species was observed in
#'    more than one time period
#' 2) A minimum of three relevant squares to include the species
#' 3) A minimum of a hundred observations to include the species

filter_2 <- function(data, time_period = "year") {
  require("dplyr")

  output <- data |>
    group_by(.data$mgrscode, .data$species) |>
    mutate(periods = n_distinct(!!sym(time_period))) |>
    ungroup() |>
    filter(.data$periods > 1) |>
    group_by(.data$species) |>
    mutate(squares = n_distinct(.data$mgrscode)) |>
    ungroup() |>
    filter(.data$squares > 2) |>
    group_by(.data$species) |>
    mutate(obs = n()) |>
    ungroup() |>
    filter(.data$obs > 100) |>
    mutate(id_filter_per = time_period) # record the time period used to filter

  return(output)
}

# Aggregate counts per time period and express them as a share of the
# total observations in that period.
filter_3 <- function(data, time_period = "year") {
  require("dplyr")

  output <- data |>
    group_by(.data$id_dataset,
             .data$id_spat_res,
             .data$species,
             .data$category,
             !!sym(time_period)) |>
    summarise(n = sum(.data$n)) |>
    ungroup() |>
    group_by(!!sym(time_period)) |>
    mutate(total_obs = sum(.data$n)) |>
    ungroup() |>
    mutate(n = .data$n / .data$total_obs)

  if ("id_filter_per" %in% colnames(data)) {
    output$id_filter_per <- data$id_filter_per[1]
    output$id_filter_per2 <- time_period
  } else {
    output$id_filter_per <- time_period
  }

  return(output)
}
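
A minimal sketch of how filter_2 and filter_3 chain, mirroring the filter4 target in the pipeline below (cube is a hypothetical data frame with the columns produced by read_andid, add_cyclus and add_category):

cube |>
  filter_2(time_period = "year") |>   # adds id_filter_per = "year"
  filter_3(time_period = "cyclus")    # keeps "year", adds id_filter_per2 = "cyclus"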
Lines changed: 121 additions & 0 deletions
@@ -0,0 +1,121 @@
# Load packages required to define the pipeline:
library(targets)


# Set target options:
tar_option_set(
  packages = c("tidyverse"),
  format = "qs" # Optionally set the default storage format. qs is fast.
)

targets_project_dir <- rprojroot::find_root(rprojroot::is_git_root) |>
  file.path("source/pipelines/")
path_to_data <- rprojroot::find_root(rprojroot::is_git_root) |>
  file.path("data")

tar_config_set(
  script = file.path(targets_project_dir, "exploratory_analysis", "_targets.R"),
  store = file.path(targets_project_dir, "exploratory_analysis",
                    "_targets/"),
  config = "_targets.yaml",
  project = "exploratory_analysis",
  use_crew = TRUE
)

# Run the R scripts in the R/ folder with our custom functions:
tar_source(file.path(targets_project_dir, "exploratory_analysis", "R"))

# List of targets:
list(
  tar_target(
    time_period,
    c("year", "cyclus")
  ),
  tar_target(
    spat_res,
    c("1km", "10km")
  ),
  tar_target(
    dataset,
    c("abv_data", "birdflanders")
  ),
  tarchetypes::tar_file(
    data_file,
    path_to_interim(path_to_data = path_to_data,
                    dataset = dataset,
                    spat_res = spat_res),
    pattern = cross(dataset, spat_res)
  ),
  tar_target(
    data_int1,
    read_andid(data_file, dataset, spat_res),
    pattern = map(data_file, cross(dataset, spat_res))
  ),
  tar_target(
    data_int2,
    add_cyclus(data_int1),
    pattern = map(data_int1)
  ),
  tar_target(
    data,
    add_category(data_int2),
    pattern = map(data_int2)
  ),
  tar_target(
    filter1,
    filter_1(data),
    pattern = map(data)
  ),
  tar_target(
    filter2,
    filter_2(data, time_period),
    pattern = cross(data, time_period)
  ),
  tar_target(
    filter3,
    filter_3(data, time_period),
    pattern = cross(data, time_period)
  ),
  tar_target(
    filter4,
    filter_3(filter2, time_period),
    pattern = cross(filter2, time_period)
  ),
  tar_target(
    range_comp_0,
    range_comp(data)
  ),
  tar_target(
    range_comp_1,
    range_comp(filter1)
  ),
  tar_target(
    range_comp_2,
    range_comp(filter2)
  ),
  tar_target(
    trend_comp_0,
    trend_comp(data, time_period),
    pattern = map(time_period)
  ),
  tar_target(
    trend_comp_1,
    trend_comp(filter1, time_period),
    pattern = map(time_period)
  ),
  tar_target(
    trend_comp_2,
    trend_comp(filter2, time_period),
    pattern = map(time_period)
  ),
  tar_target(
    trend_comp_3,
    trend_comp(filter3, time_period),
    pattern = map(time_period)
  ),
  tar_target(
    trend_comp_4,
    trend_comp(filter4, time_period),
    pattern = map(time_period)
  )
)
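
A minimal sketch of how this project would typically be run, assuming the _targets.yaml written by tar_config_set() above lives in the repository root (the standard multi-project setup of the targets package):

# Select the registered project, then build every target in its _targets.R:
Sys.setenv(TAR_PROJECT = "exploratory_analysis")
targets::tar_make()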
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
# CAUTION: do not edit this file by hand!
# _targets/objects/ may have large data files,
# and _targets/meta/process may have sensitive information.
# It is good practice to either commit nothing from _targets/,
# or if your data is not too sensitive,
# commit only _targets/meta/meta.
*
!.gitignore
!meta
meta/*
!meta/meta
