Skip to content

Commit a82ef20

Browse files
authored
Merge pull request #9 from b-cubed-eu/dataset-cv
Dataset cross-validation analysis
2 parents 092d742 + 0986dd6 commit a82ef20

File tree

12 files changed

+1824
-44
lines changed

12 files changed

+1824
-44
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
![GitHub](https://img.shields.io/github/license/b-cubed-eu/comp-unstructured-data)
33
[![repo status](https://www.repostatus.org/badges/latest/wip.svg)](https://www.repostatus.org/#wip)
44
![GitHub repo size](https://img.shields.io/github/repo-size/b-cubed-eu/comp-unstructured-data)
5+
[![funder](https://badgen.net/static/funder/European%20Union/f2a)](https://doi.org/10.3030/101059592)
56
<!-- badges: end -->
67

78
# Compare unstructured data

checklist.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,6 @@ spelling:
2020
- .github
2121
- LICENSE.md
2222
- data/raw/utm_grid/utm1_vl.qmd
23+
pak:
24+
- b-cubed-eu/b3gbi
25+
- b-cubed-eu/dubicube

data/raw/utm_grid/utm1_vl.cpg

Lines changed: 0 additions & 1 deletion
This file was deleted.

data/raw/utm_grid/utm1_vl.prj

Lines changed: 0 additions & 1 deletion
This file was deleted.

data/raw/utm_grid/utm1_vl.qmd

Lines changed: 0 additions & 27 deletions
This file was deleted.

inst/en_gb.dic

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,18 +16,21 @@ EOD
1616
ESAS
1717
Generis
1818
Havenlaan
19+
Inf
1920
Instituut
2021
LBBG
2122
Langeraert
2223
Laridae
2324
Larus
2425
Luscinia
2526
MGRS
27+
MRE
2628
Motacilla
2729
Natuur
2830
OOSTENDE
2931
Parus
3032
Pielou
33+
Pielou’s
3134
Poecile
3235
Rmd
3336
Rmd's

organisation.yml

Lines changed: 40 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,40 @@
1-
community: b3
2-
email: info@inbo.be
3-
github: b-cubed-eu
4-
funder: European Union's Horizon Europe Research and Innovation Programme (ID No 101059592)
5-
rightsholder: Research Institute for Nature and Forest (INBO)
6-
organisation:
7-
inbo.be:
8-
affiliation:
9-
- Research Institute for Nature and Forest (INBO)
10-
- Instituut voor Natuur- en Bosonderzoek (INBO)
11-
- Institut de Recherche sur la Nature et les Forêts (INBO)
12-
- Institut für Natur- und Waldforschung (INBO)
13-
orcid: yes
1+
checklist version: 0.5.2
2+
git: https://github.com/b-cubed-eu
3+
info@inbo.be:
4+
name:
5+
nl-BE: Instituut voor Natuur- en Bosonderzoek (INBO)
6+
fr-FR: Institut de Recherche sur la Nature et les Forêts (INBO)
7+
en-GB: Research Institute for Nature and Forest (INBO)
8+
de-DE: Institut für Natur- und Waldforschung (INBO)
9+
email: info@inbo.be
10+
website: https://www.vlaanderen.be/inbo/en-gb
11+
logo: https://inbo.github.io/checklist/reference/figures/logo-en.png
12+
ror: https://ror.org/00j54wy13
13+
orcid: yes
14+
zenodo: inbo
15+
rightsholder: optional
16+
funder: optional
17+
license:
18+
package:
19+
GPL-3: https://raw.githubusercontent.com/inbo/checklist/refs/heads/main/inst/generic_template/gplv3.md
20+
MIT: https://raw.githubusercontent.com/inbo/checklist/refs/heads/main/inst/generic_template/mit.md
21+
project:
22+
CC BY 4.0: https://raw.githubusercontent.com/inbo/checklist/refs/heads/main/inst/generic_template/cc_by_4_0.md
23+
data:
24+
CC0: https://raw.githubusercontent.com/inbo/checklist/131fe5829907079795533bfea767bf7df50c3cfd/inst/generic_template/cc0.md
25+
b-cubedsupport@meisebotanicgarden.be:
26+
name:
27+
en-GB: European Union (ID 101059592)
28+
email: b-cubedsupport@meisebotanicgarden.be
29+
website: https://b-cubed.eu/
30+
orcid: no
31+
zenodo: b3
32+
rightsholder: optional
33+
funder: single
34+
license:
35+
package:
36+
MIT: https://raw.githubusercontent.com/inbo/checklist/refs/heads/main/inst/generic_template/mit.md
37+
project:
38+
MIT: https://raw.githubusercontent.com/inbo/checklist/refs/heads/main/inst/generic_template/mit.md
39+
data:
40+
CC BY 4.0: https://raw.githubusercontent.com/inbo/checklist/refs/heads/main/inst/generic_template/cc_by_4_0.md

source/R/download_occ_cube.R

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@ download_occ_cube <- function(sql_query, file, path, overwrite = FALSE) {
66
# Stop if overwrite = FALSE and file does not exist
77
file_path <- file.path(path, file)
88
if (file.exists(file_path) && !overwrite) {
9-
message(paste("File already exists. Reading existing file.",
10-
"Set `overwrite = TRUE` to overwrite file.", sep = "\n"))
9+
message(
10+
paste("File already exists. Reading existing file.",
11+
"Set `overwrite = TRUE` to overwrite file.",
12+
sep = "\n")
13+
)
1114

1215
occ_cube <- readr::read_csv(file = file_path, show_col_types = FALSE)
1316

source/R/get_dataset_names.R

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
get_dataset_names <- function(df) {
2+
require("dplyr")
3+
require("rlang")
4+
5+
unique_datasets <- df %>%
6+
distinct(.data$datasetkey, .data$datasetname)
7+
8+
dataset_names <- sapply(as.list(unique_datasets$datasetkey), function(key) {
9+
rgbif::dataset_get(key)$title
10+
})
11+
12+
# Complete dataset
13+
full_dataset_df <- unique_datasets %>%
14+
mutate(datasetname = coalesce(.data$datasetname, dataset_names))
15+
16+
# Add dataset names
17+
df_out <- df %>%
18+
select(-"datasetname") %>%
19+
left_join(full_dataset_df, by = join_by("datasetkey"))
20+
21+
return(df_out)
22+
}

source/R/grouped_lm.R

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
#' Fit group-wise linear models and extract slope statistics
2+
#'
3+
#' Fits a simple linear regression `y ~ x` separately for each group in a
4+
#' data frame and returns per-group slope statistics.
5+
#'
6+
#' Groups are processed in the order of factor levels if the grouping variable
7+
#' is a factor; otherwise, groups are processed in the order of appearance in
8+
#' the data.
9+
#'
10+
#' Optionally, the fitted `lm` objects can be returned and a transformation
11+
#' can be applied to the response variable before model fitting.
12+
#'
13+
#' @param data A data frame containing the variables used in the analysis.
14+
#' @param group_var Character string giving the name of the grouping variable.
15+
#' @param x_var Character string giving the name of the predictor variable.
16+
#' @param y_var Character string giving the name of the response variable.
17+
#' @param conf_level Confidence level for the slope confidence interval.
18+
#' Defaults to `0.95`.
19+
#' @param y_transform Optional function applied to the response variable
20+
#' before fitting the model (e.g. `log`, `sqrt`,
21+
#' `function(y) log(y + 1)`). Defaults to `NULL`.
22+
#' @param return_lm Logical; if `TRUE`, the fitted `lm` objects are
23+
#' returned in addition to the summary statistics. Defaults to `FALSE`.
24+
#'
25+
#' @return
26+
#' If `return_lm = FALSE`, a data frame with one row per group.
27+
#' If `return_lm = TRUE`, a list with components `coefficients`
28+
#' and `models`.
29+
grouped_lm <- function(data,
30+
group_var,
31+
x_var,
32+
y_var,
33+
conf_level = 0.95,
34+
y_transform = NULL,
35+
return_lm = FALSE) {
36+
37+
group_vec <- data[[group_var]]
38+
39+
# Determine group order:
40+
# - factor: use factor levels
41+
# - otherwise: use order of appearance
42+
if (is.factor(group_vec)) {
43+
groups <- levels(group_vec)
44+
} else {
45+
groups <- unique(group_vec)
46+
}
47+
48+
results <- vector("list", length(groups))
49+
names(results) <- groups
50+
51+
lm_list <- if (return_lm) vector("list", length(groups)) else NULL
52+
if (return_lm) names(lm_list) <- groups
53+
54+
for (i in seq_along(groups)) {
55+
g <- groups[i]
56+
57+
# Subset data for current group
58+
df_g <- data[group_vec == g, ]
59+
60+
# Skip empty factor levels (can happen with unused levels)
61+
if (nrow(df_g) == 0) {
62+
next
63+
}
64+
65+
y <- df_g[[y_var]]
66+
67+
# Optional response transformation
68+
if (!is.null(y_transform)) {
69+
y <- y_transform(y)
70+
}
71+
72+
# Fit linear model
73+
fit <- lm(y ~ df_g[[x_var]])
74+
sm <- summary(fit)
75+
76+
# Extract slope statistics
77+
slope <- coef(sm)[2, "Estimate"]
78+
se <- coef(sm)[2, "Std. Error"]
79+
p_value <- coef(sm)[2, "Pr(>|t|)"]
80+
ci <- confint(fit, level = conf_level)[2, ]
81+
82+
results[[i]] <- data.frame(
83+
group = g,
84+
slope = slope,
85+
se = se,
86+
conf_low = ci[1],
87+
conf_high = ci[2],
88+
p_value = p_value,
89+
row.names = NULL
90+
)
91+
92+
if (return_lm) {
93+
lm_list[[i]] <- fit
94+
}
95+
}
96+
97+
# Remove empty entries (unused factor levels)
98+
results <- Filter(Negate(is.null), results)
99+
result_df <- do.call(rbind, results)
100+
101+
if (return_lm) {
102+
return(list(
103+
coefficients = result_df,
104+
models = lm_list[names(lm_list) %in% result_df$group]
105+
))
106+
}
107+
108+
result_df
109+
}

0 commit comments

Comments
 (0)