b-cubed-eu · EmmaCartuyvels1 · Aug 26, 2025 · Mar 4, 2025 · Mar 4, 2025 · Mar 4, 2025
diff --git a/_targets.yaml b/_targets.yaml
@@ -1,8 +1,8 @@
-target_workflow:
-  script: C:/R/git_repositories/comp-unstructured-data/source/pipelines/target_workflow/_targets.R
-  store: C:/R/git_repositories/comp-unstructured-data/source/pipelines/target_workflow/_targets
-  use_crew: yes
 biodiversity_indicators:
   script: C:/R/git_repositories/comp-unstructured-data/source/pipelines/biodiversity_indicators/_targets.R
   store: C:/R/git_repositories/comp-unstructured-data/source/pipelines/biodiversity_indicators/_targets
   use_crew: yes
+exploratory_analysis:
+  script: C:/R/git_repositories/comp-unstructured-data/source/pipelines/exploratory_analysis/_targets.R
+  store: C:/R/git_repositories/comp-unstructured-data/source/pipelines/exploratory_analysis/_targets
+  use_crew: yes
diff --git a/comp-unstructured-data.Rproj b/comp-unstructured-data.Rproj
@@ -1,4 +1,5 @@
 Version: 1.0
+ProjectId: 917f1e07-7bf8-4404-b0ed-c2b02a93dc01
 
 RestoreWorkspace: Default
 SaveWorkspace: Default

diff --git a/data/raw/utm_grid/utm10_vlgrens_zBRU.dbf b/data/raw/utm_grid/utm10_vlgrens_zBRU.dbf
diff --git a/data/raw/utm_grid/utm10_vlgrens_zBRU.prj b/data/raw/utm_grid/utm10_vlgrens_zBRU.prj
@@ -0,0 +1 @@
+PROJCS["Belge_Lambert_1972",GEOGCS["GCS_Belge_1972",DATUM["D_Belge_1972",SPHEROID["International_1924",6378388.0,297.0]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic"],PARAMETER["False_Easting",150000.01256],PARAMETER["False_Northing",5400088.4378],PARAMETER["Central_Meridian",4.367486666666666],PARAMETER["Standard_Parallel_1",49.8333339],PARAMETER["Standard_Parallel_2",51.16666733333333],PARAMETER["Latitude_Of_Origin",90.0],UNIT["Meter",1.0]],VERTCS["Oostende",VDATUM["Oostende"],PARAMETER["Vertical_Shift",0.0],PARAMETER["Direction",1.0],UNIT["Meter",1.0]]
diff --git a/data/raw/utm_grid/utm10_vlgrens_zBRU.sbn b/data/raw/utm_grid/utm10_vlgrens_zBRU.sbn
diff --git a/data/raw/utm_grid/utm10_vlgrens_zBRU.sbx b/data/raw/utm_grid/utm10_vlgrens_zBRU.sbx
diff --git a/data/raw/utm_grid/utm10_vlgrens_zBRU.shp b/data/raw/utm_grid/utm10_vlgrens_zBRU.shp
diff --git a/data/raw/utm_grid/utm10_vlgrens_zBRU.shx b/data/raw/utm_grid/utm10_vlgrens_zBRU.shx
diff --git a/inst/en_gb.dic b/inst/en_gb.dic
@@ -1,10 +1,13 @@
 Algemene
+Anthus
 Bosonderzoek
 Broedvogelmonitoring
 Broedvogels
 Cartuyvels
 Cetti's
 Cettia
+Chloris
+Cyanistes
 Daele
 Databricks
 Dendrocopos
@@ -20,6 +23,7 @@ Laridae
 Larus
 Luscinia
 MGRS
+Motacilla
 Natuur
 OOSTENDE
 Parus
@@ -35,23 +39,30 @@ Watervogels
 abv
 argentatus
 birdcube
+caeruleus
 cetti
+chloris
 color
+communis
 datacube
 datacubes
+domesticus
 eBird
+flava
 fuscus
 gbi
 ies
 labeled
 megarhynchos
+modularis
 montanus
 org
 rubicola
 sublicensable
 synched
 tabset
 torquatus
+trivialis
 utm
 voor
 waarnemingen

diff --git a/source/Prepare_data_10km.Rmd b/source/Prepare_data_10km.Rmd
@@ -0,0 +1,302 @@
+---
+title: "Download and prepare ABV and cube data on a 10 km grid"
+author: "Ward Langeraert, Emma Cartuyvels"
+date: "`r Sys.Date()`"
+output:
+  html_document:
+    code_folding: show
+    toc: true
+    toc_float: true
+    toc_collapsed: true
+editor_options: 
+  chunk_output_type: console
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+```{r, warning=FALSE, message=FALSE}
+# Load packages
+library(tidyverse) # Data wrangling and visualisation
+library(zen4R)     # Download from zenodo
+library(here)      # Relative paths
+library(sf)        # Work with spatial data
+
+# Source
+source(here("source/R/download_occ_cube.R"))
+
+# Data path and create directory if necessary
+data_path <- here("data", "raw")
+dir.create(data_path, showWarnings = FALSE, recursive = TRUE)
+```
+
+# Goal
+
+Load and save the structured data of the “Common Breeding Bird Survey Flanders” (ABV), aggregated on a 10 km × 10 km grid.
+Load and save the unstructured data, aggregated on the same 10 km × 10 km grid.
+
+# Structured data
+
+## Occurrence data
+
+The ABV data is downloaded as a cube from GBIF.org.
+The zip file is stored under *./data/raw*.
+
+> GBIF.org (15 April 2025) GBIF Occurrence Download  https://doi.org/10.15468/dl.hdwm9t
+
+```{r}
+# Build the SQL query for the ABV (structured) occurrence cube.
+# Records are counted (COUNT(*) AS n) per year, MGRS grid cell (grid size
+# argument 10000) and species; a missing coordinate uncertainty is replaced
+# by 1000 through COALESCE. Only records with collectionCode 'ABV', class
+# 'Aves', level1gid 'BEL.2_1' and a year from 2007 up to 2022 are kept.
+# nolint start: line_length_linter.
+query_abv <- "SELECT
+  \"year\",
+  GBIF_MGRSCode(10000, decimalLatitude, decimalLongitude,
+  COALESCE(coordinateUncertaintyInMeters, 1000)) AS mgrsCode,
+  speciesKey,
+  species,
+  family,
+  COUNT(*) AS n,
+  MIN(COALESCE(coordinateUncertaintyInMeters, 1000)) AS minCoordinateUncertaintyInMeters,
+  IF(ISNULL(family), NULL, SUM(COUNT(*)) OVER (PARTITION BY family)) AS familyCount
+  FROM
+  occurrence
+  WHERE
+  occurrenceStatus = 'PRESENT'
+  AND NOT occurrence.basisofrecord IN ('FOSSIL_SPECIMEN', 'LIVING_SPECIMEN')
+  AND NOT ARRAY_CONTAINS(issue, 'ZERO_COORDINATE')
+  AND NOT ARRAY_CONTAINS(issue, 'COORDINATE_OUT_OF_RANGE')
+  AND NOT ARRAY_CONTAINS(issue, 'COORDINATE_INVALID')
+  AND NOT ARRAY_CONTAINS(issue, 'COUNTRY_COORDINATE_MISMATCH')
+  AND level1gid = 'BEL.2_1'
+  AND \"year\" >= 2007
+  AND \"year\" <= 2022
+  AND speciesKey IS NOT NULL
+  AND decimalLatitude IS NOT NULL
+  AND decimalLongitude IS NOT NULL
+  AND class = 'Aves'
+  AND collectionCode = 'ABV'
+  GROUP BY
+  \"year\",
+  mgrsCode,
+  speciesKey,
+  family,
+  species
+  ORDER BY
+  \"year\" ASC,
+  mgrsCode ASC,
+  speciesKey ASC"
+# nolint end
+
+# Get the ABV cube as a data frame. With `overwrite = FALSE`,
+# download_occ_cube() reads the CSV at file.path(path, file) when that file
+# already exists instead of requesting the data again.
+abv_data_total <- download_occ_cube(
+  sql_query = query_abv,
+  file = "abv_data_10km.csv",
+  path = data_path,
+  overwrite = FALSE
+)
+```
+
+We get a large data frame with the number of occurrences per year, grid cell and species.
+
+```{r}
+# Explore dataframe
+glimpse(abv_data_total)
+```
+
+# Unstructured data
+
+The cube data is downloaded from GBIF.org.
+The zip file is stored under *./data/raw*.
+
+> GBIF.org (15 April 2025) GBIF Occurrence Download  https://doi.org/10.15468/dl.75hgxm
+
+```{r}
+# Build the SQL query for the unstructured bird cube. The aggregation is the
+# same as for the ABV cube (counts per year, 10 km MGRS grid cell and
+# species), but it keeps every record that does NOT belong to the ABV
+# collection, and a missing coordinate uncertainty is replaced by 10000.
+#
+# The collection filter wraps collectionCode in COALESCE: in SQL,
+# `NULL != 'ABV'` evaluates to NULL rather than TRUE, so a plain
+# `collectionCode != 'ABV'` silently drops all records without a
+# collection code.
+# nolint start: line_length_linter.
+query_birdcube <- "SELECT
+  \"year\",
+  GBIF_MGRSCode(10000, decimalLatitude, decimalLongitude,
+  COALESCE(coordinateUncertaintyInMeters, 10000)) AS mgrsCode,
+  speciesKey,
+  species,
+  family,
+  COUNT(*) AS n,
+  MIN(COALESCE(coordinateUncertaintyInMeters, 10000)) AS minCoordinateUncertaintyInMeters,
+  IF(ISNULL(family), NULL, SUM(COUNT(*)) OVER (PARTITION BY family)) AS familyCount
+  FROM
+  occurrence
+  WHERE
+  occurrenceStatus = 'PRESENT'
+  AND NOT occurrence.basisofrecord IN ('FOSSIL_SPECIMEN', 'LIVING_SPECIMEN')
+  AND NOT ARRAY_CONTAINS(issue, 'ZERO_COORDINATE')
+  AND NOT ARRAY_CONTAINS(issue, 'COORDINATE_OUT_OF_RANGE')
+  AND NOT ARRAY_CONTAINS(issue, 'COORDINATE_INVALID')
+  AND NOT ARRAY_CONTAINS(issue, 'COUNTRY_COORDINATE_MISMATCH')
+  AND level1gid = 'BEL.2_1'
+  AND \"year\" >= 2007
+  AND \"year\" <= 2022
+  AND speciesKey IS NOT NULL
+  AND decimalLatitude IS NOT NULL
+  AND decimalLongitude IS NOT NULL
+  AND class = 'Aves'
+  AND COALESCE(collectionCode, '') != 'ABV'
+  GROUP BY
+  \"year\",
+  mgrsCode,
+  speciesKey,
+  family,
+  species
+  ORDER BY
+  \"year\" ASC,
+  mgrsCode ASC,
+  speciesKey ASC"
+# nolint end
+
+# Get the bird cube as a data frame. With `overwrite = FALSE`,
+# download_occ_cube() reads the CSV at file.path(path, file) when that file
+# already exists instead of requesting the data again.
+# NOTE(review): a cached CSV and the GBIF download cited above were created
+# with the earlier, NULL-unsafe collection filter. Run once with
+# `overwrite = TRUE` and update the citation so data and query agree.
+birdcube_data_total <- download_occ_cube(
+  sql_query = query_birdcube,
+  file = "birdcube_10km.csv",
+  path = data_path,
+  overwrite = FALSE
+)
+```
+
+We get a large data frame with the number of occurrences per year, grid cell and species.
+
+```{r}
+# Explore dataframe
+glimpse(birdcube_data_total)
+```
+
+# Select Flanders grid cells
+
+The data cubes contain grid cells from several MGRS zones, although Flanders is located only in zone 31U.
+
+```{r}
+# Count the ABV cube rows per zone (first three characters of the MGRS code)
+table(substr(abv_data_total$mgrscode, start = 1, stop = 3))
+```
+
+We load the 10 km UTM grid for Flanders and prefix each tag name with the zone 31U to obtain the full MGRS code.
+
+```{r}
+# Read the 10 km UTM grid and build the full MGRS code by prefixing the
+# grid tag with the zone "31U"
+utm_grid <- file.path(data_path, "utm_grid", "utm10_vlgrens_zBRU.shp") %>%
+  read_sf() %>%
+  mutate(mgrscode = paste0("31U", TAG))
+
+# Inspect the resulting spatial data frame
+glimpse(utm_grid)
+```
+
+We add the geometry to the data layers by taking an inner join.
+
+```{r}
+# Attach the grid geometry to the ABV cube; the inner join keeps only the
+# records whose MGRS code occurs in the Flanders grid
+abv_data_total_sf <- inner_join(
+  utm_grid,
+  abv_data_total,
+  by = join_by(mgrscode)
+) %>%
+  st_sf(sf_column_name = "geometry")
+
+# Map the number of distinct species in the ABV data per grid cell
+utm_grid %>%
+  left_join(
+    summarise(
+      abv_data_total,
+      n_species = n_distinct(species),
+      .by = mgrscode
+    ),
+    by = join_by(mgrscode)
+  ) %>%
+  ggplot() +
+  geom_sf(aes(fill = n_species), colour = alpha("white", 0)) +
+  scale_fill_viridis_c(option = "inferno") +
+  labs(title = "ABV data")
+```
+
+We select cube data from Flanders and add the geometry to the data layers by taking an inner join.
+
+```{r}
+# Keep the bird cube records whose MGRS code occurs in the Flanders grid and
+# attach the grid geometry (inner join)
+birdcube_data_total_sf <- inner_join(
+  utm_grid,
+  birdcube_data_total,
+  by = join_by(mgrscode)
+) %>%
+  st_sf(sf_column_name = "geometry")
+```
+
+```{r}
+# Map the number of distinct species in the bird cube per grid cell
+utm_grid %>%
+  left_join(
+    summarise(
+      birdcube_data_total,
+      n_species = n_distinct(species),
+      .by = mgrscode
+    ),
+    by = join_by(mgrscode)
+  ) %>%
+  ggplot() +
+  geom_sf(aes(fill = n_species), colour = alpha("white", 0)) +
+  scale_fill_viridis_c(option = "inferno") +
+  labs(title = "Bird cube data from Flanders")
+```
+
+# Correction of species names
+
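+Some species occur under more than one accepted name, which causes trouble when the datasets are compared. We replace these names, and the matching species keys, with a single name per species.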
+
+```{r}
+# Replace alternative species names in the ABV cube and set the species key
+# of the replacement name. The key is looked up from the already corrected
+# `species` column, so the order of the two steps inside mutate() matters.
+abv_data_total_sf <- mutate(
+  abv_data_total_sf,
+  species = case_when(
+    species == "Dendrocopus major" ~ "Dendrocopos major",
+    species == "Saxicola torquatus" ~ "Saxicola rubicola",
+    .default = species
+  ),
+  specieskey = case_when(
+    species == "Dendrocopos major" ~ 2477968,
+    species == "Saxicola rubicola" ~ 4408759,
+    .default = specieskey
+  )
+)
+```
+
+```{r}
+# Replace the alternative species name in the bird cube and set the species
+# key of the replacement name. The key is looked up from the already
+# corrected `species` column, so the order inside mutate() matters.
+birdcube_data_total_sf <- mutate(
+  birdcube_data_total_sf,
+  species = case_when(
+    species == "Poecile montanus" ~ "Parus montanus",
+    .default = species
+  ),
+  specieskey = case_when(
+    species == "Parus montanus" ~ 4409010,
+    .default = specieskey
+  )
+)
+```
+
+# Write out data
+
+We select the columns we want in a logical order:
+
+```{r}
+# Structured data: put the columns in output order, then derive a plain
+# table without the geometry column
+abv_data_out_sf <- select(
+  abv_data_total_sf,
+  mgrscode, year, specieskey, species, family, n,
+  mincoordinateuncertaintyinmeters, familycount, geometry
+)
+abv_data_out <- abv_data_out_sf %>%
+  st_drop_geometry()
+
+# Unstructured data: same columns and order
+birdcube_data_out_sf <- select(
+  birdcube_data_total_sf,
+  mgrscode, year, specieskey, species, family, n,
+  mincoordinateuncertaintyinmeters, familycount, geometry
+)
+birdcube_data_out <- birdcube_data_out_sf %>%
+  st_drop_geometry()
+```
+
+We write out the data for exploration and analysis.
+
+```{r}
+# Output directory for the prepared cubes; created when it does not exist
+out_path <- here("data", "interim")
+dir.create(out_path, showWarnings = FALSE, recursive = TRUE)
+
+# Structured data (ABV): plain table as CSV, spatial object as GeoPackage
+write_csv(
+  x = abv_data_out,
+  file = file.path(out_path, "abv_data_cube_10km.csv")
+)
+write_sf(
+  obj = abv_data_out_sf,
+  dsn = file.path(out_path, "abv_data_cube_10km.gpkg")
+)
+
+# Unstructured data (bird cube): plain table as CSV, spatial object as
+# GeoPackage
+write_csv(
+  x = birdcube_data_out,
+  file = file.path(out_path, "birdflanders_cube_10km.csv")
+)
+write_sf(
+  obj = birdcube_data_out_sf,
+  dsn = file.path(out_path, "birdflanders_cube_10km.gpkg")
+)
+```
diff --git a/source/R/download_occ_cube.R b/source/R/download_occ_cube.R
@@ -7,7 +7,7 @@ download_occ_cube <- function(sql_query, file, path, overwrite = FALSE) {
   file_path <- file.path(path, file)
   if (file.exists(file_path) && !overwrite) {
     message(paste("File already exists. Reading existing file.",
-            "Set `overwrite = TRUE` to overwrite file.", sep = "\n"))
+                  "Set `overwrite = TRUE` to overwrite file.", sep = "\n"))
 
     occ_cube <- readr::read_csv(file = file_path, show_col_types = FALSE)
 
@@ -34,7 +34,8 @@ download_occ_cube <- function(sql_query, file, path, overwrite = FALSE) {
   readr::write_csv(
     x = occ_cube,
     file = file_path,
-    append = FALSE)
+    append = FALSE
+  )
 
   # Return tibble
   return(occ_cube)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		PROJCS["Belge_Lambert_1972",GEOGCS["GCS_Belge_1972",DATUM["D_Belge_1972",SPHEROID["International_1924",6378388.0,297.0]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic"],PARAMETER["False_Easting",150000.01256],PARAMETER["False_Northing",5400088.4378],PARAMETER["Central_Meridian",4.367486666666666],PARAMETER["Standard_Parallel_1",49.8333339],PARAMETER["Standard_Parallel_2",51.16666733333333],PARAMETER["Latitude_Of_Origin",90.0],UNIT["Meter",1.0]],VERTCS["Oostende",VDATUM["Oostende"],PARAMETER["Vertical_Shift",0.0],PARAMETER["Direction",1.0],UNIT["Meter",1.0]]