Skip to content

Commit cfaa8a7

Browse files
authored
Merge pull request #63 from epiforecasts/58-ww-data-as-of-forecast-date
Issue 58: Get wastewater data as of the forecast date
2 parents 9b19806 + 73be9c4 commit cfaa8a7

File tree

14 files changed

+442
-85
lines changed

14 files changed

+442
-85
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ cache
1212
.DS_Store
1313
.Rproj.user
1414
*.Rproj
15+
*.Rdata
1516
!inst/extdata/*.rds
1617
/doc/
1718
/Meta/

DESCRIPTION

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,9 @@ Imports:
3636
glue,
3737
scoringutils,
3838
zoo,
39-
stringr
39+
stringr,
40+
httr,
41+
jsonlite
4042
Suggests:
4143
bookdown,
4244
targets,

NAMESPACE

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ importFrom(dplyr,select)
1515
importFrom(dplyr,ungroup)
1616
importFrom(fs,dir_create)
1717
importFrom(glue,glue)
18+
importFrom(httr,GET)
19+
importFrom(httr,parse_url)
20+
importFrom(httr,stop_for_status)
21+
importFrom(jsonlite,fromJSON)
1822
importFrom(lubridate,days)
1923
importFrom(lubridate,ymd)
2024
importFrom(readr,read_csv)

R/get_ww_data.R

Lines changed: 167 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44
#'
55
#' @inheritParams get_hosp_for_eval
66
#' @param filepath_name Name of directory to save the raw input wastewater data.
7-
#' @importFrom dplyr mutate filter select rename
87
#' @importFrom fs dir_create
98
#' @importFrom readr read_tsv read_csv write_csv
109
#' @autoglobal
@@ -25,15 +24,176 @@ get_ww_for_eval <- function(location_name,
2524
}
2625

2726

28-
ww_clean <- RKI_ww_sites |>
27+
# Rename columns and format for wwinference
28+
ww_clean <- reformat_ww_data(
29+
raw_ww = RKI_ww_sites,
30+
location_name = location_name,
31+
location_abbr = location_abbr
32+
)
33+
34+
return(ww_clean)
35+
}
36+
37+
#' Use the Git commit history to get the wastewater data available as of
38+
#' the forecast date. Subsequent function will filter it to the location we
39+
#' want.
40+
#'
41+
#' @inheritParams get_hosp_for_eval
42+
#' @inheritParams get_ww_for_eval
43+
#' @param forecast_date Character string or date indicating the date of
44+
#' forecast in YYYY-MM-DD
45+
#' @param calibration_period Integer indicating the number of days of
46+
#' wastewater calibration data to extract. Default is `100`.
47+
#' @param ww_data_url Character string of the url of the wastewater data (not
48+
#' the raw version).
49+
50+
#' @autoglobal
51+
#' @importFrom dplyr filter
52+
#' @importFrom lubridate ymd days
53+
get_ww_as_of_forecast_date <- function(forecast_date,
54+
location_name,
55+
location_abbr,
56+
filepath_name = file.path("input", "data", "ww", "vintages"), # nolint
57+
calibration_period = 100,
58+
ww_data_url = "https://raw.githubusercontent.com/robert-koch-institut/Abwassersurveillance_AMELAG/refs/heads/main/amelag_einzelstandorte.tsv") { # nolint
59+
vintage_fp <- file.path(filepath_name, forecast_date, "ww_sites_as_of.csv")
60+
if (file.exists(vintage_fp)) {
61+
RKI_ww_sites <- read_csv(vintage_fp)
62+
} else {
63+
# Need to fine the url associated with the commit history as of the
64+
# forecast date
65+
ww_vintage_data_url <- get_vintage(
66+
raw_url = ww_data_url,
67+
target_date = forecast_date
68+
)
69+
RKI_ww_sites <- read_tsv(ww_vintage_data_url)
70+
dir_create(file.path(filepath_name, forecast_date))
71+
write_csv(RKI_ww_sites, vintage_fp)
72+
}
73+
74+
# Clean the and filter to the calibration period, add the forecast date
75+
ww_for_fit <- reformat_ww_data(
76+
raw_ww = RKI_ww_sites,
77+
location_name = location_name,
78+
location_abbr = location_abbr
79+
) |>
80+
filter(
81+
date >= ymd(forecast_date) - days(calibration_period)
82+
) |>
83+
mutate(forecast_date = forecast_date)
84+
85+
return(ww_for_fit)
86+
}
87+
88+
89+
#' Use GitHub raw url and commit history to get the url as of a specific target
90+
#' date.
91+
#'
92+
#' @param raw_url Character string indicating url of latest data.
93+
#' @param target_date As of date
94+
#' @param github_token Optional Personal Access Token
95+
#'
96+
#' @returns Character string of the url to use to get the data as of a certain
97+
#' date.
98+
#' @autoglobal
99+
#' @importFrom httr parse_url GET stop_for_status
100+
#' @importFrom jsonlite fromJSON
101+
get_vintage <- function(raw_url,
102+
target_date,
103+
github_token = NULL) {
104+
parsed <- parse_url(raw_url)
105+
path_parts <- strsplit(gsub("^/|/$", "", parsed$path), "/")[[1]] # nolint
106+
107+
owner <- path_parts[1]
108+
repo <- path_parts[2]
109+
110+
if (length(path_parts) > 3 && path_parts[3] == "refs" && path_parts[4] == "heads") { # nolint
111+
branch <- path_parts[5]
112+
file_path <- paste(path_parts[6:length(path_parts)], collapse = "/")
113+
} else {
114+
branch <- path_parts[3]
115+
file_path <- paste(path_parts[4:length(path_parts)], collapse = "/")
116+
}
117+
118+
target_datetime <- as.POSIXct(paste0(target_date, " 23:59:59"), tz = "UTC")
119+
target_iso <- format(target_datetime, "%Y-%m-%dT%H:%M:%SZ")
120+
121+
headers <- c("Accept" = "application/vnd.github.v3+json") # nolint
122+
if (!is.null(github_token)) {
123+
headers <- c(headers, "Authorization" = paste("token", github_token)) # nolint
124+
}
125+
126+
api_url <- sprintf("https://api.github.com/repos/%s/%s/commits", owner, repo)
127+
128+
response <- GET(
129+
api_url,
130+
httr::add_headers(.headers = headers),
131+
query = list(
132+
path = file_path,
133+
sha = branch,
134+
until = target_iso,
135+
per_page = 1
136+
)
137+
)
138+
139+
stop_for_status(response)
140+
141+
commits <- fromJSON(httr::content(response, "text", encoding = "UTF-8"))
142+
143+
if (length(commits) == 0 || nrow(commits) == 0) {
144+
stop(sprintf("No commits found for %s before %s", file_path, target_date),
145+
call. = FALSE
146+
)
147+
}
148+
# Get the commit SHA
149+
commit_sha <- commits$sha[1]
150+
commit_date <- commits$commit$committer$date[1]
151+
152+
# Construct the historical raw URL
153+
historical_url <- sprintf(
154+
"https://raw.githubusercontent.com/%s/%s/%s/%s",
155+
owner, repo, commit_sha, file_path
156+
)
157+
158+
message(sprintf(
159+
"Found commit %s from %s",
160+
substr(commit_sha, 1, 7), commit_date
161+
))
162+
163+
return(historical_url)
164+
}
165+
166+
#' Reformat the ww data, selecting only the location we need and the variables
167+
#' we need.
168+
#'
169+
#' @param raw_ww Data.frame from the RKI GitHub
170+
#' @inheritParams get_hosp_for_eval
171+
#' @param log_lod_val Scalar indicating the value to reset the
172+
#' limit of detection (LOD) to, to be
173+
#' removed in future iterations and replace with an LOD value at each
174+
#' measurement or site.
175+
#'
176+
#' @returns Data.frame of cleaned wastewater values from location_abbr.
177+
#' @importFrom dplyr mutate filter select rename
178+
#' @autoglobal
179+
reformat_ww_data <- function(raw_ww,
180+
location_abbr,
181+
location_name,
182+
log_lod_val = 1) {
183+
if ("normalisierung" %in% names(raw_ww)) {
184+
raw_ww <- dplyr::rename(raw_ww, normalized = normalisierung)
185+
} else if ("viruslast_normalisiert" %in% names(raw_ww)) {
186+
raw_ww <- dplyr::rename(raw_ww, normalized = viruslast_normalisiert)
187+
}
188+
189+
ww_clean <- raw_ww |>
29190
rename(
30191
location = "standort",
31192
date = "datum",
32193
state = "bundesland",
33194
conc = "viruslast",
34195
pop_cov = "einwohner",
35196
change_in_lab_indicator = "laborwechsel",
36-
normalized = "viruslast_normalisiert",
37197
pathogen = "typ",
38198
below_LOD = "unter_bg"
39199
) |>
@@ -49,44 +209,18 @@ get_ww_for_eval <- function(location_name,
49209
mutate(
50210
lab = change_in_lab_indicator,
51211
log_genome_copies_per_ml = log((conc / 1e3) + 1e-8),
52-
log_lod = 3 # make this up for now (maybe )
212+
log_lod = log_lod_val, # make this up for now (maybe )
213+
location_name = location_name,
214+
location_abbr = location_abbr
53215
) |>
54216
rename(
55217
site = location,
56218
site_pop = pop_cov
57219
) |>
58220
select(
59221
date, site, lab, log_genome_copies_per_ml, log_lod, site_pop,
60-
below_LOD
222+
below_LOD, location_abbr, location_name
61223
) |>
62224
filter(!is.na(log_genome_copies_per_ml))
63-
64225
return(ww_clean)
65226
}
66-
67-
#' Filter wastewater data for fitting
68-
#'
69-
#' @param ww_data_eval wastewater data for evaluation step
70-
#' @param forecast_date Character string or date indicating the date of
71-
#' forecast in YYYY-MM-DD
72-
#' @param calibration_period Integer indicating the number of days of
73-
#' wastewater calibration data to extract. Default is `100`.
74-
#' @param lag Integer indicating the number of days from the forecast date of
75-
#' the latest wastewater data. Default is `3`
76-
#' @autoglobal
77-
#' @importFrom dplyr filter
78-
#' @importFrom lubridate ymd days
79-
get_ww_for_fit <- function(ww_data_eval,
80-
forecast_date,
81-
calibration_period = 100,
82-
lag = 3) {
83-
ww_for_fit <- ww_data_eval |>
84-
filter(
85-
date >= ymd(forecast_date) - days(calibration_period),
86-
date <= ymd(forecast_date) - days(lag)
87-
) |>
88-
mutate(
89-
forecast_date = forecast_date
90-
)
91-
return(ww_for_fit)
92-
}

R/globals.R

Lines changed: 15 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,20 @@ utils::globalVariables(c(
1818
"location", # <get_initial_values>
1919
"age_group", # <get_initial_values>
2020
"value", # <get_initial_values>
21-
"location", # <get_ww_for_eval>
22-
"state", # <get_ww_for_eval>
23-
"conc", # <get_ww_for_eval>
24-
"pop_cov", # <get_ww_for_eval>
25-
"change_in_lab_indicator", # <get_ww_for_eval>
26-
"normalized", # <get_ww_for_eval>
27-
"pathogen", # <get_ww_for_eval>
28-
"below_LOD", # <get_ww_for_eval>
29-
"site", # <get_ww_for_eval>
30-
"lab", # <get_ww_for_eval>
31-
"log_genome_copies_per_ml", # <get_ww_for_eval>
32-
"log_lod", # <get_ww_for_eval>
33-
"site_pop", # <get_ww_for_eval>
21+
"normalisierung", # <reformat_ww_data>
22+
"viruslast_normalisiert", # <reformat_ww_data>
23+
"location", # <reformat_ww_data>
24+
"state", # <reformat_ww_data>
25+
"conc", # <reformat_ww_data>
26+
"pop_cov", # <reformat_ww_data>
27+
"change_in_lab_indicator", # <reformat_ww_data>
28+
"normalized", # <reformat_ww_data>
29+
"pathogen", # <reformat_ww_data>
30+
"below_LOD", # <reformat_ww_data>
31+
"site", # <reformat_ww_data>
32+
"lab", # <reformat_ww_data>
33+
"log_genome_copies_per_ml", # <reformat_ww_data>
34+
"log_lod", # <reformat_ww_data>
35+
"site_pop", # <reformat_ww_data>
3436
NULL
3537
))

_targets.R

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
# The pipeline can be run using `tar_make()`
55

66
library(targets)
7+
library(jsonlite)
8+
library(httr)
79
library(tarchetypes)
810
library(wwinference)
911
library(dplyr)
@@ -35,7 +37,9 @@ tar_option_set(
3537
"readr",
3638
"lubridate",
3739
"tidyr",
38-
"glue"
40+
"glue",
41+
"jsonlite",
42+
"httr"
3943
),
4044
workspace_on_error = TRUE,
4145
storage = "worker",

inst/WORDLIST

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,17 @@
11
COVID
22
DD
33
KITmetricslab
4+
LOD
45
Nowcast
56
RKI
67
YYYY
8+
abbr
79
com
810
csv
911
df
10-
ww
1112
githubusercontent
1213
nolint
1314
nowcast
1415
refs
16+
ww
1517
wwinference

man/get_vintage.Rd

Lines changed: 24 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)