Skip to content

Commit 46b0b2d

Browse files
lwjohnst86Aastedet
andauthored
perf: ⚡ classification requires DuckDB/SQL (#410)
# Description Had to do a lot of work to strictly use DuckDB/SQL. I couldn't easily make this a smaller PR as each issue only came up after addressing the previous issue. Closes #392, closes #390 (as this also converts the object into a DuckDB dataset), closes #381 ## Checklist - [x] Ran `just run-all` --------- Co-authored-by: Anders Aasted Isaksen <67263135+Aastedet@users.noreply.github.com>
1 parent 1657f70 commit 46b0b2d

28 files changed

+251
-242
lines changed

.github/workflows/build.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ jobs:
4848
env:
4949
GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
5050
R_KEEP_PKG_SOURCE: yes
51+
# For tests to use to not run.
52+
DEVELOP_R: ${{ matrix.config.r == 'devel' }}
5153

5254
steps:
5355
- uses: actions/checkout@v4

DESCRIPTION

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ Imports:
2828
checkmate,
2929
cli,
3030
codeCollection,
31+
dbplyr,
3132
dplyr,
3233
duckplyr,
3334
fabricatr,
@@ -37,7 +38,8 @@ Imports:
3738
rlang,
3839
rvest,
3940
stats,
40-
tidyselect
41+
tidyselect,
42+
utils
4143
Suggests:
4244
glue,
4345
knitr,

R/add.R

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,19 +30,23 @@ add_insulin_purchases_cols <- function(gld_hba1c_after_drop_steps) {
3030
# number of packages purchased
3131
dplyr::mutate(
3232
contained_doses = .data$volume * .data$apk,
33-
is_insulin_gld_code = !!logic$is_insulin_gld_code,
34-
date = as_date(date)
33+
is_insulin_gld_code = !!logic$is_insulin_gld_code
3534
) |>
3635
dplyr::select(
3736
"pnr",
3837
"date",
3938
"contained_doses",
4039
"is_insulin_gld_code"
4140
) |>
42-
dplyr::summarise(
41+
dplyr::mutate(
42+
# Needs to be done before hand, can't use the same variable in
43+
# `summarise()` when using SQL.
4344
# Get first date of a GLD purchase and if a purchase of insulin occurs
4445
# within 180 days of the first purchase.
4546
first_gld_date = min(date, na.rm = TRUE),
47+
.by = "pnr"
48+
) |>
49+
dplyr::summarise(
4650
has_insulin_purchases_within_180_days = !!logic$has_insulin_purchases_within_180_days,
4751
# Sum up total doses of insulin and of all GLD.
4852
n_insulin_doses = sum(

R/algorithm.R

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,13 @@ algorithm <- function() {
6868
lpr2_is_endocrinology_dept = list(
6969
register = "lpr_adm",
7070
title = "LPR2 endocrinology department",
71-
logic = "c_spec == 8",
71+
logic = "c_spec == 8L",
7272
comments = "`TRUE` when the department where the recorded diagnosis was endocrinology."
7373
),
7474
lpr2_is_medical_dept = list(
7575
register = "lpr_adm",
7676
title = "LPR2 other medical department",
77-
logic = "c_spec %in% c(1:7, 9:30)",
77+
logic = "c_spec %in% c(1L:7L, 9L:30L)",
7878
comments = "`TRUE` when the diagnosis was recorded at a medical department other than endocrinology."
7979
),
8080
lpr2_is_pregnancy_code = list(
@@ -141,7 +141,7 @@ algorithm <- function() {
141141
is_within_pregnancy_interval = list(
142142
register = NA,
143143
title = "Events that are within a potential pregnancy interval",
144-
logic = "has_pregnancy_event AND date >= (pregnancy_event_date - weeks(40)) AND date <= (pregnancy_event_date + weeks(12))",
144+
logic = "has_pregnancy_event AND date >= (pregnancy_event_date - weeks(40L)) AND date <= (pregnancy_event_date + weeks(12L))",
145145
comments = "The potential pregnancy interval is defined as 40 weeks before and 12 weeks after the pregnancy event date (birth or miscarriage)."
146146
),
147147
is_podiatrist_services = list(
@@ -153,7 +153,7 @@ algorithm <- function() {
153153
is_not_metformin_for_pcos = list(
154154
register = NA,
155155
title = "Metformin purchases that aren't potentially for the treatment of PCOS",
156-
logic = "NOT (koen == 2 AND atc =~ '^A10BA02$' AND ((date - foed_dato) < years(40) OR indication_code %in% c('0000092', '0000276', '0000781')))",
156+
logic = "NOT (koen == 2 AND atc =~ '^A10BA02$' AND (date < (foed_dato + years(40)) OR indication_code %in% c('0000092', '0000276', '0000781')))",
157157
comments = "Woman is defined as 2 in `koen`."
158158
),
159159
has_t1d = list(
@@ -189,7 +189,7 @@ algorithm <- function() {
189189
has_insulin_purchases_within_180_days = list(
190190
register = NA,
191191
title = "Whether any insulin was purchased within 180 days of the first purchase of GLD",
192-
logic = "any(is_insulin_gld_code & date <= (first_gld_date + days(180)))",
192+
logic = "any(is_insulin_gld_code & date <= (first_gld_date + days(180L)), na.rm = TRUE)",
193193
comments = "This is used to classify type 1 diabetes. It determines if any insulin was bought shortly after first buying any type of GLD, which suggests type 1 diabetes."
194194
)
195195
)

R/classify-diabetes.R

Lines changed: 38 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -31,20 +31,14 @@
3131
#' description of the internal implementation of this classification function.
3232
#'
3333
#' @examples
34-
#' register_data <- simulate_registers(
35-
#' c(
36-
#' "kontakter",
37-
#' "diagnoser",
38-
#' "lpr_diag",
39-
#' "lpr_adm",
40-
#' "sysi",
41-
#' "sssy",
42-
#' "lab_forsker",
43-
#' "bef",
44-
#' "lmdb"
45-
#' ),
46-
#' n = 10000
47-
#' )
34+
#' # Can't run this multiple times, will cause an error as the table
35+
#' # has already been created in the DuckDB connection.
36+
#' register_data <- registers() |>
37+
#' names() |>
38+
#' simulate_registers() |>
39+
#' purrr::map(duckplyr::as_duckdb_tibble) |>
40+
#' purrr::map(duckplyr::as_tbl)
41+
#'
4842
#' classify_diabetes(
4943
#' kontakter = register_data$kontakter,
5044
#' diagnoser = register_data$diagnoser,
@@ -69,26 +63,35 @@ classify_diabetes <- function(
6963
stable_inclusion_start_date = "1998-01-01"
7064
) {
7165
# Input checks -----
72-
check_is_duckdb(kontakter)
73-
check_is_duckdb(diagnoser)
74-
check_is_duckdb(lpr_diag)
75-
check_is_duckdb(lpr_adm)
76-
check_is_duckdb(sysi)
77-
check_is_duckdb(sssy)
78-
check_is_duckdb(lab_forsker)
79-
check_is_duckdb(bef)
80-
check_is_duckdb(lmdb)
66+
67+
# Convert to dbplyr connection with duckdb to use dbplyr functions
68+
# (since duckplyr is still in development).
69+
# Also need to convert here rather than as a function, because of the
70+
# way duckplyr works. It creates a temporary DuckDB DB in the background
71+
# based on the name of the object passed to it.
72+
registers <- list(
73+
kontakter = kontakter,
74+
diagnoser = diagnoser,
75+
lpr_diag = lpr_diag,
76+
lpr_adm = lpr_adm,
77+
sysi = sysi,
78+
sssy = sssy,
79+
lab_forsker = lab_forsker,
80+
bef = bef,
81+
lmdb = lmdb
82+
) |>
83+
purrr::map(verify_duckdb)
8184

8285
# Verification step -----
83-
kontakter <- select_required_variables(kontakter, "kontakter")
84-
diagnoser <- select_required_variables(diagnoser, "diagnoser")
85-
lpr_diag <- select_required_variables(lpr_diag, "lpr_diag")
86-
lpr_adm <- select_required_variables(lpr_adm, "lpr_adm")
87-
sysi <- select_required_variables(sysi, "sysi")
88-
sssy <- select_required_variables(sssy, "sssy")
89-
lab_forsker <- select_required_variables(lab_forsker, "lab_forsker")
90-
bef <- select_required_variables(bef, "bef")
91-
lmdb <- select_required_variables(lmdb, "lmdb")
86+
kontakter <- select_required_variables(registers$kontakter, "kontakter")
87+
diagnoser <- select_required_variables(registers$diagnoser, "diagnoser")
88+
lpr_diag <- select_required_variables(registers$lpr_diag, "lpr_diag")
89+
lpr_adm <- select_required_variables(registers$lpr_adm, "lpr_adm")
90+
sysi <- select_required_variables(registers$sysi, "sysi")
91+
sssy <- select_required_variables(registers$sssy, "sssy")
92+
lab_forsker <- select_required_variables(registers$lab_forsker, "lab_forsker")
93+
bef <- select_required_variables(registers$bef, "bef")
94+
lmdb <- select_required_variables(registers$lmdb, "lmdb")
9295

9396
# Initially processing -----
9497
lpr2 <- prepare_lpr2(
@@ -178,27 +181,25 @@ classify_diabetes <- function(
178181
)
179182
}
180183

181-
check_is_duckdb <- function(data, call = rlang::caller_env()) {
184+
verify_duckdb <- function(data, call = rlang::caller_env()) {
182185
check <- checkmate::test_multi_class(
183186
data,
184187
classes = c(
185188
"tbl_duckdb_connection",
186-
"duckplyr_df",
187-
"duckplyr_tbl",
188189
"duckdb_connection"
189190
)
190191
)
191192
if (!check) {
192193
cli::cli_abort(
193194
message = c(
194-
"The data needs to be a DuckDB object because we heavily process the data.",
195+
"The data needs to be a {.cls tbl_duckdb_connection} object because we heavily process the data and need the power.",
195196
"i" = "The data has the class{?es}: {.code {class(data)}}"
196197
),
197198
call = call
198199
)
199200
}
200201

201-
invisible(NULL)
202+
data
202203
}
203204

204205
#' After filtering, classify those with type 1 diabetes.

R/create-inclusion-dates.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ create_inclusion_dates <- function(
4242
# Set the stable inclusion date to NA if the raw inclusion date is before
4343
# stable_inclusion_start_date.
4444
stable_inclusion_date = dplyr::if_else(
45-
.data$raw_inclusion_date < as_date(stable_inclusion_start_date),
45+
.data$raw_inclusion_date < as.Date(stable_inclusion_start_date),
4646
NA,
4747
.data$raw_inclusion_date
4848
)

R/dates.R

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
1-
#' Simple `as.Date()` wrapper.
1+
#' Translate to SQL for datetime conversion to eventually date
22
#'
3-
#' DuckDB doesn't support using [lubridate::as_date()], so this is
4-
#' a simple wrapper around [as.Date()] with the correct formats.
3+
#' DuckDB doesn't support using [lubridate::as_date()], so this
4+
#' uses [dbplyr::sql()] to directly use DuckDB's `strptime` to
5+
#' convert strings to datetimes. Afterwards, it can be converted
6+
#' to dates.
57
#'
6-
#' @param x A character (or date) column.
8+
#' @param x A character (or date) column, in quotes.
79
#'
8-
#' @returns A Date column.
10+
#' @returns A Datetime column.
911
#' @keywords internal
10-
as_date <- function(x) {
11-
as.Date(x, tryFormats = c("%Y%m%d", "%Y-%m-%d"))
12+
as_sql_datetime <- function(x) {
13+
dbplyr::sql(glue::glue("strptime({x}, ['%Y%m%d', '%Y-%m-%d'])"))
1214
}

R/drop.R

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,10 @@ drop_pcos <- function(gld_purchases, bef) {
2424
gld_purchases |>
2525
dplyr::inner_join(bef, by = dplyr::join_by("pnr")) |>
2626
dplyr::mutate(
27-
date = as_date(.data$date),
28-
foed_dato = as_date(.data$foed_dato)
27+
date = !!as_sql_datetime("date"),
28+
date = as.Date(.data$date),
29+
foed_dato = !!as_sql_datetime("foed_dato"),
30+
foed_dato = as.Date(.data$foed_dato)
2931
) |>
3032
# Use !! to inject the expression into filter
3133
dplyr::filter(!!logic) |>
@@ -70,17 +72,6 @@ drop_pregnancies <- function(
7072
) {
7173
criteria <- logic_as_expression("is_within_pregnancy_interval")[[1]]
7274

73-
# TODO: This should be done at an earlier stage.
74-
# Ensure both date columns are of type Date.
75-
dropped_pcos <- dropped_pcos |>
76-
dplyr::mutate(
77-
date = as_date(.data$date)
78-
)
79-
included_hba1c <- included_hba1c |>
80-
dplyr::mutate(
81-
date = as_date(.data$date)
82-
)
83-
8475
dropped_pcos |>
8576
# Full join to keep rows from both dropped_pcos and included_hba1c.
8677
dplyr::full_join(included_hba1c, by = dplyr::join_by("pnr", "date")) |>
@@ -100,7 +91,7 @@ drop_pregnancies <- function(
10091
# inside another for the same pnr.
10192
# Only keep rows that don't fall within any pregnancy interval.
10293
dplyr::filter(
103-
!any(.data$is_within_pregnancy_interval),
94+
!any(.data$is_within_pregnancy_interval, na.rm = TRUE),
10495
.by = c("pnr", "date")
10596
) |>
10697
# Drop columns that were only used here.

R/edge-cases.R

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#' Create a synthetic dataset of edge case inputs
22
#'
33
#' @description
4-
#' This function generates a list of DuckDB tibbles representing the Danish health
4+
#' This function generates a list of tibbles representing the Danish health
55
#' registers and the data necessary to run the algorithm. The dataset contains
66
#' 23 individual cases (`pnr`s), each designed to test a specific logical branch
77
#' of the diabetes classification algorithm, including inclusion, exclusion,
@@ -11,7 +11,7 @@
1111
#' behaves as expected under a wide range of conditions, but it is also intended
1212
#' to be explored by users to better understand how the algorithm logic works.
1313
#'
14-
#' @return A named list of 9 [duckplyr::duckdb_tibble()] objects, each representing a
14+
#' @return A named list of 9 [tibble::tibble()] objects, each representing a
1515
#' different health register: `bef`, `lmdb`, `lpr_adm`, `lpr_diag`,
1616
#' `kontakter`, `diagnoser`, `sysi`, `sssy`, and `lab_forsker`.
1717
#' @export
@@ -350,6 +350,5 @@ edge_cases <- function() {
350350
out <- rlang::set_names(out, name)
351351
}) |>
352352
purrr::flatten() |>
353-
purrr::map(duckplyr::as_duckdb_tibble) |>
354-
append(list(classified = duckplyr::as_duckdb_tibble(classified)))
353+
append(list(classified = classified))
355354
}

R/join-inclusions.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ join_inclusions <- function(
4242
dplyr::mutate(
4343
dplyr::across(
4444
dplyr::starts_with("has_"),
45-
~ dplyr::coalesce(any(.x), FALSE)
45+
\(x) any(dplyr::coalesce(x, FALSE), na.rm = TRUE)
4646
),
4747
.by = "pnr"
4848
)

0 commit comments

Comments
 (0)