Skip to content

Commit c6baf73

Browse files
authored
duckdb date filter before recode results in up to 100,000x speed up in spod_get and spod_convert (#166)
* apply air linting to duckdb helpers file * update all sql files to create or replace views instead of simply creating them * create or replace view in spod_connect * explicit arguments in call to spod_duckdb_filter_by_dates in spod_get * rewrite spod_duckdb_filter_by_dates to apply filter to the raw data view and then recreate clean recoded table on top of it for huge speed improvement * plural in categories in available data * available data format improvements with factors * update docs for available data returned table * update news * add news about new col in metadata
1 parent 3284cc2 commit c6baf73

File tree

47 files changed

+381
-212
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+381
-212
lines changed

NEWS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,12 @@
88

99
## Improvements
1010

11+
* `spod_get()` and `spod_convert()` are now up to 100,000x faster when you have all (or a lot of) data downloaded, but only requesting several days in the call to `spod_get()` or `spod_convert()`. This is thanks to a new smarter filtering strategy (issue [#159](https://github.com/rOpenSpain/spanishoddata/issues/159), PR [#166](https://github.com/rOpenSpain/spanishoddata/pull/166)).
12+
1113
* Metadata is now fetched from Amazon S3 storage of the original data files, which allows validation of downloaded files (issue [#126](https://github.com/rOpenSpain/spanishoddata/issues/126)) with both size and checksum. PR [#165](https://github.com/rOpenSpain/spanishoddata/pull/165).
1214

15+
* Metadata fetched by `spod_available_data()` has extra columns such as data `type`, `zones` and `period`, see help `?spod_available_data()` for details.
16+
1317
## Bug fixes
1418

1519
* More reliable, but still multi-threaded data file downloads using base R `utils::download.file()` instead of `curl::multi_download()` which failed on some connections (issue [#127](https://github.com/rOpenSpain/spanishoddata/issues/127)), so now `curl` dependency is no longer required. PR [#165](https://github.com/rOpenSpain/spanishoddata/pull/165).

R/available-data.R

Lines changed: 89 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@
2020
#' \item{file_extension}{\code{character}. The file extension of the data file (e.g., 'tar', 'gz').}
2121
#' \item{data_ym}{\code{Date}. The year and month of the data coverage, if available.}
2222
#' \item{data_ymd}{\code{Date}. The specific date of the data coverage, if available.}
23+
#' \item{study}{\code{factor}. Study category derived from the URL (e.g., 'basic', 'complete', 'routes').}
24+
#' \item{type}{\code{factor}. Data type category derived from the URL (e.g., 'number_of_trips', 'origin-destination', 'overnight_stays', 'data_quality', 'metadata').}
25+
#' \item{period}{\code{factor}. Temporal granularity category derived from the URL (e.g., 'day', 'month').}
26+
#' \item{zones}{\code{factor}. Geographic zone classification derived from the URL (e.g., 'districts', 'municipalities', 'large_urban_areas').}
2327
#' \item{local_path}{\code{character}. The local file path where the data is (or going to be) stored.}
2428
#' \item{downloaded}{\code{logical}. Indicator of whether the data file has been downloaded locally. This is only available if `check_local_files` is `TRUE`.}
2529
#' }
@@ -252,32 +256,52 @@ spod_available_data_v1 <- function(
252256

253257
files_table <- files_table |>
254258
dplyr::mutate(
255-
study = dplyr::case_when(
256-
grepl("maestra", .data$target_url) ~ "basic",
257-
TRUE ~ ""
259+
study = factor(
260+
dplyr::case_when(
261+
grepl("maestra", .data$target_url) ~ "basic",
262+
TRUE ~ NA_character_
263+
),
264+
levels = c("basic")
258265
),
259266

260-
type = dplyr::case_when(
261-
grepl("maestra2", .data$target_url) ~ "number_of_trips",
262-
grepl("maestra1", .data$target_url) ~ "origin-destination",
263-
grepl("RSS\\.xml", .data$target_url) ~ "metadata",
264-
grepl("zonificacion", .data$target_url) ~ "zones",
265-
grepl("relacion", .data$target_url) ~ "relations",
266-
grepl("index\\.html", .data$target_url) ~ "index",
267-
grepl(".\\pdf", .data$target_url) ~ "documentation",
268-
TRUE ~ ""
267+
type = factor(
268+
dplyr::case_when(
269+
grepl("maestra2", .data$target_url) ~ "number_of_trips",
270+
grepl("maestra1", .data$target_url) ~ "origin-destination",
271+
grepl("RSS\\.xml", .data$target_url) ~ "metadata",
272+
grepl("zonificacion", .data$target_url) ~ "zones",
273+
grepl("relacion", .data$target_url) ~ "relations",
274+
grepl("index\\.html", .data$target_url) ~ "index",
275+
grepl("\\.pdf", .data$target_url) ~ "documentation",
276+
TRUE ~ NA_character_
277+
),
278+
levels = c(
279+
"number_of_trips",
280+
"origin-destination",
281+
"metadata",
282+
"zones",
283+
"relations",
284+
"index",
285+
"documentation"
286+
)
269287
),
270288

271-
period = dplyr::case_when(
272-
grepl("ficheros-diarios", .data$target_url) ~ "day",
273-
grepl("meses-completos|mensual", .data$target_url) ~ "month",
274-
TRUE ~ ""
289+
period = factor(
290+
dplyr::case_when(
291+
grepl("ficheros-diarios", .data$target_url) ~ "day",
292+
grepl("meses-completos|mensual", .data$target_url) ~ "month",
293+
TRUE ~ NA_character_
294+
),
295+
levels = c("day", "month")
275296
),
276297

277-
zones = dplyr::case_when(
278-
grepl("distrito", .data$target_url) ~ "district",
279-
grepl("municipio", .data$target_url) ~ "municipality",
280-
TRUE ~ ""
298+
zones = factor(
299+
dplyr::case_when(
300+
grepl("distrito", .data$target_url) ~ "districts",
301+
grepl("municipio", .data$target_url) ~ "municipalities",
302+
TRUE ~ NA_character_
303+
),
304+
levels = c("districts", "municipalities")
281305
)
282306
)
283307

@@ -557,33 +581,51 @@ spod_available_data_v2 <- function(
557581

558582
files_table <- files_table |>
559583
dplyr::mutate(
560-
study = dplyr::case_when(
561-
grepl("estudios_basicos", .data$target_url) ~ "basic",
562-
grepl("estudios_completos", .data$target_url) ~ "complete",
563-
grepl("rutas", .data$target_url) ~ "routes",
564-
TRUE ~ ""
584+
study = factor(
585+
dplyr::case_when(
586+
grepl("estudios_basicos", .data$target_url) ~ "basic",
587+
grepl("estudios_completos", .data$target_url) ~ "complete",
588+
grepl("rutas", .data$target_url) ~ "routes",
589+
TRUE ~ NA_character_
590+
),
591+
levels = c("basic", "complete", "routes")
565592
),
566593

567-
type = dplyr::case_when(
568-
grepl("personas", .data$target_url) ~ "number_of_trips",
569-
grepl("viajes", .data$target_url) ~ "origin-destination",
570-
grepl("pernoctaciones", .data$target_url) ~ "overnight_stays",
571-
grepl("calidad", .data$target_url) ~ "data_quality",
572-
grepl("RSS\\.xml", .data$target_url) ~ "metadata",
573-
TRUE ~ ""
594+
type = factor(
595+
dplyr::case_when(
596+
grepl("personas", .data$target_url) ~ "number_of_trips",
597+
grepl("viajes", .data$target_url) ~ "origin-destination",
598+
grepl("pernoctaciones", .data$target_url) ~ "overnight_stays",
599+
grepl("calidad", .data$target_url) ~ "data_quality",
600+
grepl("RSS\\.xml", .data$target_url) ~ "metadata",
601+
TRUE ~ NA_character_
602+
),
603+
levels = c(
604+
"origin-destination",
605+
"number_of_trips",
606+
"overnight_stays",
607+
"data_quality",
608+
"metadata"
609+
)
574610
),
575611

576-
period = dplyr::case_when(
577-
grepl("ficheros-diarios", .data$target_url) ~ "day",
578-
grepl("meses-completos|mensual", .data$target_url) ~ "month",
579-
TRUE ~ ""
612+
period = factor(
613+
dplyr::case_when(
614+
grepl("ficheros-diarios", .data$target_url) ~ "day",
615+
grepl("meses-completos|mensual", .data$target_url) ~ "month",
616+
TRUE ~ NA_character_
617+
),
618+
levels = c("day", "month")
580619
),
581620

582-
zones = dplyr::case_when(
583-
grepl("distritos", .data$target_url) ~ "district",
584-
grepl("municipios", .data$target_url) ~ "municipality",
585-
grepl("GAU", .data$target_url) ~ "gau",
586-
TRUE ~ ""
621+
zones = factor(
622+
dplyr::case_when(
623+
grepl("distritos", .data$target_url) ~ "districts",
624+
grepl("municipios", .data$target_url) ~ "municipalities",
625+
grepl("GAU", .data$target_url) ~ "large_urban_areas",
626+
TRUE ~ NA_character_
627+
),
628+
levels = c("districts", "municipalities", "large_urban_areas")
587629
)
588630
)
589631

@@ -728,12 +770,16 @@ read_data_links_xml <- function(
728770
Sys.Date()
729771

730772
if (needs_update) {
731-
if (!quiet) message("Fetching latest data links xml")
773+
if (!quiet) {
774+
message("Fetching latest data links xml")
775+
}
732776
latest_data_links_xml_path <- latest_file_function(
733777
data_dir = data_dir
734778
)
735779
} else {
736-
if (!quiet) message("Using existing data links xml: ", latest_file)
780+
if (!quiet) {
781+
message("Using existing data links xml: ", latest_file)
782+
}
737783
latest_data_links_xml_path <- latest_file
738784
}
739785

R/connect.R

Lines changed: 22 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,24 @@
11
#' Connect to data converted to `DuckDB` or hive-style `parquet` files
2-
#'
2+
#'
33
#' @description
4-
#'
4+
#'
55
#' `r lifecycle::badge("stable")`
6-
#'
6+
#'
77
#' This function allows the user to quickly connect to the data converted to DuckDB with the \link{spod_convert} function. This function simplifies the connection process. The user is free to use the `DBI` and `DuckDB` packages to connect to the data manually, or to use the `arrow` package to connect to the `parquet` files folder.
8-
#'
8+
#'
99
#' @param data_path a path to the `DuckDB` database file with '.duckdb' extension, or a path to the folder with `parquet` files. Either one should have been created with the \link{spod_convert} function.
1010
#' @param target_table_name Default is `NULL`. When connecting to a folder of `parquet` files, this argument is ignored. When connecting to a `DuckDB` database, a `character` vector of length 1 with the table name to open from the database file. If not specified, it will be guessed from the `data_path` argument and from table names that are available in the database. If you have not manually interfered with the database, this should be guessed automatically and you do not need to specify it.
1111
#' @inheritParams spod_duckdb_limit_resources
1212
#' @inheritParams spod_duckdb_set_temp
1313
#' @inheritParams global_quiet_param
1414
#' @export
1515
#' @return a `DuckDB` table connection object.
16-
#'
16+
#'
1717
#' @examplesIf interactive()
1818
#' \donttest{
1919
#' # Set data dir for file downloads
2020
#' spod_set_data_dir(tempdir())
21-
#'
21+
#'
2222
#' # download and convert data
2323
#' dates_1 <- c(start = "2020-02-17", end = "2020-02-18")
2424
#' db_2 <- spod_convert(
@@ -27,22 +27,22 @@
2727
#' dates = dates_1,
2828
#' overwrite = TRUE
2929
#' )
30-
#'
30+
#'
3131
#' # now connect to the converted data
3232
#' my_od_data_2 <- spod_connect(db_2)
33-
#'
33+
#'
3434
#' # disconnect from the database
3535
#' spod_disconnect(my_od_data_2)
3636
#' }
37-
#'
37+
#'
3838
spod_connect <- function(
3939
data_path,
4040
target_table_name = NULL,
4141
quiet = FALSE,
4242
max_mem_gb = max(4, spod_available_ram() - 4),
4343
max_n_cpu = max(1, parallelly::availableCores() - 1),
4444
temp_path = spod_get_temp_dir()
45-
){
45+
) {
4646
# Validate imputs
4747
checkmate::assert_access(data_path, access = 'r')
4848
checkmate::assert_character(target_table_name, null.ok = TRUE)
@@ -59,7 +59,7 @@ spod_connect <- function(
5959
duckdb_path <- ":memory:"
6060
target_format <- "parquet"
6161
}
62-
62+
6363
con <- DBI::dbConnect(
6464
duckdb::duckdb(),
6565
dbdir = duckdb_path,
@@ -74,7 +74,7 @@ spod_connect <- function(
7474

7575
if (target_format == "duckdb") {
7676
# try to guess the table name if not provided
77-
77+
7878
if (is.null(target_table_name)) {
7979
# try the same name as the file name
8080
target_table_name <- gsub("\\..*", "", basename(duckdb_path)) # experimental
@@ -84,7 +84,9 @@ spod_connect <- function(
8484
target_table_name <- target_table_name
8585
} else {
8686
# pick the first table that does not contain CSV
87-
target_table_name <- tables_list[!stringr::str_detect(tables_list, "csv")][1]
87+
target_table_name <- tables_list[
88+
!stringr::str_detect(tables_list, "csv")
89+
][1]
8890
}
8991
}
9092
tbl_con <- dplyr::tbl(con, target_table_name)
@@ -93,21 +95,23 @@ spod_connect <- function(
9395
if (target_format == "parquet") {
9496
view_name <- basename(data_path)
9597
parquet_glob_path <- fs::path(data_path, "**", "*.parquet")
96-
98+
9799
DBI::dbExecute(
98100
con,
99101
dplyr::sql(
100-
glue::glue("
101-
CREATE VIEW {view_name} AS
102+
glue::glue(
103+
"
104+
CREATE OR REPLACE VIEW {view_name} AS
102105
SELECT *
103106
FROM read_parquet(
104107
'{parquet_glob_path}',
105108
hive_partitioning = true
106109
) ;
107-
")
110+
"
111+
)
108112
)
109113
)
110-
114+
111115
# set temp path for intermediate spilling
112116
# https://duckdb.org/2024/07/09/memory-management.html#intermediate-spilling
113117
# we do not do the same for duckdb above, as there the temp is automatically created in the same folder as the database
@@ -117,6 +121,5 @@ spod_connect <- function(
117121
tbl_con <- dplyr::tbl(con, view_name)
118122
}
119123

120-
121124
return(tbl_con)
122125
}

0 commit comments

Comments
 (0)