Skip to content
Merged
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@

## Improvements

* `spod_get()` and `spod_convert()` are now up to 100,000 times faster when you have all (or a lot of) the data downloaded but request only a few days in the call to `spod_get()` or `spod_convert()`. This is thanks to a new, smarter filtering strategy (issue [#159](https://github.com/rOpenSpain/spanishoddata/issues/159), PR [#166](https://github.com/rOpenSpain/spanishoddata/pull/166)).

* Metadata is now fetched from Amazon S3 storage of the original data files, which allows validation of downloaded files (issue [#126](https://github.com/rOpenSpain/spanishoddata/issues/126)) with both size and checksum. PR [#165](https://github.com/rOpenSpain/spanishoddata/pull/165).

* Metadata fetched by `spod_available_data()` now has extra columns such as `type`, `zones` and `period`; see the help page `?spod_available_data` for details.

## Bug fixes

* More reliable, but still multi-threaded data file downloads using base R `utils::download.file()` instead of `curl::multi_download()` which failed on some connections (issue [#127](https://github.com/rOpenSpain/spanishoddata/issues/127)), so now `curl` dependency is no longer required. PR [#165](https://github.com/rOpenSpain/spanishoddata/pull/165).
Expand Down
132 changes: 89 additions & 43 deletions R/available-data.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@
#' \item{file_extension}{\code{character}. The file extension of the data file (e.g., 'tar', 'gz').}
#' \item{data_ym}{\code{Date}. The year and month of the data coverage, if available.}
#' \item{data_ymd}{\code{Date}. The specific date of the data coverage, if available.}
#' \item{study}{\code{factor}. Study category derived from the URL (e.g., 'basic', 'complete', 'routes').}
#' \item{type}{\code{factor}. Data type category derived from the URL (e.g., 'number_of_trips', 'origin-destination', 'overnight_stays', 'data_quality', 'metadata').}
#' \item{period}{\code{factor}. Temporal granularity category derived from the URL (e.g., 'day', 'month').}
#' \item{zones}{\code{factor}. Geographic zone classification derived from the URL (e.g., 'districts', 'municipalities', 'large_urban_areas').}
#' \item{local_path}{\code{character}. The local file path where the data is (or going to be) stored.}
#' \item{downloaded}{\code{logical}. Indicator of whether the data file has been downloaded locally. This is only available if `check_local_files` is `TRUE`.}
#' }
Expand Down Expand Up @@ -252,32 +256,52 @@ spod_available_data_v1 <- function(

files_table <- files_table |>
dplyr::mutate(
study = dplyr::case_when(
grepl("maestra", .data$target_url) ~ "basic",
TRUE ~ ""
study = factor(
dplyr::case_when(
grepl("maestra", .data$target_url) ~ "basic",
TRUE ~ NA_character_
),
levels = c("basic")
),

type = dplyr::case_when(
grepl("maestra2", .data$target_url) ~ "number_of_trips",
grepl("maestra1", .data$target_url) ~ "origin-destination",
grepl("RSS\\.xml", .data$target_url) ~ "metadata",
grepl("zonificacion", .data$target_url) ~ "zones",
grepl("relacion", .data$target_url) ~ "relations",
grepl("index\\.html", .data$target_url) ~ "index",
grepl(".\\pdf", .data$target_url) ~ "documentation",
TRUE ~ ""
type = factor(
dplyr::case_when(
grepl("maestra2", .data$target_url) ~ "number_of_trips",
grepl("maestra1", .data$target_url) ~ "origin-destination",
grepl("RSS\\.xml", .data$target_url) ~ "metadata",
grepl("zonificacion", .data$target_url) ~ "zones",
grepl("relacion", .data$target_url) ~ "relations",
grepl("index\\.html", .data$target_url) ~ "index",
grepl("\\.pdf", .data$target_url) ~ "documentation",
TRUE ~ NA_character_
),
levels = c(
"number_of_trips",
"origin-destination",
"metadata",
"zones",
"relations",
"index",
"documentation"
)
),

period = dplyr::case_when(
grepl("ficheros-diarios", .data$target_url) ~ "day",
grepl("meses-completos|mensual", .data$target_url) ~ "month",
TRUE ~ ""
period = factor(
dplyr::case_when(
grepl("ficheros-diarios", .data$target_url) ~ "day",
grepl("meses-completos|mensual", .data$target_url) ~ "month",
TRUE ~ NA_character_
),
levels = c("day", "month")
),

zones = dplyr::case_when(
grepl("distrito", .data$target_url) ~ "district",
grepl("municipio", .data$target_url) ~ "municipality",
TRUE ~ ""
zones = factor(
dplyr::case_when(
grepl("distrito", .data$target_url) ~ "districts",
grepl("municipio", .data$target_url) ~ "municipalities",
TRUE ~ NA_character_
),
levels = c("districts", "municipalities")
)
)

Expand Down Expand Up @@ -557,33 +581,51 @@ spod_available_data_v2 <- function(

files_table <- files_table |>
dplyr::mutate(
study = dplyr::case_when(
grepl("estudios_basicos", .data$target_url) ~ "basic",
grepl("estudios_completos", .data$target_url) ~ "complete",
grepl("rutas", .data$target_url) ~ "routes",
TRUE ~ ""
study = factor(
dplyr::case_when(
grepl("estudios_basicos", .data$target_url) ~ "basic",
grepl("estudios_completos", .data$target_url) ~ "complete",
grepl("rutas", .data$target_url) ~ "routes",
TRUE ~ NA_character_
),
levels = c("basic", "complete", "routes")
),

type = dplyr::case_when(
grepl("personas", .data$target_url) ~ "number_of_trips",
grepl("viajes", .data$target_url) ~ "origin-destination",
grepl("pernoctaciones", .data$target_url) ~ "overnight_stays",
grepl("calidad", .data$target_url) ~ "data_quality",
grepl("RSS\\.xml", .data$target_url) ~ "metadata",
TRUE ~ ""
type = factor(
dplyr::case_when(
grepl("personas", .data$target_url) ~ "number_of_trips",
grepl("viajes", .data$target_url) ~ "origin-destination",
grepl("pernoctaciones", .data$target_url) ~ "overnight_stays",
grepl("calidad", .data$target_url) ~ "data_quality",
grepl("RSS\\.xml", .data$target_url) ~ "metadata",
TRUE ~ NA_character_
),
levels = c(
"origin-destination",
"number_of_trips",
"overnight_stays",
"data_quality",
"metadata"
)
),

period = dplyr::case_when(
grepl("ficheros-diarios", .data$target_url) ~ "day",
grepl("meses-completos|mensual", .data$target_url) ~ "month",
TRUE ~ ""
period = factor(
dplyr::case_when(
grepl("ficheros-diarios", .data$target_url) ~ "day",
grepl("meses-completos|mensual", .data$target_url) ~ "month",
TRUE ~ NA_character_
),
levels = c("day", "month")
),

zones = dplyr::case_when(
grepl("distritos", .data$target_url) ~ "district",
grepl("municipios", .data$target_url) ~ "municipality",
grepl("GAU", .data$target_url) ~ "gau",
TRUE ~ ""
zones = factor(
dplyr::case_when(
grepl("distritos", .data$target_url) ~ "districts",
grepl("municipios", .data$target_url) ~ "municipalities",
grepl("GAU", .data$target_url) ~ "large_urban_areas",
TRUE ~ NA_character_
),
levels = c("districts", "municipalities", "large_urban_areas")
)
)

Expand Down Expand Up @@ -728,12 +770,16 @@ read_data_links_xml <- function(
Sys.Date()

if (needs_update) {
if (!quiet) message("Fetching latest data links xml")
if (!quiet) {
message("Fetching latest data links xml")
}
latest_data_links_xml_path <- latest_file_function(
data_dir = data_dir
)
} else {
if (!quiet) message("Using existing data links xml: ", latest_file)
if (!quiet) {
message("Using existing data links xml: ", latest_file)
}
latest_data_links_xml_path <- latest_file
}

Expand Down
41 changes: 22 additions & 19 deletions R/connect.R
Original file line number Diff line number Diff line change
@@ -1,24 +1,24 @@
#' Connect to data converted to `DuckDB` or hive-style `parquet` files
#'
#'
#' @description
#'
#'
#' `r lifecycle::badge("stable")`
#'
#'
#' This function allows the user to quickly connect to the data converted to DuckDB with the \link{spod_convert} function. This function simplifies the connection process. The user is free to use the `DBI` and `DuckDB` packages to connect to the data manually, or to use the `arrow` package to connect to the `parquet` files folder.
#'
#'
#' @param data_path a path to the `DuckDB` database file with '.duckdb' extension, or a path to the folder with `parquet` files. Eigher one should have been created with the \link{spod_convert} function.
#' @param target_table_name Default is `NULL`. When connecting to a folder of `parquet` files, this argument is ignored. When connecting to a `DuckDB` database, a `character` vector of length 1 with the table name to open from the database file. If not specified, it will be guessed from the `data_path` argument and from table names that are available in the database. If you have not manually interfered with the database, this should be guessed automatically and you do not need to specify it.
#' @inheritParams spod_duckdb_limit_resources
#' @inheritParams spod_duckdb_set_temp
#' @inheritParams global_quiet_param
#' @export
#' @return a `DuckDB` table connection object.
#'
#'
#' @examplesIf interactive()
#' \donttest{
#' # Set data dir for file downloads
#' spod_set_data_dir(tempdir())
#'
#'
#' # download and convert data
#' dates_1 <- c(start = "2020-02-17", end = "2020-02-18")
#' db_2 <- spod_convert(
Expand All @@ -27,22 +27,22 @@
#' dates = dates_1,
#' overwrite = TRUE
#' )
#'
#'
#' # now connect to the converted data
#' my_od_data_2 <- spod_connect(db_2)
#'
#'
#' # disconnect from the database
#' spod_disconnect(my_od_data_2)
#' }
#'
#'
spod_connect <- function(
data_path,
target_table_name = NULL,
quiet = FALSE,
max_mem_gb = max(4, spod_available_ram() - 4),
max_n_cpu = max(1, parallelly::availableCores() - 1),
temp_path = spod_get_temp_dir()
){
) {
# Validate imputs
checkmate::assert_access(data_path, access = 'r')
checkmate::assert_character(target_table_name, null.ok = TRUE)
Expand All @@ -59,7 +59,7 @@ spod_connect <- function(
duckdb_path <- ":memory:"
target_format <- "parquet"
}

con <- DBI::dbConnect(
duckdb::duckdb(),
dbdir = duckdb_path,
Expand All @@ -74,7 +74,7 @@ spod_connect <- function(

if (target_format == "duckdb") {
# try to guess the table name if not provided

if (is.null(target_table_name)) {
# try the same name as the file name
target_table_name <- gsub("\\..*", "", basename(duckdb_path)) # experimental
Expand All @@ -84,7 +84,9 @@ spod_connect <- function(
target_table_name <- target_table_name
} else {
# pick the first table that does not contain CSV
target_table_name <- tables_list[!stringr::str_detect(tables_list, "csv")][1]
target_table_name <- tables_list[
!stringr::str_detect(tables_list, "csv")
][1]
}
}
tbl_con <- dplyr::tbl(con, target_table_name)
Expand All @@ -93,21 +95,23 @@ spod_connect <- function(
if (target_format == "parquet") {
view_name <- basename(data_path)
parquet_glob_path <- fs::path(data_path, "**", "*.parquet")

DBI::dbExecute(
con,
dplyr::sql(
glue::glue("
CREATE VIEW {view_name} AS
glue::glue(
"
CREATE OR REPLACE VIEW {view_name} AS
SELECT *
FROM read_parquet(
'{parquet_glob_path}',
hive_partitioning = true
) ;
")
"
)
)
)

# set temp path for intermediate spilling
# https://duckdb.org/2024/07/09/memory-management.html#intermediate-spilling
# we do not do the same for duckdb above, as there the temp is automatically created in the same folder as the database
Expand All @@ -117,6 +121,5 @@ spod_connect <- function(
tbl_con <- dplyr::tbl(con, view_name)
}


return(tbl_con)
}
Loading