diff --git a/DESCRIPTION b/DESCRIPTION index 3f6aafe0..e6b7c888 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,7 +29,6 @@ Depends: R (>= 4.1.0) Imports: checkmate, - curl (>= 5.0.0), DBI, digest, dplyr, @@ -45,6 +44,7 @@ Imports: memuse, openssl, parallelly, + paws.storage (>= 0.4.0), purrr, readr, rlang, @@ -58,6 +58,7 @@ Suggests: flowmapper (>= 0.1.2), furrr, future, + future.mirai, hexSticker, mapSpain, quarto, diff --git a/NAMESPACE b/NAMESPACE index 9e87ff03..865500cc 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(spod_available_data) +export(spod_check_files) export(spod_cite) export(spod_codebook) export(spod_connect) diff --git a/NEWS.md b/NEWS.md index e191e7a3..809fbd87 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,13 +2,21 @@ ## New features -* `spod_quick_get_zones()` is a new function to quickly get municipality geometries to match with the data retrieved with `spod_quick_get_od()` [#163](https://github.com/rOpenSpain/spanishoddata/pull/163). Requests to get geometies are cached in memory of the current R session with `memoise` package. +* `spod_quick_get_zones()` is a new function to quickly get municipality geometries to match with the data retrieved with `spod_quick_get_od()` [#163](https://github.com/rOpenSpain/spanishoddata/pull/163). Requests to get geometies are cached in memory of the current R session with `memoise` package. This function is experimental, just as the `spod_quick_get_od()` function, as the API of the Spanish Ministry of Transport may change in the future. It is only intended for quick analysis in educational or other demonstration purposes, as it downloads very little data compared to the regular `spod_get_od()`, `spod_download()` and `spod_convert()` functions. + +* `spod_check_files()` function allows to check consistency of downloaded files with Amazon S3 checksums (PR [#165](https://github.com/rOpenSpain/spanishoddata/pull/165)). 
ETags for v1 data are stored with the package, and for v2 data they are fetched from Amazon S3. This function is experimental. + +## Improvements + +* Metadata is now fetched from Amazon S3 storage of the original data files, which allows validation of downloaded files ([#126](https://github.com/rOpenSpain/spanishoddata/issues/126)) with both size and checksum. PR [#165](https://github.com/rOpenSpain/spanishoddata/pull/165). ## Bug fixes -* `spod_quick_get_od()` is working again. We fixed it to work with the updated API of the Spanish Ministry of Transport (PR [#163](https://github.com/rOpenSpain/spanishoddata/pull/163), issue [#162](https://github.com/rOpenSpain/spanishoddata/issues/162)). It will remain experimental, as the API may change in the future. +* More reliable, but still multi-threaded data file downloads using base R `utils::download.file()` instead of `curl::multi_download()` which failed on some connections ([#127](https://github.com/rOpenSpain/spanishoddata/issues/127)), so now `curl` dependency is no longer required. PR [#165](https://github.com/rOpenSpain/spanishoddata/pull/165). + +* `spod_quick_get_od()` is working again. We fixed it to work with the updated API of the Spanish Ministry of Transport (PR [#163](https://github.com/rOpenSpain/spanishoddata/pull/163), issue [#162](https://github.com/rOpenSpain/spanishoddata/issues/162)). This function will remain experimental, just as the `spod_quick_get_zones()` function, as the API of the Spanish Ministry of Transport may change in the future. It is only intended for quick analysis in educational or other demonstration purposes, as it downloads very little data compared to the regular `spod_get_od()`, `spod_download()` and `spod_convert()` functions. 
-* `spod_convert()` can now accept `overwrite = 'update'` with `save_format = 'parquet'` ([#161](https://github.com/rOpenSpain/spanishoddata/pull/161)) previously it failed because of the incorrect check that asserted only `TRUE` or `FALSE` ([#160](https://github.com/rOpenSpain/spanishoddata/issues/160)) +* `spod_convert()` now accepts `overwrite = 'update'` with `save_format = 'parquet'` ([#161](https://github.com/rOpenSpain/spanishoddata/pull/161)); previously it failed because of the incorrect check that asserted only `TRUE` or `FALSE` ([#160](https://github.com/rOpenSpain/spanishoddata/issues/160)) # spanishoddata 0.1.1 diff --git a/R/available-data-s3.R b/R/available-data-s3.R new file mode 100644 index 00000000..d6e4f001 --- /dev/null +++ b/R/available-data-s3.R @@ -0,0 +1,187 @@ +#' Get available data list from Amazon S3 storage +#' +#' @description +#' +#' Get a table with links to available data files for the specified data version from Amazon S3 storage. +#' +#' @inheritParams spod_available_data +#' @inheritParams global_quiet_param +#' @return A tibble with download links (`target_url`), publication timestamps (`pub_ts`), file sizes in bytes (`file_size_bytes`), and ETag checksums (`etag`) of the files in the Amazon S3 bucket. 
+#' +#' @keywords internal spod_available_data_s3 <- function( + ver = c(1, 2), + force = FALSE, + quiet = FALSE, + data_dir = spod_get_data_dir() +) { + ver <- as.character(ver) + ver <- match.arg(ver) + metadata_folder <- glue::glue("{data_dir}/{spod_subfolder_metadata_cache()}") + + # if forcing, evict the in-session cache now + if (isTRUE(force)) { + memoise::forget(spod_available_data_s3_memoised) + } + + # shortcut: if we already have it memoised, return immediately + if (!force && memoise::has_cache(spod_available_data_s3_memoised)(ver)) { + if (!quiet) message("Using memory-cached available data from S3") + return(spod_available_data_s3_memoised(ver)) + } + + # no in-session data, check the on-disk RDS cache + pattern <- glue::glue("metadata_s3_v{ver}_\\d{{4}}-\\d{{2}}-\\d{{2}}\\.rds$") + rds_files <- fs::dir_ls( + path = metadata_folder, + type = "file", + regexp = pattern + ) |> + sort() + + latest_file <- utils::tail(rds_files, 1) + latest_date <- if (length(latest_file) == 1) { + stringr::str_extract(basename(latest_file), "\\d{4}-\\d{2}-\\d{2}") |> + as.Date() + } else { + NA + } + + needs_update <- isTRUE(force) || + length(rds_files) == 0 || + (!is.na(latest_date) && latest_date < Sys.Date()) + + if (!needs_update) { + if (!quiet) message("Using existing disk cache: ", latest_file) + return(readRDS(latest_file)) + } + + # if forcing, also clear old disk files + if (isTRUE(force) && length(rds_files) > 0) { + fs::file_delete(rds_files) + } + + # fetch via the memoised function (this will re-hit S3 if we forgot it) + if (!quiet) message("Fetching latest metadata from Amazon S3 (v", ver, ")...") + dat <- spod_available_data_s3_memoised(ver) + + # write a new RDS stamped with today's date + file_date <- format(Sys.Date(), "%Y-%m-%d") + out_path <- file.path( + metadata_folder, + glue::glue("metadata_s3_v{ver}_{file_date}.rds") + ) + saveRDS(dat, out_path) + if (!quiet) message("Cached new data to: ", out_path) + + dat +} + + 
+spod_available_data_s3_function <- function( + ver = c(1, 2) +) { + ver <- as.character(ver) + ver <- match.arg(ver) + + bucket <- paste0("mitma-movilidad-v", ver) + + # original_aws_region <- Sys.getenv("AWS_DEFAULT_REGION") + # original_aws_url_style <- Sys.getenv("AWS_S3_URL_STYLE") + # on.exit({ + # Sys.setenv( + # AWS_DEFAULT_REGION = original_aws_region, + # AWS_S3_URL_STYLE = original_aws_url_style + # ) + # }) + # Sys.setenv( + # AWS_DEFAULT_REGION = "eu-west-1", + # AWS_S3_URL_STYLE = "virtual" + # ) + + if (ver == 1) { + url_prefix <- "https://opendata-movilidad.mitma.es/" + } else { + url_prefix <- "https://movilidad-opendata.mitma.es/" + } + + s3 <- paws.storage::s3( + config = list( + credentials = list( + anonymous = TRUE + ) + ) + ) + + all_objects <- list_objects_v2_all(s3, bucket) + + # all_objects <- aws.s3::get_bucket_df( + # bucket = bucket, + # prefix = "", # root of bucket + # max = Inf # fetch beyond the default 1000 + # ) + + all_objects <- all_objects |> + dplyr::as_tibble() |> + dplyr::mutate( + target_url = paste0(url_prefix, .data$Key), + pub_ts = as.POSIXct( + .data$LastModified, + format = "%Y-%m-%dT%H:%M:%OSZ", + tz = "UTC" + ), + file_size_bytes = as.numeric(.data$Size), + etag = gsub('\\"', '', .data$ETag) + ) |> + dplyr::select( + .data$target_url, + .data$pub_ts, + .data$file_size_bytes, + .data$etag + ) + + return(all_objects) +} + +spod_available_data_s3_memoised <- memoise::memoise( + spod_available_data_s3_function +) + +list_objects_v2_all <- function(s3, bucket, prefix = "", max_keys = 10000) { + pages <- paws.storage::paginate( + s3$list_objects_v2( + Bucket = bucket, + Prefix = prefix, + MaxKeys = max_keys + ), + PageSize = max_keys + ) + + all_objects <- unlist( + lapply(pages, `[[`, "Contents"), + recursive = FALSE + ) + + metadata <- dplyr::tibble( + Key = vapply(all_objects, `[[`, character(1), "Key"), + LastModified = as.POSIXct( + vapply(all_objects, `[[`, numeric(1), "LastModified"), + origin = "1970-01-01", + tz = 
"UTC" + ), + Size = vapply(all_objects, `[[`, numeric(1), "Size"), + ETag = vapply(all_objects, `[[`, character(1), "ETag") + ) + + # S3 generate download urls + # urls <- metadata$Key |> + # purrr::map( + # ~ s3$generate_presigned_url( + # client_method = "get_object", + # params = list(Bucket = "mitma-movilidad-v1", Key = .x) + # ), + # .progress = TRUE + # ) + + return(metadata) +} diff --git a/R/available-data.R b/R/available-data.R index e1caf007..1d6bbdc3 100644 --- a/R/available-data.R +++ b/R/available-data.R @@ -1,12 +1,16 @@ #' Get available data list -#' +#' #' @description -#' +#' #' `r lifecycle::badge("stable")` -#' -#' Get a table with links to available data files for the specified data version. Optionally check (see arguments) if certain files have already been downloaded into the cache directory specified with SPANISH_OD_DATA_DIR environment variable (set by \link{spod_set_data_dir}) or a custom path specified with `data_dir` argument. -#' -#' @param ver Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards. +#' +#' Get a table with links to available data files for the specified data version. Optionally check (see arguments) the file size and availability of data files previously downloaded into the cache directory specified with SPANISH_OD_DATA_DIR environment variable (set by [spod_set_data_dir()]) or a custom path specified with `data_dir` argument. By default the data is fetched from Amazon S3 bucket where the data is stored. If that fails, the function falls back to downloading an XML file from the Spanish Ministry of Transport website. You can also control this behaviour with `use_s3` argument. +#' +#' @param ver Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards. See more details in codebooks with [spod_codebook()]. +#' @param check_local_files Logical. Whether to check if the local files exist and get the file size. Defaults to `FALSE`. 
+#' @param data_dir The directory where the data is stored. Defaults to the value returned by `spod_get_data_dir()`. +#' @param use_s3 `r lifecycle::badge("experimental")` Logical. If `TRUE`, use Amazon S3 to get available data list, which does not require downloading the XML file and caching it locally, which may be a bit faster. If `FALSE`, use the XML file to get available data list. +#' @param force Logical. If `TRUE`, force re-download of metadata. For Amazon S3 this queries the S3 bucket for the XML file it re-downloads. If `FALSE`, only update the available data list if it is older than 1 day. #' @inheritParams spod_available_data_v1 #' @inheritParams global_quiet_param #' @return A tibble with links, release dates of files in the data, dates of data coverage, local paths to files, and the download status. @@ -16,47 +20,65 @@ #' \item{file_extension}{\code{character}. The file extension of the data file (e.g., 'tar', 'gz').} #' \item{data_ym}{\code{Date}. The year and month of the data coverage, if available.} #' \item{data_ymd}{\code{Date}. The specific date of the data coverage, if available.} -#' \item{local_path}{\code{character}. The local file path where the data is stored.} +#' \item{local_path}{\code{character}. The local file path where the data is (or going to be) stored.} #' \item{downloaded}{\code{logical}. Indicator of whether the data file has been downloaded locally. 
This is only available if `check_local_files` is `TRUE`.} #' } #' @export #' @examplesIf interactive() #' \donttest{ -#' +#' #' # Set data dir for file downloads #' spod_set_data_dir(tempdir()) -#' +#' #' # Get available data list for v1 (2020-2021) data #' spod_available_data(ver = 1) -#' +#' #' # Get available data list for v2 (2022 onwards) data #' spod_available_data(ver = 2) -#' +#' #' # Get available data list for v2 (2022 onwards) data #' # while also checking for local files that are already downloaded #' spod_available_data(ver = 2, check_local_files = TRUE) #' } -#' +#' spod_available_data <- function( ver = 2, check_local_files = FALSE, quiet = FALSE, - data_dir = spod_get_data_dir() + data_dir = spod_get_data_dir(), + use_s3 = TRUE, + force = FALSE ) { # Validate input checkmate::assertIntegerish(ver, max.len = 1) if (!ver %in% c(1, 2)) { - stop("Invalid version number. Must be 1 (for v1 2020-2021 data) or 2 (for v2 2022 onwards).") + stop( + "Invalid version number. Must be 1 (for v1 2020-2021 data) or 2 (for v2 2022 onwards)." 
+ ) } checkmate::assert_flag(check_local_files) checkmate::assert_flag(quiet) checkmate::assert_directory_exists(data_dir, access = "rw") if (ver == 1) { - return(spod_available_data_v1(data_dir = data_dir, check_local_files = check_local_files, quiet = quiet)) + available_data <- spod_available_data_v1( + data_dir = data_dir, + check_local_files = check_local_files, + quiet = quiet, + use_s3 = use_s3, + force = force + ) } else if (ver == 2) { - return(spod_available_data_v2(data_dir = data_dir, check_local_files = check_local_files, quiet = quiet)) + available_data <- spod_available_data_v2( + data_dir = data_dir, + check_local_files = check_local_files, + quiet = quiet, + use_s3 = use_s3, + force = force + ) } + + return(available_data) } #' Get latest file list from the XML for MITMA open mobility data v1 (2020-2021) @@ -70,32 +92,38 @@ spod_get_latest_v1_file_list <- function( data_dir = spod_get_data_dir(), xml_url = "https://opendata-movilidad.mitma.es/RSS.xml" ) { -if (!dir.exists(data_dir)) { - fs::dir_create(data_dir) -} + if (!dir.exists(data_dir)) { + fs::dir_create(data_dir) + } -current_date <- format(Sys.Date(), format = "%Y-%m-%d") -current_filename <- glue::glue("{data_dir}/{spod_subfolder_metadata_cache()}/data_links_v1_{current_date}.xml") + current_date <- format(Sys.Date(), format = "%Y-%m-%d") + current_filename <- glue::glue( + "{data_dir}/{spod_subfolder_metadata_cache()}/data_links_v1_{current_date}.xml" + ) -# ensure dir exists -if (!dir.exists(dirname(current_filename))) { - fs::dir_create(dirname(current_filename), recurse = TRUE) -} + # ensure dir exists + if (!dir.exists(dirname(current_filename))) { + fs::dir_create(dirname(current_filename), recurse = TRUE) + } -message("Saving the file to: ", current_filename) -xml_requested <- curl::multi_download( - urls = xml_url, - destfiles = current_filename -) -return(current_filename) + message("Saving the file to: ", current_filename) + utils::download.file(xml_url, current_filename, mode = 
"wb") + # disable curl::multi_download() for now + # xml_requested <- curl::multi_download( + # urls = xml_url, + # destfiles = current_filename + # ) + if (!fs::file_exists(current_filename)) { + stop("Failed to download XML file.") + } + return(current_filename) } #' Get the available v1 data list #' #' This function provides a table of the available data list of MITMA v1 (2020-2021), both remote and local. #' -#' @param data_dir The directory where the data is stored. Defaults to the value returned by `spod_get_data_dir()`. -#' @param check_local_files Whether to check if the local files exist. Defaults to `FALSE`. +#' @inheritParams spod_available_data #' @inheritParams global_quiet_param #' @inherit spod_available_data return #' @importFrom rlang .data @@ -104,128 +132,284 @@ spod_available_data_v1 <- function( data_dir = spod_get_data_dir(), # check_local_files (below) is FALSE by default to avoid excessive filesystem access, perhaps should be TRUE. Download functions use it to load the xml file, but we probably do not want the script to check all local cache directories every time we run a get data function. Perhaps it is better to offload this check to a separate function and have a csv file or some other way to keep track of the files that were downloaded and cached. An output of curl::multi_download() could be used for this purpose. 
check_local_files = FALSE, - quiet = FALSE) { - -metadata_folder <- glue::glue("{data_dir}/{spod_subfolder_metadata_cache()}") -if(!dir.exists(metadata_folder)){ - fs::dir_create(metadata_folder) -} - -xml_files_list <- fs::dir_ls(metadata_folder, type = "file", regexp = "data_links_v1") |> sort() -if (length(xml_files_list) == 0) { - if (isFALSE(quiet)) { - message("No data links xml files found, getting latest v1 data links xml") - } - latest_data_links_xml_path <- spod_get_latest_v1_file_list(data_dir = data_dir) -} else { - latest_data_links_xml_path <- utils::tail(xml_files_list, 1) -} - -# Check if the XML file is 1 day old or older from its name -file_date <- stringr::str_extract(latest_data_links_xml_path, "[0-9]{4}-[0-9]{2}-[0-9]{2}") - -if (file_date < format(Sys.Date(), format = "%Y-%m-%d")) { - if (isFALSE(quiet)) { - message("File list xml is 1 day old or older, getting latest data links xml") - } - latest_data_links_xml_path <- spod_get_latest_v1_file_list(data_dir = data_dir) -} else { - if (isFALSE(quiet)) { - message("Using existing data links xml: ", latest_data_links_xml_path) + use_s3 = TRUE, + force = FALSE, + quiet = FALSE +) { + metadata_folder <- glue::glue("{data_dir}/{spod_subfolder_metadata_cache()}") + if (!dir.exists(metadata_folder)) { + fs::dir_create(metadata_folder) } -} -if (length(latest_data_links_xml_path) == 0) { - if (isFALSE(quiet)) { - message("Getting latest data links xml") + if (use_s3) { + files_table <- tryCatch( + { + spod_available_data_s3(ver = 1, force = force, quiet = quiet) + }, + error = function(e) { + message( + "S3 fetch failed (", + e$message, + "); falling back to XML sequence." 
+ ) + read_data_links_xml( + metadata_folder = metadata_folder, + data_dir = data_dir, + force = force, + quiet = quiet, + latest_file_function = spod_get_latest_v1_file_list + ) + } + ) + } else { + files_table <- read_data_links_xml( + metadata_folder = metadata_folder, + data_dir = data_dir, + force = force, + quiet = quiet, + latest_file_function = spod_get_latest_v1_file_list + ) } - latest_data_links_xml_path <- spod_get_latest_v1_file_list(data_dir = data_dir) -} -x_xml <- xml2::read_xml(latest_data_links_xml_path) - -files_table <- tibble::tibble( - target_url = xml2::xml_find_all(x = x_xml, xpath = "//link") |> xml2::xml_text(), - pub_date = xml2::xml_find_all(x = x_xml, xpath = "//pubDate") |> xml2::xml_text() -) - -files_table$pub_ts <- lubridate::dmy_hms(files_table$pub_date) -files_table$file_extension <- tools::file_ext(files_table$target_url) -files_table <- files_table[files_table$file_extension != "", ] -files_table$pub_date <- NULL - -files_table$data_ym <- lubridate::ym(stringr::str_extract(files_table$target_url, "[0-9]{4}-[0-9]{2}")) -files_table$data_ymd <- lubridate::ymd(stringr::str_extract(files_table$target_url, "[0-9]{8}")) -# order by pub_ts -files_table <- files_table[order(files_table$pub_ts, decreasing = TRUE), ] -files_table$local_path <- file.path( - data_dir, - stringr::str_replace(files_table$target_url, ".*mitma.es/", spod_subfolder_raw_data_cache(ver = 1)) -) + files_table$file_extension <- tools::file_ext(files_table$target_url) + files_table <- files_table[files_table$file_extension != "", ] -files_table$local_path <- stringr::str_replace_all(files_table$local_path, "\\/\\/\\/|\\/\\/", "/") + files_table$data_ym <- lubridate::ym(stringr::str_extract( + files_table$target_url, + "[0-9]{4}-[0-9]{2}" + )) + files_table$data_ymd <- lubridate::ymd(stringr::str_extract( + files_table$target_url, + "[0-9]{8}" + )) + # order by pub_ts + files_table <- files_table[order(files_table$pub_ts, decreasing = TRUE), ] + files_table$local_path 
<- file.path( + data_dir, + stringr::str_replace( + files_table$target_url, + ".*mitma.es/", + spod_subfolder_raw_data_cache(ver = 1) + ) + ) -# change path for daily data files to be in hive-style format -files_table$local_path <- gsub("([0-9]{4})-([0-9]{2})\\/[0-9]{6}([0-9]{2})_", "year=\\1\\/month=\\2\\/day=\\3\\/", files_table$local_path) + files_table$local_path <- stringr::str_replace_all( + files_table$local_path, + "\\/\\/\\/|\\/\\/", + "/" + ) -# fix paths for files that are in '0000-referencia' folder -files_table$local_path <- gsub("0000-referencia\\/([0-9]{4})([0-9]{2})([0-9]{2})_", "year=\\1\\/month=\\2\\/day=\\3\\/", files_table$local_path) + # change path for daily data files to be in hive-style format + files_table$local_path <- gsub( + "([0-9]{4})-([0-9]{2})\\/[0-9]{6}([0-9]{2})_", + "year=\\1\\/month=\\2\\/day=\\3\\/", + files_table$local_path + ) -# replace 2 digit month with 1 digit month -files_table$local_path <- gsub("month=0([1-9])", "month=\\1", files_table$local_path) + # fix paths for files that are in '0000-referencia' folder + files_table$local_path <- gsub( + "0000-referencia\\/([0-9]{4})([0-9]{2})([0-9]{2})_", + "year=\\1\\/month=\\2\\/day=\\3\\/", + files_table$local_path + ) -# replace 2 digit day with 1 digit day -files_table$local_path <- gsub("day=0([1-9])", "day=\\1", files_table$local_path) + # replace 2 digit month with 1 digit month + files_table$local_path <- gsub( + "month=0([1-9])", + "month=\\1", + files_table$local_path + ) -# change txt.gz to csv.gz -files_table$local_path <- gsub("\\.txt\\.gz", "\\.csv\\.gz", files_table$local_path) + # replace 2 digit day with 1 digit day + files_table$local_path <- gsub( + "day=0([1-9])", + "day=\\1", + files_table$local_path + ) -# replace all municipal data download links with districts links -# this is to address the bugs described in detail in: -# http://www.ekotov.pro/mitma-data-issues/issues/011-v1-tpp-mismatch-zone-ids-in-table-and-spatial-data.html -# 
http://www.ekotov.pro/mitma-data-issues/issues/012-v1-tpp-district-files-in-municipality-folders.html -# the decision was to use distrcit data and aggregate it to replicate municipal data -files_table$target_url <- gsub("mitma-municipios", "mitma-distritos", files_table$target_url) -files_table$target_url <- gsub("mitma_municipio", "mitma_distrito", files_table$target_url) + # change txt.gz to csv.gz + files_table$local_path <- gsub( + "\\.txt\\.gz", + "\\.csv\\.gz", + files_table$local_path + ) -# add known file sizes from cached data -file_sizes <- readr::read_csv(system.file("extdata", "url_file_sizes_v1.txt.gz", package = "spanishoddata"), show_col_types = FALSE) -files_table <- dplyr::left_join(files_table, file_sizes, by = "target_url") + # replace all municipal data download links with districts links + # this is to address the bugs described in detail in: + # http://www.ekotov.pro/mitma-data-issues/issues/011-v1-tpp-mismatch-zone-ids-in-table-and-spatial-data.html + # http://www.ekotov.pro/mitma-data-issues/issues/012-v1-tpp-district-files-in-municipality-folders.html + # the decision was to use distrcit data and aggregate it to replicate municipal data + files_table$target_url <- gsub( + "mitma-municipios", + "mitma-distritos", + files_table$target_url + ) + files_table$target_url <- gsub( + "mitma_municipio", + "mitma_distrito", + files_table$target_url + ) -# if there are files with missing sizes, impute them -if (any(is.na(files_table$remote_file_size_mb))) { - # impute uknown file sizes - # primitive file categorisation - # Extract file category from the target URL files_table <- files_table |> dplyr::mutate( - file_category = stringr::str_extract(.data$target_url, "\\/maestra(\\d)-mitma-(distritos|municipios)\\/(ficheros-diarios|meses-completos)\\/") + study = dplyr::case_when( + grepl("maestra", .data$target_url) ~ "basic", + TRUE ~ "" + ), + + type = dplyr::case_when( + grepl("maestra2", .data$target_url) ~ "number_of_trips", + grepl("maestra1", 
.data$target_url) ~ "origin-destination", + grepl("RSS\\.xml", .data$target_url) ~ "metadata", + grepl("zonificacion", .data$target_url) ~ "zones", + grepl("relacion", .data$target_url) ~ "relations", + grepl("index\\.html", .data$target_url) ~ "index", + grepl(".\\pdf", .data$target_url) ~ "documentation", + TRUE ~ "" + ), + + period = dplyr::case_when( + grepl("ficheros-diarios", .data$target_url) ~ "day", + grepl("meses-completos|mensual", .data$target_url) ~ "month", + TRUE ~ "" + ), + + zones = dplyr::case_when( + grepl("distrito", .data$target_url) ~ "district", + grepl("municipio", .data$target_url) ~ "municipality", + TRUE ~ "" + ) ) - # Set other category for non-categorized files - files_table$file_category[is.na(files_table$file_category)] <- "other" + # add known file sizes from cached data + if (use_s3) { + # replace remote file sizes for v1 + replacement_file_sizes_distr <- files_table |> + dplyr::filter(grepl("mitma-distr", .data$local_path)) |> + dplyr::select(.data$target_url, .data$file_size_bytes) + replaced_file_sizes_municip <- files_table |> + dplyr::filter(grepl("mitma-municip", .data$local_path)) |> + dplyr::select(-"file_size_bytes") |> + dplyr::left_join(replacement_file_sizes_distr, by = "target_url") + files_table_replaced_file_sizes <- files_table |> + dplyr::filter(!grepl("mitma-municip", .data$local_path)) |> + dplyr::bind_rows(replaced_file_sizes_municip) |> + dplyr::arrange(dplyr::desc(.data$pub_ts)) + files_table <- files_table_replaced_file_sizes + + files_table$remote_file_size_mb <- round( + files_table$file_size_bytes / 1024^2, + 2 + ) - # Calculate mean file sizes by category - size_by_file_category <- files_table |> - dplyr::group_by(.data$file_category) |> - dplyr::summarise(mean_file_size_mb = mean(.data$remote_file_size_mb, na.rm = TRUE)) + file_sizes <- readRDS( + system.file( + "extdata", + "available_data_v1.rds", + package = "spanishoddata" + ) + ) + files_table <- dplyr::left_join( + files_table |> 
dplyr::select(-"file_size_bytes"), + file_sizes |> + dplyr::select( + "target_url", + "etag", + "true_etag", + file_size_bytes = "true_remote_file_size_bytes" + ), + by = c("target_url", "etag") + ) |> + dplyr::relocate("file_size_bytes", .after = "pub_ts") |> + dplyr::mutate( + etag = dplyr::if_else( + condition = !is.na(.data$true_etag), + true = .data$true_etag, + false = .data$etag + ) + ) |> + dplyr::select(-"true_etag") + } else { + file_sizes <- readRDS( + system.file( + "extdata", + "available_data_v1.rds", + package = "spanishoddata" + ) + ) + files_table <- dplyr::left_join( + files_table |> dplyr::select(-"file_size_bytes"), + file_sizes |> + dplyr::select( + "target_url", + "etag", + "true_etag", + file_size_bytes = "true_remote_file_size_bytes" + ), + by = c("target_url", "etag") + ) |> + dplyr::relocate("file_size_bytes", .after = "pub_ts") |> + dplyr::mutate( + etag = dplyr::if_else( + condition = !is.na(.data$true_etag), + true = .data$true_etag, + false = .data$etag + ) + ) |> + dplyr::select(-"true_etag") - # Impute missing file sizes - files_table <- files_table |> - dplyr::left_join(size_by_file_category, by = "file_category") - files_table$remote_file_size_mb[is.na(files_table$remote_file_size_mb)] <- mean(files_table$mean_file_size_mb) + # if there are files with missing sizes, impute them + if (any(is.na(files_table$remote_file_size_mb))) { + # impute uknown file sizes + # primitive file categorisation + # Extract file category from the target URL + files_table <- files_table |> + dplyr::mutate( + file_category = stringr::str_extract( + .data$target_url, + "\\/maestra(\\d)-mitma-(distritos|municipios)\\/(ficheros-diarios|meses-completos)\\/" + ) + ) - # Clean up temporary columns - files_table <- files_table |> - dplyr::select(-"mean_file_size_mb", -"file_category") -} + # Set other category for non-categorized files + files_table$file_category[is.na(files_table$file_category)] <- "other" -# now check if any of local files exist -if( 
check_local_files == TRUE){ - files_table$downloaded <- fs::file_exists(files_table$local_path) -} + # Calculate mean file sizes by category + size_by_file_category <- files_table |> + dplyr::group_by(.data$file_category) |> + dplyr::summarise( + mean_file_size_mb = mean(.data$remote_file_size_mb, na.rm = TRUE) + ) + + # Impute missing file sizes + files_table <- files_table |> + dplyr::left_join(size_by_file_category, by = "file_category") + files_table$remote_file_size_mb[is.na( + files_table$remote_file_size_mb + )] <- mean(files_table$mean_file_size_mb) + + # Clean up temporary columns + files_table <- files_table |> + dplyr::select(-"mean_file_size_mb", -"file_category") + } + } -return(files_table) + # check file sizes + if (check_local_files == TRUE) { + files_table <- files_table |> + dplyr::mutate( + local_file_size = fs::file_size(.data$local_path) + ) |> + dplyr::mutate( + downloaded = dplyr::if_else( + condition = is.na(.data$local_file_size), + true = FALSE, + false = TRUE + ) + ) + } + + return(files_table) } #' Get latest file list from the XML for MITMA open mobility data v2 (2022 onwards) @@ -236,15 +420,17 @@ return(files_table) #' @return The path to the downloaded XML file. 
#' @keywords internal spod_get_latest_v2_file_list <- function( - data_dir = spod_get_data_dir(), - xml_url = "https://movilidad-opendata.mitma.es/RSS.xml" + data_dir = spod_get_data_dir(), + xml_url = "https://movilidad-opendata.mitma.es/RSS.xml" ) { if (!dir.exists(data_dir)) { fs::dir_create(data_dir) } current_date <- format(Sys.Date(), format = "%Y-%m-%d") - current_filename <- glue::glue("{data_dir}/{spod_subfolder_metadata_cache()}/data_links_v2_{current_date}.xml") + current_filename <- glue::glue( + "{data_dir}/{spod_subfolder_metadata_cache()}/data_links_v2_{current_date}.xml" + ) # ensure dir exists if (!dir.exists(dirname(current_filename))) { @@ -252,10 +438,16 @@ spod_get_latest_v2_file_list <- function( } message("Saving the file to: ", current_filename) - xml_requested <- curl::multi_download( - urls = xml_url, - destfiles = current_filename - ) + utils::download.file(xml_url, current_filename, mode = "wb") + # disable curl::multi_download() for now + # xml_requested <- curl::multi_download( + # urls = xml_url, + # destfiles = current_filename + # ) + if (!fs::file_exists(current_filename)) { + stop("Failed to download the XML file.") + } + return(current_filename) } @@ -272,141 +464,288 @@ spod_get_latest_v2_file_list <- function( spod_available_data_v2 <- function( data_dir = spod_get_data_dir(), check_local_files = FALSE, + use_s3 = TRUE, + force = FALSE, quiet = FALSE ) { - metadata_folder <- glue::glue("{data_dir}/{spod_subfolder_metadata_cache()}") - if(!dir.exists(metadata_folder)){ + if (!dir.exists(metadata_folder)) { fs::dir_create(metadata_folder) } - xml_files_list <- fs::dir_ls(metadata_folder, type = "file", regexp = "data_links_v2") |> sort() - if (length(xml_files_list) == 0) { - if (isFALSE(quiet)) { - message("No data links xml files found, getting latest v2 data links xml.") - } - latest_data_links_xml_path <- spod_get_latest_v2_file_list(data_dir = data_dir) - } else { - latest_data_links_xml_path <- utils::tail(xml_files_list, 
1) - } - - # Check if the XML file is 1 day old or older from its name - file_date <- stringr::str_extract(latest_data_links_xml_path, "[0-9]{4}-[0-9]{2}-[0-9]{2}") - - if (file_date < format(Sys.Date(), format = "%Y-%m-%d")) { - if (isFALSE(quiet)) { - message("File list xml is 1 day old or older, getting latest data links xml") - } - latest_data_links_xml_path <- spod_get_latest_v2_file_list(data_dir = data_dir) + if (use_s3) { + files_table <- tryCatch( + { + spod_available_data_s3(ver = 2, force = force, quiet = quiet) + }, + error = function(e) { + message( + "S3 fetch failed (", + e$message, + "); falling back to XML sequence." + ) + read_data_links_memoised( + metadata_folder = metadata_folder, + data_dir = data_dir, + force = force, + quiet = quiet, + latest_file_function = spod_get_latest_v2_file_list + ) + } + ) } else { - if (isFALSE(quiet)) { - message("Using existing data links xml: ", latest_data_links_xml_path) - } - } - - if (length(latest_data_links_xml_path) == 0) { - if (isFALSE(quiet)) { - message("Getting latest data links xml") - } - latest_data_links_xml_path <- spod_get_latest_v2_file_list(data_dir = data_dir) + files_table <- read_data_links_memoised( + metadata_folder = metadata_folder, + data_dir = data_dir, + force = force, + quiet = quiet, + latest_file_function = spod_get_latest_v2_file_list + ) } - - - x_xml <- xml2::read_xml(latest_data_links_xml_path) - files_table <- tibble::tibble( - target_url = xml2::xml_find_all(x = x_xml, xpath = "//link") |> xml2::xml_text(), - pub_date = xml2::xml_find_all(x = x_xml, xpath = "//pubDate") |> xml2::xml_text() - ) - - files_table$pub_ts <- lubridate::dmy_hms(files_table$pub_date) files_table$file_extension <- tools::file_ext(files_table$target_url) files_table <- files_table[files_table$file_extension != "", ] - files_table$pub_date <- NULL - files_table$data_ym <- lubridate::ym(stringr::str_extract(files_table$target_url, "[0-9]{4}-[0-9]{2}")) - files_table$data_ymd <- 
lubridate::ymd(stringr::str_extract(files_table$target_url, "[0-9]{8}")) + files_table$data_ym <- lubridate::ym(stringr::str_extract( + files_table$target_url, + "[0-9]{4}-[0-9]{2}" + )) + files_table$data_ymd <- lubridate::ymd(stringr::str_extract( + files_table$target_url, + "[0-9]{8}" + )) # order by pub_ts files_table <- files_table[order(files_table$pub_ts, decreasing = TRUE), ] files_table$local_path <- file.path( data_dir, - stringr::str_replace(files_table$target_url, ".*mitma.es/", - spod_subfolder_raw_data_cache(ver = 2)) + stringr::str_replace( + files_table$target_url, + ".*mitma.es/", + spod_subfolder_raw_data_cache(ver = 2) + ) + ) + files_table$local_path <- stringr::str_replace_all( + files_table$local_path, + "\\/\\/\\/|\\/\\/", + "/" ) - files_table$local_path <- stringr::str_replace_all(files_table$local_path, "\\/\\/\\/|\\/\\/", "/") # change path for daily data files to be in hive-style format # TODO: check if this is needed for estudios completo and rutas - files_table$local_path <- gsub("([0-9]{4})-([0-9]{2})\\/[0-9]{6}([0-9]{2})_", "year=\\1\\/month=\\2\\/day=\\3\\/", files_table$local_path) + files_table$local_path <- gsub( + "([0-9]{4})-([0-9]{2})\\/[0-9]{6}([0-9]{2})_", + "year=\\1\\/month=\\2\\/day=\\3\\/", + files_table$local_path + ) # replace 2 digit month with 1 digit month - files_table$local_path <- gsub("month=0([1-9])", "month=\\1", files_table$local_path) + files_table$local_path <- gsub( + "month=0([1-9])", + "month=\\1", + files_table$local_path + ) # replace 2 digit day with 1 digit day - files_table$local_path <- gsub("day=0([1-9])", "day=\\1", files_table$local_path) + files_table$local_path <- gsub( + "day=0([1-9])", + "day=\\1", + files_table$local_path + ) # lowercase GAU to avoid problems with case-sensitive matching files_table$local_path <- gsub("GAU", "gau", files_table$local_path) - # now check if any of local files exist - if( check_local_files == TRUE){ - files_table$downloaded <- 
fs::file_exists(files_table$local_path) - } + files_table <- files_table |> + dplyr::mutate( + study = dplyr::case_when( + grepl("estudios_basicos", .data$target_url) ~ "basic", + grepl("estudios_completos", .data$target_url) ~ "complete", + grepl("rutas", .data$target_url) ~ "routes", + TRUE ~ "" + ), + + type = dplyr::case_when( + grepl("personas", .data$target_url) ~ "number_of_trips", + grepl("viajes", .data$target_url) ~ "origin-destination", + grepl("pernoctaciones", .data$target_url) ~ "overnight_stays", + grepl("calidad", .data$target_url) ~ "data_quality", + grepl("RSS\\.xml", .data$target_url) ~ "metadata", + TRUE ~ "" + ), + + period = dplyr::case_when( + grepl("ficheros-diarios", .data$target_url) ~ "day", + grepl("meses-completos|mensual", .data$target_url) ~ "month", + TRUE ~ "" + ), + + zones = dplyr::case_when( + grepl("distritos", .data$target_url) ~ "district", + grepl("municipios", .data$target_url) ~ "municipality", + grepl("GAU", .data$target_url) ~ "gau", + TRUE ~ "" + ) + ) # add known file sizes from cached data - file_sizes <- readr::read_csv(system.file("extdata", "url_file_sizes_v2.txt.gz", package = "spanishoddata"), show_col_types = FALSE) - files_table <- dplyr::left_join(files_table, file_sizes, by = "target_url") + if (use_s3) { + files_table$remote_file_size_mb <- round( + files_table$file_size_bytes / 1024^2, + 2 + ) + } else { + file_sizes <- readr::read_csv( + system.file( + "extdata", + "url_file_sizes_v2.txt.gz", + package = "spanishoddata" + ), + show_col_types = FALSE + ) + files_table <- dplyr::left_join(files_table, file_sizes, by = "target_url") + + # if there are files with missing sizes, impute them + if (any(is.na(files_table$remote_file_size_mb))) { + # impute uknown file sizes + # primitive file categorisation + files_table <- files_table |> + dplyr::mutate( + cleaned_url = stringr::str_remove_all( + .data$target_url, + "/[0-9]{4}[-_][0-9]{2}[-_][0-9]{2}|/[0-9]{6,8}" + ) |> + stringr::str_remove("/[^/]+$"), + 
file_category = dplyr::case_when( + stringr::str_detect(.data$cleaned_url, "calidad") ~ "quality", + stringr::str_detect(.data$cleaned_url, "rutas") ~ "routes", + stringr::str_detect(.data$cleaned_url, "estudios_basicos") ~ + paste0( + "basic_studies_", + dplyr::case_when( + stringr::str_detect(.data$cleaned_url, "por-distritos") ~ + "district_", + stringr::str_detect(.data$cleaned_url, "por-municipios") ~ + "municipal_", + stringr::str_detect(.data$cleaned_url, "por-GAU") ~ "GAU_", + TRUE ~ "unknown_" + ), + dplyr::case_when( + stringr::str_detect(.data$cleaned_url, "viajes") ~ "trips_", + stringr::str_detect(.data$cleaned_url, "personas") ~ + "people_", + stringr::str_detect(.data$cleaned_url, "pernoctaciones") ~ + "overnight_", + TRUE ~ "unknown_" + ), + ifelse( + stringr::str_detect(.data$cleaned_url, "ficheros-diarios"), + "daily", + "monthly" + ) + ), + TRUE ~ "other" + ) + ) |> + dplyr::select(-"cleaned_url") + + # Calculate mean file sizes by category + size_by_file_category <- files_table |> + dplyr::group_by(.data$file_category) |> + dplyr::summarise( + mean_file_size_mb = mean(.data$remote_file_size_mb, na.rm = TRUE) + ) + + # Impute missing file sizes + files_table <- dplyr::left_join( + files_table, + size_by_file_category, + by = "file_category" + ) + files_table <- files_table |> + dplyr::mutate( + size_imputed = ifelse(is.na(.data$remote_file_size_mb), TRUE, FALSE) + ) + if ( + length(files_table$remote_file_size_mb[is.na( + files_table$remote_file_size_mb + )]) > + 0 + ) { + files_table <- files_table |> + dplyr::mutate( + remote_file_size_mb = ifelse( + is.na(.data$remote_file_size_mb), + .data$mean_file_size_mb, + .data$remote_file_size_mb + ) + ) + } + files_table$mean_file_size_mb <- NULL + files_table$file_category <- NULL + } else { + files_table$size_imputed <- FALSE + } + } - # if there are files with missing sizes, impute them - if (any(is.na(files_table$remote_file_size_mb))) { - # impute uknown file sizes - # primitive file 
categorisation + # check file sizes + if (check_local_files == TRUE) { files_table <- files_table |> dplyr::mutate( - cleaned_url = stringr::str_remove_all(.data$target_url, "/[0-9]{4}[-_][0-9]{2}[-_][0-9]{2}|/[0-9]{6,8}") |> - stringr::str_remove("/[^/]+$"), - file_category = dplyr::case_when( - stringr::str_detect(.data$cleaned_url, "calidad") ~ "quality", - stringr::str_detect(.data$cleaned_url, "rutas") ~ "routes", - stringr::str_detect(.data$cleaned_url, "estudios_basicos") ~ paste0( - "basic_studies_", - dplyr::case_when( - stringr::str_detect(.data$cleaned_url, "por-distritos") ~ "district_", - stringr::str_detect(.data$cleaned_url, "por-municipios") ~ "municipal_", - stringr::str_detect(.data$cleaned_url, "por-GAU") ~ "GAU_", - TRUE ~ "unknown_" - ), - dplyr::case_when( - stringr::str_detect(.data$cleaned_url, "viajes") ~ "trips_", - stringr::str_detect(.data$cleaned_url, "personas") ~ "people_", - stringr::str_detect(.data$cleaned_url, "pernoctaciones") ~ "overnight_", - TRUE ~ "unknown_" - ), - ifelse(stringr::str_detect(.data$cleaned_url, "ficheros-diarios"), "daily", "monthly") - ), - TRUE ~ "other" - ) + local_file_size = fs::file_size(.data$local_path) ) |> - dplyr::select(-"cleaned_url") + dplyr::mutate( + downloaded = dplyr::if_else( + condition = is.na(.data$local_file_size), + true = FALSE, + false = TRUE + ) + ) + } - # Calculate mean file sizes by category - size_by_file_category <- files_table |> - dplyr::group_by(.data$file_category) |> - dplyr::summarise(mean_file_size_mb = mean(.data$remote_file_size_mb, na.rm = TRUE)) + return(files_table) +} - # Impute missing file sizes - files_table <- dplyr::left_join(files_table, size_by_file_category, by = "file_category") - files_table <- files_table |> - dplyr::mutate(size_imputed = ifelse(is.na(.data$remote_file_size_mb), TRUE, FALSE)) - if(length(files_table$remote_file_size_mb[is.na(files_table$remote_file_size_mb)]) > 0){ - files_table <- files_table |> - dplyr::mutate(remote_file_size_mb = 
ifelse(is.na(.data$remote_file_size_mb), .data$mean_file_size_mb, .data$remote_file_size_mb)) - } - files_table$mean_file_size_mb <- NULL - files_table$file_category <- NULL +read_data_links_xml <- function( + metadata_folder, + data_dir, + force = FALSE, + quiet = FALSE, + latest_file_function +) { + xml_files_list <- fs::dir_ls( + metadata_folder, + type = "file", + regexp = "data_links_v1" + ) |> + sort() + latest_file <- utils::tail(xml_files_list, 1) + + needs_update <- isTRUE(force) || + length(xml_files_list) == 0 || + as.Date( + stringr::str_extract(latest_file, "\\d{4}-\\d{2}-\\d{2}") + ) < + Sys.Date() + + if (needs_update) { + if (!quiet) message("Fetching latest data links xml") + latest_data_links_xml_path <- latest_file_function( + data_dir = data_dir + ) } else { - files_table$size_imputed <- FALSE + if (!quiet) message("Using existing data links xml: ", latest_file) + latest_data_links_xml_path <- latest_file } - return(files_table) + x_xml <- xml2::read_xml(latest_data_links_xml_path) + files_table <- tibble::tibble( + target_url = xml2::xml_find_all(x_xml, "//link") |> xml2::xml_text(), + pub_date = xml2::xml_find_all(x_xml, "//pubDate") |> xml2::xml_text() + ) + files_table$pub_ts <- lubridate::dmy_hms(files_table$pub_date) + files_table$pub_date <- NULL + + files_table } + +read_data_links_memoised <- memoise::memoise(read_data_links_xml) diff --git a/R/check-files.R b/R/check-files.R new file mode 100644 index 00000000..0313f7e5 --- /dev/null +++ b/R/check-files.R @@ -0,0 +1,263 @@ +#' Check cached files consistency against checksums from S3 +#' +#' @description +#' +#' `r lifecycle::badge("experimental")` +#' +#' **WARNING: The checks may fail for May 2022 data and for some 2024 data, as the remote cheksums that are used for checking the file consistency are incorrect. 
We are working on solving this in future updates, for now, kindly rely on the built-in file size checks of \code{\link{spod_download}}, \code{\link{spod_get}}, and \code{\link{spod_convert}}.** This function checks downloaded data files whether they are consistent with their checksums in Amazon S3 by computing ETag for each file. This involves computing MD5 for each part of the file and concatenating them and computing MD5 again on the resulting concatenated MD5s. This may take very long time if you check all files, so use with caution. +#' @inheritParams spod_get +#' @inheritParams spod_download +#' @inheritParams global_quiet_param +#' @param n_threads Numeric. Number of threads to use for file verificaiton. Defaults to 1. When set to 2 or more threads, uses `future.mirai` as a backend for parallelization, resulting in significant (~4x) speedup, unless disk read speed is a bottleneck. +#' +#' @return A tibble similar to the output of `spod_available_data`, but with an extra column `local_file_consistent`, where `TRUE` indicates that the file cheksum matches the expected checksums in Amazon S3. Note: some v1 (2020-2021) files were not stored correctly on S3 and their ETag checksums are incorrectly reported by Amazon S3, so their true file sizes and ETag checksums were cached inside the `spanishoddata` package. 
+#' +#' @export +#' +#' @examplesIf interactive() +#' \donttest{ +#' spod_set_data_dir(tempdir()) +#' spod_download( +#' type = "number_of_trips", +#' zones = "distr", +#' dates = "2020-03-14" +#' ) +#' +#' # now check the consistency +#' check_results <- spod_check_files( +#' type = "number_of_trips", +#' zones = "distr", +#' dates = "2020-03-14" +#' ) +#' all(check_results$local_file_consistent) +#' } +spod_check_files <- function( + type = c( + "od", + "origin-destination", + "os", + "overnight_stays", + "nt", + "number_of_trips" + ), + zones = c( + "districts", + "dist", + "distr", + "distritos", + "municipalities", + "muni", + "municip", + "municipios", + "lua", + "large_urban_areas", + "gau", + "grandes_areas_urbanas" + ), + dates = NULL, + data_dir = spod_get_data_dir(), + quiet = FALSE, + ignore_missing_dates = FALSE, + n_threads = 1 +) { + # Validate inputs + checkmate::assert_choice( + type, + choices = c( + "od", + "origin-destination", + "os", + "overnight_stays", + "nt", + "number_of_trips" + ) + ) + checkmate::assert_choice( + zones, + choices = c( + "districts", + "dist", + "distr", + "distritos", + "municipalities", + "muni", + "municip", + "municipios", + "lua", + "large_urban_areas", + "gau", + "grandes_areas_urbanas" + ) + ) + checkmate::assert_directory_exists(data_dir, access = "r") + checkmate::assert_flag(quiet) + checkmate::assertNumber(n_threads, lower = 1) + + # normalise zones + zones <- spod_zone_names_en2es(zones) + + # simple null check is enough here, as spod_dates_arugument_to_dates_seq will do additional checks anyway + if (is.null(dates)) { + message( + "`dates` argument is undefined. Please set `dates='cached_v1'` or `dates='cached_v2'` to convert all data that was previously downloaded. Alternatively, specify at least one date between 2020-02-14 and 2021-05-09 (for v1 data) or between 2022-01-01 onwards (for v2). Any missing data will be downloaded before conversion. For more details on the dates argument, see ?spod_download." 
+ ) + } + + dates_to_use <- spod_dates_argument_to_dates_seq(dates = dates) + + # check version + ver <- spod_infer_data_v_from_dates( + dates = dates_to_use, + ignore_missing_dates = ignore_missing_dates + ) + + if (isFALSE(quiet)) { + message("Data version detected from dates: ", ver) + } + + # convert english data type names to spanish words used in the default data paths + type <- match.arg(type) + matched_type <- spod_match_data_type_for_local_folders(type = type, ver = ver) + + available_data <- spod_available_data( + ver = ver, + check_local_files = TRUE, + data_dir = data_dir, + quiet = quiet, + use_s3 = TRUE + ) + + if (all(as.character(dates) %in% c("cached_v1", "cached_v2"))) { + dates_to_use <- available_data |> + dplyr::filter(.data$downloaded == TRUE & !is.na(.data$data_ymd)) |> + dplyr::pull(.data$data_ymd) + available_data <- available_data |> + dplyr::filter(.data$downloaded == TRUE & !is.na(.data$data_ymd)) + } + + # match the available_data to type, zones, version and dates + if (ver == 1) { + requested_files <- available_data[ + # selecting districts files for v1 to avoid issues with municipalities # this is to address the bugs described in detail in: + # http://www.ekotov.pro/mitma-data-issues/issues/011-v1-tpp-mismatch-zone-ids-in-table-and-spatial-data.html + # http://www.ekotov.pro/mitma-data-issues/issues/012-v1-tpp-district-files-in-municipality-folders.html + # the decision was to use distrcit data and aggregate it to replicate municipal data + grepl( + glue::glue("v{ver}.*{matched_type}.*distritos"), + available_data$local_path + ) & + available_data$data_ymd %in% dates_to_use, + ] + } else if (ver == 2) { + requested_files <- available_data[ + grepl( + glue::glue("v{ver}.*{zones}.*{matched_type}"), + available_data$local_path + ) & + available_data$data_ymd %in% dates_to_use, + ] + } + + # if some requested files are missing issue a warning + if (!all(requested_files$downloaded)) { + dates_missing_downloads <- requested_files |> + 
dplyr::filter(.data$downloaded == FALSE) |> + dplyr::filter(.data$data_ymd %in% dates_to_use) |> + dplyr::pull(.data$data_ymd) + warning(glue::glue( + 'Some files for the requested dates are missing ({paste(spod_convert_dates_to_ranges(dates_missing_downloads), collapse = ", ")}). Make sure you have downloaded all files requested to be checked for consistency with `spod_download()`. For now, `spod_check_files()` will only check the files that were previously downloaded and currently exist on disk.', + )) + requested_files <- requested_files |> + dplyr::filter(.data$downloaded == TRUE) + } + + # compute ETag for each file + if (n_threads == 1) { + local_etags <- requested_files$local_path |> + purrr::map_chr(~ spod_compute_s3_etag(.x), .progress = TRUE) + } else if (n_threads > 1) { + spod_assert_package(c("future", "furrr", "future.mirai")) + with( + future::plan(future.mirai::mirai_multisession, workers = n_threads), + local = TRUE + ) + local_etags <- requested_files$local_path |> + furrr::future_map_chr( + ~ spod_compute_s3_etag(.x), + .progress = TRUE, + future.seed = TRUE + ) + } + + # compare ETags + requested_files <- requested_files |> + dplyr::mutate( + local_etag = local_etags, + ) |> + dplyr::mutate( + local_file_consistent = dplyr::if_else( + condition = .data$local_etag == .data$etag, + true = TRUE, + false = FALSE, + missing = FALSE + ) + ) + + # issue a warning if there are mismatches or inform that everything is ok + if (isFALSE(quiet)) { + if (!all(requested_files$local_file_consistent)) { + broken_dates <- requested_files |> + dplyr::filter(.data$local_file_consistent == FALSE) |> + dplyr::pull(.data$data_ymd) + warning(glue::glue( + 'Some files are inconsistent with the reference data ({paste(spod_convert_dates_to_ranges(broken_dates), collapse = ", ")}). Please inspect the returned table by filtering the output table by "local_file_consistent == FALSE". To re-download the inconsistent files, use `spod_download()` and specify the missing dates.' 
+ )) + } else { + message("All checked files are consistent.") + } + } + + return(requested_files) +} + +#' Compute ETag for a file +#' @param file_path Character. The path to the file. +#' @param part_size Numeric. The size of each part in bytes. Do not change, as this is a default for S3 Etag. +#' @return Character. The ETag for the file. +#' @keywords internal +spod_compute_s3_etag <- function(file_path, part_size = 8 * 1024^2) { + con <- file(file_path, "rb") + on.exit(close(con)) + + # MD5 each part + part_md5s <- list() + repeat { + buf <- readBin(con, "raw", n = part_size) + if (length(buf) == 0) { + break + } + part_md5s[[length(part_md5s) + 1]] <- digest::digest( + buf, + algo = "md5", + serialize = FALSE, + raw = TRUE + ) + } + + # Single‐part fallback + if (length(part_md5s) == 1) { + return(digest::digest(file = file_path, algo = "md5", serialize = FALSE)) + } + + # Concatenate raw MD5s and MD5 again + combined <- do.call(c, part_md5s) + final_raw <- digest::digest( + combined, + algo = "md5", + serialize = FALSE, + raw = TRUE + ) + raw_to_hex <- function(r) paste(sprintf("%02x", as.integer(r)), collapse = "") + etag_hex <- raw_to_hex(final_raw) + paste0(etag_hex, "-", length(part_md5s)) +} diff --git a/R/data-dir.R b/R/data-dir.R index e23c2775..ca581d77 100644 --- a/R/data-dir.R +++ b/R/data-dir.R @@ -1,66 +1,81 @@ #' Set the data directory -#' +#' #' @description -#' +#' #' `r lifecycle::badge("stable")` -#' +#' #' This function sets the data directory in the environment variable SPANISH_OD_DATA_DIR, so that all other functions in the package can access the data. It also creates the directory if it doesn't exist. -#' +#' #' @param data_dir The data directory to set. #' @inheritParams global_quiet_param #' @return Nothing. If quiet is `FALSE`, prints a message with the path and confirmation that the path exists. 
#' @export #' @examples #' spod_set_data_dir(tempdir()) -#' +#' spod_set_data_dir <- function( data_dir, quiet = FALSE -){ +) { checkmate::assert_character(data_dir, len = 1, null.ok = FALSE) checkmate::assert_flag(quiet) data_dir_abs_path <- fs::path_abs(data_dir) - - tryCatch({ - # Check if the directory exists; if not, attempt to create it - if (!dir.exists(data_dir_abs_path)) { - if(quiet == FALSE){ - message("Data directory ", data_dir_abs_path, " does not exist. Attempting to create it.") + + tryCatch( + { + # Check if the directory exists; if not, attempt to create it + if (!dir.exists(data_dir_abs_path)) { + if (quiet == FALSE) { + message( + "Data directory ", + data_dir_abs_path, + " does not exist. Attempting to create it." + ) + } + fs::dir_create(data_dir_abs_path, recurse = TRUE) } - fs::dir_create(data_dir_abs_path, recurse = TRUE) - } - data_dir_real_path <- fs::path_real(data_dir_abs_path) - # Check for write permissions - test_file <- fs::path(data_dir_real_path, ".test_write") - file.create(test_file) - fs::file_delete(test_file) - if(quiet == FALSE){ - message("Data directory is writeable.") - } - - # Set the environment variable - Sys.setenv(SPANISH_OD_DATA_DIR = data_dir_real_path) - - if(quiet == FALSE){ - message("Data directory successfully set to: ", data_dir_real_path) + + # Check for write permissions + test_file <- fs::path(data_dir_abs_path, ".test_write") + file.create(test_file) + fs::file_delete(test_file) + if (quiet == FALSE) { + message("Data directory is writeable.") + } + + # Set the environment variable + Sys.setenv(SPANISH_OD_DATA_DIR = data_dir_abs_path) + + if (quiet == FALSE) { + message("Data directory successfully set to: ", data_dir_abs_path) + } + }, + error = function(e) { + message( + "Error: Unable to create or access the directory at '", + data_dir_abs_path, + "'." + ) + message( + "This may be due to write access restrictions or system permissions issues." 
+ ) + message( + "Please verify that you have write permissions for the specified path and try again." + ) + stop(e) # Re-throw the error for debugging purposes, if needed } - }, error = function(e) { - message("Error: Unable to create or access the directory at '", data_dir_abs_path, "'.") - message("This may be due to write access restrictions or system permissions issues.") - message("Please verify that you have write permissions for the specified path and try again.") - stop(e) # Re-throw the error for debugging purposes, if needed - }) - + ) + return(invisible(TRUE)) } #' Get the data directory #' #' @description -#' +#' #' `r lifecycle::badge("stable")` -#' +#' #' This function retrieves the data directory from the environment variable SPANISH_OD_DATA_DIR. #' If the environment variable is not set, it returns the temporary directory. #' @inheritParams global_quiet_param @@ -69,12 +84,15 @@ spod_set_data_dir <- function( #' @examples #' spod_set_data_dir(tempdir()) #' spod_get_data_dir() -#' +#' spod_get_data_dir <- function(quiet = FALSE) { checkmate::assert_flag(quiet) data_dir_env <- Sys.getenv("SPANISH_OD_DATA_DIR") if (data_dir_env == "") { - if (isFALSE(quiet)) warning("Warning: SPANISH_OD_DATA_DIR is not set. Using the temporary directory, which is not recommended, as the data will be deleted when the session ends.\n\n To set the data directory, use `Sys.setenv(SPANISH_OD_DATA_DIR = '/path/to/data')` or set SPANISH_OD_DATA_DIR permanently in the environment by editing the `.Renviron` file locally for current project with `usethis::edit_r_environ('project')` or `file.edit('.Renviron')` or globally for all projects with `usethis::edit_r_environ('user')` or `file.edit('~/.Renviron')`.") + if (isFALSE(quiet)) + warning( + "Warning: SPANISH_OD_DATA_DIR is not set. 
Using the temporary directory, which is not recommended, as the data will be deleted when the session ends.\n\n To set the data directory, use `Sys.setenv(SPANISH_OD_DATA_DIR = '/path/to/data')` or set SPANISH_OD_DATA_DIR permanently in the environment by editing the `.Renviron` file locally for current project with `usethis::edit_r_environ('project')` or `file.edit('.Renviron')` or globally for all projects with `usethis::edit_r_environ('user')` or `file.edit('~/.Renviron')`." + ) data_dir_env <- tempdir() # if not set, use the temp directory } # check if dir exists and create it if it doesn't @@ -82,6 +100,6 @@ spod_get_data_dir <- function(quiet = FALSE) { if (!dir.exists(data_dir_env_abs)) { fs::dir_create(data_dir_env_abs, recurse = TRUE) } - data_dir_env_real <- fs::path_real(data_dir_env_abs) + data_dir_env_real <- fs::path_abs(data_dir_env_abs) return(data_dir_env_real) } diff --git a/R/dev-tools.R b/R/dev-tools.R index e8ccf1dc..1b08235f 100644 --- a/R/dev-tools.R +++ b/R/dev-tools.R @@ -3,34 +3,36 @@ #' Get files sizes for remote files of v1 and v2 data and save them into a csv.gz file in the inst/extdata folder. #' @param ver The version of the data (1 or 2). Can be both. Defaults to 2, as v1 data is not being updated since 2021. #' @return Nothing. Only saves a csv.gz file with up to date file sizes in the inst/extdata folder. 
-#' +#' #' @keywords internal -#' +#' spod_files_sizes <- function(ver = 2) { data_dir <- spod_get_data_dir() - - if (any(ver %in% 1)){ + + if (any(ver %in% 1)) { v1 <- spod_available_data(1) - + # takes about 1 minute - future::plan(future::multisession, workers = 6) - v1$remote_file_size_mb <- furrr::future_map_dbl( + # tictoc::tic() + future::plan(future::multisession, workers = 8) + v1$true_remote_file_size_bytes <- furrr::future_map_dbl( .x = v1$target_url, .f = ~ spod_get_file_size_from_url(x_url = .x), .progress = TRUE ) future::plan(future::sequential) + # tictoc::toc() - v1_url_file_sizes <- v1[, c("target_url", "remote_file_size_mb")] - readr::write_csv( - x = v1_url_file_sizes, - file = "inst/extdata/url_file_sizes_v1.txt.gz" - ) + v1 <- v1 |> + dplyr::arrange(!!!dplyr::syms(c("data_ymd", "target_url"))) |> + dplyr::select(-"local_path") + + saveRDS(v1, file.path("inst", "extdata", "available_data_v1.rds")) } - if (any(ver %in% 2)){ + if (any(ver %in% 2)) { v2 <- spod_available_data(2) - if(all(v2$size_imputed == FALSE)){ + if (all(v2$size_imputed == FALSE)) { stop("all file sizes are known") } v2_known_size <- v2[v2$size_imputed == FALSE, ] @@ -52,7 +54,6 @@ spod_files_sizes <- function(ver = 2) { file = "inst/extdata/url_file_sizes_v2.txt.gz" ) } - } @@ -61,15 +62,31 @@ spod_files_sizes <- function(ver = 2) { #' @return File size in MB #' @importFrom utils URLencode #' @keywords internal -spod_get_file_size_from_url <- function(x_url){ - +spod_get_file_size_from_url <- function(x_url) { url <- utils::URLencode(x_url) headers <- curlGetHeaders(url) content_length_line <- grep("Content-Length", headers, value = TRUE) - content_length_value <- sub("Content-Length:\\s*(\\d+).*", "\\1", content_length_line) - - # Convert bytes to MB (1 MB = 1024 * 1024 bytes) - file_size_mb <- as.numeric(content_length_value) / (1024 * 1024) - - return(file_size_mb) + content_length_value <- sub( + "Content-Length:\\s*(\\d+).*", + "\\1", + content_length_line + ) |> + 
as.numeric() + + return(content_length_value) +} + +#' Get Etags for locally saved v1 data files and save them into a RDS file in the inst/extdata folder. +#' @return Returns a tibble with the local path, local ETag and remote ETag. +#' @keywords internal +spod_store_etags <- function() { + available_data <- spod_available_data(1, check_local_files = TRUE) + available_data <- available_data |> + dplyr::filter(.data$downloaded == TRUE) + local_etags <- available_data$local_path |> + purrr::map_chr(~ spod_compute_s3_etag(.x), .progress = TRUE) + available_data <- available_data |> + dplyr::mutate(local_etag = local_etags) |> + dplyr::as_tibble() + return(available_data) } diff --git a/R/download_data.R b/R/download_data.R index 140b5b53..26bccbef 100644 --- a/R/download_data.R +++ b/R/download_data.R @@ -1,9 +1,9 @@ #' Download the data files of specified type, zones, and dates #' #' @description -#' +#' #' `r lifecycle::badge("stable")` -#' +#' #' This function downloads the data files of the specified type, zones, dates and data version. #' @param type The type of data to download. Can be `"origin-destination"` (or ust `"od"`), or `"number_of_trips"` (or just `"nt"`) for v1 data. For v2 data `"overnight_stays"` (or just `"os"`) is also available. More data types to be supported in the future. See codebooks for v1 and v2 data in vignettes with `spod_codebook(1)` and `spod_codebook(2)` (\link{spod_codebook}). #' @param zones The zones for which to download the data. Can be `"districts"` (or `"dist"`, `"distr"`, or the original Spanish `"distritos"`) or `"municipalities"` (or `"muni"`, `"municip"`, or the original Spanish `"municipios"`) for both data versions. Additionaly, these can be `"large_urban_areas"` (or `"lua"`, or the original Spanish `"grandes_areas_urbanas"`, or `"gau"`) for v2 data (2022 onwards). @@ -12,17 +12,18 @@ #' @param max_download_size_gb The maximum download size in gigabytes. Defaults to 1. #' @param return_local_file_paths Logical. 
If `TRUE`, the function returns a character vector of the paths to the downloaded files. If `FALSE`, the function returns `NULL`. #' @param ignore_missing_dates Logical. If `TRUE`, the function will not raise an error if the some of the specified dates are missing. Any dates that are missing will be skipped, however the data for any valid dates will be acquired. Defaults to `FALSE`. +#' @param check_local_files Logical. Whether to check the file size of local files against known remote file sizes on the Amazon S3 storage. Defaults to `TRUE`, which fetches the metadata from Amazon S3. This setting ensures your downloaded files are not broken, so it is recommended to keep it `TRUE`. #' @inheritParams global_quiet_param -#' +#' #' @return Nothing. If `return_local_file_paths = TRUE`, a `character` vector of the paths to the downloaded files. #' #' @export #' @examplesIf interactive() #' \donttest{ -#' +#' #' # Set data dir for file downloads #' spod_set_data_dir(tempdir()) -#' +#' #' # Download the number of trips on district level for the a date range in March 2020 #' spod_download( #' type = "number_of_trips", zones = "districts", @@ -42,33 +43,67 @@ #' dates = "2020032[0-4]" #' ) #' } -#' +#' spod_download <- function( - type = c( - "od", "origin-destination", - "os", "overnight_stays", - "nt", "number_of_trips" - ), - zones = c( - "districts", "dist", "distr", "distritos", - "municipalities", "muni", "municip", "municipios", - "lua", "large_urban_areas", "gau", "grandes_areas_urbanas" - ), - dates = NULL, - max_download_size_gb = 1, # 1GB - data_dir = spod_get_data_dir(), - quiet = FALSE, - return_local_file_paths = FALSE, - ignore_missing_dates = FALSE - ) { - + type = c( + "od", + "origin-destination", + "os", + "overnight_stays", + "nt", + "number_of_trips" + ), + zones = c( + "districts", + "dist", + "distr", + "distritos", + "municipalities", + "muni", + "municip", + "municipios", + "lua", + "large_urban_areas", + "gau", + "grandes_areas_urbanas" + ), + dates 
= NULL, + max_download_size_gb = 1, # 1GB + data_dir = spod_get_data_dir(), + quiet = FALSE, + return_local_file_paths = FALSE, + ignore_missing_dates = FALSE, + check_local_files = TRUE +) { # Validate inputs - checkmate::assert_choice(type, choices = c("od", "origin-destination", "os", "overnight_stays", "nt", "number_of_trips")) - checkmate::assert_choice(zones, choices = c( - "districts", "dist", "distr", "distritos", - "municipalities", "muni", "municip", "municipios", - "lua", "large_urban_areas", "gau", "grandes_areas_urbanas" - )) + checkmate::assert_choice( + type, + choices = c( + "od", + "origin-destination", + "os", + "overnight_stays", + "nt", + "number_of_trips" + ) + ) + checkmate::assert_choice( + zones, + choices = c( + "districts", + "dist", + "distr", + "distritos", + "municipalities", + "muni", + "municip", + "municipios", + "lua", + "large_urban_areas", + "gau", + "grandes_areas_urbanas" + ) + ) checkmate::assert_number(max_download_size_gb, lower = 0.1) checkmate::assert_directory_exists(data_dir, access = "rw") checkmate::assert_flag(quiet) @@ -80,16 +115,20 @@ spod_download <- function( # simple null check is enough here, as spod_dates_arugument_to_dates_seq will do additional checks anyway if (is.null(dates)) { - message("`dates` argument is undefined. Please set `dates='cached_v1'` or `dates='cached_v2'` to convert all data that was previously downloaded. Alternatively, specify at least one date between 2020-02-14 and 2021-05-09 (for v1 data) or between 2022-01-01 onwards (for v2). Any missing data will be downloaded before conversion. For more details on the dates argument, see ?spod_download.") + message( + "`dates` argument is undefined. Please set `dates='cached_v1'` or `dates='cached_v2'` to convert all data that was previously downloaded. Alternatively, specify at least one date between 2020-02-14 and 2021-05-09 (for v1 data) or between 2022-01-01 onwards (for v2). Any missing data will be downloaded before conversion. 
For more details on the dates argument, see ?spod_download." + ) } dates_to_use <- spod_dates_argument_to_dates_seq(dates = dates) - # check version - ver <- spod_infer_data_v_from_dates(dates = dates_to_use, ignore_missing_dates = ignore_missing_dates) + ver <- spod_infer_data_v_from_dates( + dates = dates_to_use, + ignore_missing_dates = ignore_missing_dates + ) # this leads to a second call to an internal spod_get_valid_dates() which in turn causes a second call to spod_available_data(). This results in reading xml files with metadata for the second time. This is not optimal and should be fixed. - + if (isFALSE(quiet)) { message("Data version detected from dates: ", ver) } @@ -98,13 +137,14 @@ spod_download <- function( type <- match.arg(type) type <- spod_match_data_type_for_local_folders(type = type, ver = ver) - - # get the available data list while checking for files already cached on disk + # TODO: make requests faster by providing filtering prefix for Amazon S3 to only get the files we need? 
available_data <- spod_available_data( ver = ver, - check_local_files = TRUE, - data_dir = data_dir + check_local_files = check_local_files, + data_dir = data_dir, + quiet = quiet, + use_s3 = TRUE ) # match the available_data to type, zones, version and dates @@ -114,7 +154,10 @@ spod_download <- function( # http://www.ekotov.pro/mitma-data-issues/issues/011-v1-tpp-mismatch-zone-ids-in-table-and-spatial-data.html # http://www.ekotov.pro/mitma-data-issues/issues/012-v1-tpp-district-files-in-municipality-folders.html # the decision was to use distrcit data and aggregate it to replicate municipal data - grepl(glue::glue("v{ver}.*{type}.*distritos"), available_data$local_path) & + grepl( + glue::glue("v{ver}.*{type}.*distritos"), + available_data$local_path + ) & available_data$data_ymd %in% dates_to_use, ] } else if (ver == 2) { @@ -124,43 +167,88 @@ spod_download <- function( ] } - files_to_download <- requested_files[!requested_files$downloaded, ] + # compare file sizes + requested_files <- requested_files |> + dplyr::mutate( + complete_download = dplyr::if_else( + condition = .data$file_size_bytes == as.numeric(.data$local_file_size), + true = TRUE, + false = FALSE, + missing = FALSE + ) + ) + + files_to_download <- requested_files |> + dplyr::filter(.data$complete_download == FALSE) # only download files if some are missing if (nrow(files_to_download) > 0) { - total_size_to_download_gb <- round(sum(files_to_download$remote_file_size_mb / 1024, na.rm = TRUE), 4) + total_size_to_download_gb <- round( + sum(files_to_download$remote_file_size_mb / 1024, na.rm = TRUE), + 4 + ) # warn if more than 1 GB is to be downloaded - if( total_size_to_download_gb > max_download_size_gb) { - message(glue::glue("Approximately {total_size_to_download_gb} GB of data will be downloaded.")) + if (total_size_to_download_gb > max_download_size_gb) { + message(glue::glue( + "Approximately {round(total_size_to_download_gb, 2)} GB of data will be downloaded." 
+ )) # ask for confirmation - response <- readline(prompt = "Are you sure you would like to continue with this download? (yes/no) ") + response <- readline( + prompt = "Are you sure you would like to continue with this download? (yes/no) " + ) response <- tolower(response) %in% c("y", "yes", "Yes") if (!response) { - message(glue::glue("Exiting without downloading missing files by user request.")) + message(glue::glue( + "Exiting without downloading missing files by user request. Requested data download size is larger than {max_download_size_gb} GB. Please increase `max_download_size_gb` parameter when running the function again." + )) return() } } if (isFALSE(quiet)) { - message(glue::glue("Downloading approximately {total_size_to_download_gb} GB of data.")) + message(glue::glue( + "Downloading approximately {round(total_size_to_download_gb, 2)} GB of data." + )) } - + # pre-generate target paths for the files to download - fs::dir_create( - unique(fs::path_dir(files_to_download$local_path)), - recurse = TRUE - ) + # and create all directories in the path + # fs::dir_create( + # unique(fs::path_dir(files_to_download$local_path)), + # recurse = TRUE + # ) # download the missing files - downloaded_files <- curl::multi_download( - urls = files_to_download$target_url, - destfiles = files_to_download$local_path, - progress = TRUE, - resume = TRUE + # TODO: disable mass curl::multi_download due to multiple failures on some connections + # downloaded_files <- curl::multi_download( + # urls = files_to_download$target_url, + # destfiles = files_to_download$local_path, + # progress = TRUE, + # resume = TRUE, + # multiplex = FALSE + # ) + + # use curl::multi_download in a loop on one file at a time with manual progress bar + downloaded_files <- spod_download_in_batches( + files_to_download ) + # downloaded_files <- spod_multi_download_with_progress( + # files_to_download + # ) # set download status for downloaded files as TRUE in requested_files - 
requested_files$downloaded[requested_files$local_path %in% downloaded_files$destfile] <- TRUE + # update the columns in requested_files to have new local file size, downloaded and download complete status columns + requested_files <- requested_files |> + dplyr::rows_update( + downloaded_files |> + dplyr::select( + .data$local_path, + .data$local_file_size, + .data$downloaded, + .data$complete_download + ), + by = "local_path" + ) if (isFALSE(quiet)) { message("Retrieved data for requested dates.") @@ -171,3 +259,453 @@ spod_download <- function( return(requested_files$local_path) } } + +#' Download multiple files with progress bar sequentially +#' +#' @description +#' Download multiple files with a progress bar. Retries failed downloads up to 3 times. +#' +#' @param files_to_download A data frame with columns `target_url`, `local_path` and `file_size_bytes`. +#' @param chunk_size Number of bytes to download at a time. +#' @param bar_width Width of the progress bar. +#' @param show_progress Whether to show the progress bar. +#' +#' @return A data frame with columns `target_url`, `local_path`, `file_size_bytes` and `local_file_size`. 
+#' +#' @keywords internal +#' +spod_multi_download_with_progress <- function( + files_to_download, + chunk_size = 1024 * 1024, + bar_width = 20, + show_progress = interactive() && !isTRUE(getOption("knitr.in.progress")) +) { + if (!interactive() || isTRUE(getOption("knitr.in.progress"))) { + show_progress <- FALSE + } + + # Sort and create directories + files_to_download <- files_to_download[order(files_to_download$data_ymd), ] + dirs <- unique(dirname(files_to_download$local_path)) + for (d in dirs) { + if (!dir.exists(d)) { + dir.create(d, recursive = TRUE, showWarnings = FALSE) + } + } + + total_files <- nrow(files_to_download) + total_expected_bytes <- sum(files_to_download$file_size_bytes, na.rm = TRUE) + total_gb <- total_expected_bytes / 2^30 + + cum_bytes <- 0L + files_counted <- 0L + + if (!"downloaded" %in% names(files_to_download)) { + files_to_download$downloaded <- FALSE + } + if (!"complete_download" %in% names(files_to_download)) { + files_to_download$complete_download <- FALSE + } + + # Helper: ETA formatter + format_eta <- function(eta_secs) { + if (is.na(eta_secs) || eta_secs < 0) { + return("--") + } + if (eta_secs > 3600) { + return(sprintf("%.1fh", eta_secs / 3600)) + } else if (eta_secs > 60) { + return(sprintf("%.0fm", eta_secs / 60)) + } else { + return(sprintf("%.0fs", eta_secs)) + } + } + + # Initial redraw function + if (show_progress) { + redraw_bar <- function(date_str, bytes_so_far, file_bytes = 0L) { + pct <- bytes_so_far / total_expected_bytes + nfill <- floor(pct * bar_width) + bar <- if (nfill < bar_width) { + paste0(strrep("=", nfill), ">", strrep(" ", bar_width - nfill - 1)) + } else { + strrep("=", bar_width) + } + + elapsed <- as.numeric(Sys.time() - start_time, units = "secs") + speed <- (bytes_so_far) / max(elapsed, 0.1) # bytes/sec + speed_mb <- speed / 2^20 + eta <- (total_expected_bytes - bytes_so_far) / speed + eta_str <- format_eta(eta) + + msg <- sprintf( + "Downloading: %s [%s] %3.0f%% (%d/%d files, %.2f/%.2f GB, %.1f 
MB/s, ETA: %s)", + date_str, + bar, + pct * 100, + files_counted, + total_files, + bytes_so_far / 2^30, + total_gb, + speed_mb, + eta_str + ) + cat(sprintf("\r%-120s", msg)) + utils::flush.console() + } + + start_time <- Sys.time() + redraw_bar("----", 0) + } + + # Download loop + for (i in seq_len(total_files)) { + if (!is.na(files_to_download$data_ymd[i])) { + date_str <- format(files_to_download$data_ymd[i], "%Y-%m-%d") + } else { + date_str <- basename(files_to_download$local_path[i]) + } + url <- files_to_download$target_url[i] + dest <- files_to_download$local_path[i] + exp_bytes <- files_to_download$file_size_bytes[i] + + local_sz <- if (file.exists(dest)) file.info(dest)$size else NA_real_ + if (!is.na(local_sz) && local_sz == exp_bytes) { + cum_bytes <- cum_bytes + local_sz + files_counted <- files_counted + 1L + files_to_download$local_file_size[i] <- local_sz + files_to_download$downloaded[i] <- TRUE + files_to_download$complete_download[i] <- TRUE + if (show_progress) { + redraw_bar(date_str, cum_bytes) + } + next + } + + success <- FALSE + actual_sz <- 0L + for (attempt in 1:3) { + file_bytes <- 0L + con_in <- url(url, "rb") + con_out <- file(dest, "wb") + + repeat { + chunk <- readBin(con_in, "raw", n = chunk_size) + if (length(chunk) == 0) { + break + } + writeBin(chunk, con_out) + file_bytes <- file_bytes + length(chunk) + + if (show_progress) { + redraw_bar(date_str, cum_bytes + file_bytes, file_bytes) + } + } + + close(con_in) + close(con_out) + actual_sz <- file.info(dest)$size + + if (identical(actual_sz, exp_bytes)) { + success <- TRUE + break + } else if (attempt == 1) { + warning( + sprintf( + "Size mismatch on %s (expected %d, got %d). Retrying...", + date_str, + exp_bytes, + actual_sz + ), + call. = FALSE + ) + } + } + + if (!success) { + warning( + sprintf( + "After retry, %s still mismatched: expected %d, got %d. Proceeding.", + date_str, + exp_bytes, + actual_sz + ), + call. 
= FALSE + ) + } + + cum_bytes <- cum_bytes + actual_sz + files_counted <- files_counted + 1L + if (!"local_file_size" %in% names(files_to_download)) { + files_to_download$local_file_size <- NA_real_ + } + files_to_download$local_file_size[i] <- actual_sz + files_to_download$downloaded[i] <- TRUE + files_to_download$complete_download[i] <- identical(actual_sz, exp_bytes) + + if (show_progress) redraw_bar(date_str, cum_bytes) + } + + if (show_progress) { + cat("\nAll downloads complete.\n") + } + return(files_to_download) +} + +#' Download multiple files with progress bar in parallel +#' +#' @description +#' Download multiple files with a progress bar. Retries failed downloads up to 3 times. Downloads are in parallel and in batches to show progress. First 10 Mb of a file is downloaded to check the speed. +#' +#' @param files_to_download A data frame with columns `target_url`, `local_path` and `file_size_bytes`. +#' @param batch_size Numeric. Number of files to download at a time. +#' @param bar_width Numeric. Width of the progress bar. +#' @param chunk_size Numeric. Number of bytes to download at a time for speed test. +#' @param show_progress Logical. Whether to show the progress bar. +#' @param max_retries Integer. Maximum number of retries for failed downloads. +#' @param timeout Numeric. Timeout in seconds for each download. +#' @return A data frame with columns `target_url`, `local_path`, `file_size_bytes` and `local_file_size`. 
+#' +#' @keywords internal +#' +spod_download_in_batches <- function( + files_to_download, + batch_size = 5, + bar_width = 20, + chunk_size = 1024 * 1024, + test_size = 10 * 1024 * 1024, # 10 MB test + max_retries = 3L, + timeout = 900, + show_progress = interactive() && !isTRUE(getOption("knitr.in.progress")) +) { + # Check interactive context + if (!interactive() || isTRUE(getOption("knitr.in.progress"))) { + show_progress <- FALSE + } + + original_timeout <- getOption("timeout") + options(timeout = timeout) + on.exit(options(timeout = original_timeout)) + + # Sort and ensure directories exist + files_to_download <- files_to_download[order(files_to_download$data_ymd), ] + dirs <- unique(dirname(files_to_download$local_path)) + for (d in dirs) { + if (!dir.exists(d)) dir.create(d, recursive = TRUE, showWarnings = FALSE) + } + + # Totals for progress + total_files <- nrow(files_to_download) + total_expected_bytes <- sum(files_to_download$file_size_bytes, na.rm = TRUE) + total_gb <- total_expected_bytes / 2^30 + + # Ensure tracking columns + if (!"downloaded" %in% names(files_to_download)) { + files_to_download$downloaded <- FALSE + } + if (!"local_file_size" %in% names(files_to_download)) { + files_to_download$local_file_size <- NA_integer_ + } + if (!"complete_download" %in% names(files_to_download)) { + files_to_download$complete_download <- FALSE + } + + # Pre-skip already-completed files + cum_bytes <- 0L + files_counted <- 0L + to_download <- logical(total_files) + for (i in seq_len(total_files)) { + expb <- files_to_download$file_size_bytes[i] + dest <- files_to_download$local_path[i] + if (isTRUE(files_to_download$complete_download[i])) { + actual <- if (file.exists(dest)) file.info(dest)$size else NA_integer_ + if (!is.na(actual) && actual == expb) { + cum_bytes <- cum_bytes + expb + files_counted <- files_counted + 1L + to_download[i] <- FALSE + next + } else { + files_to_download$complete_download[i] <- FALSE + } + } + if ( + 
isTRUE(files_to_download$downloaded[i]) && + !is.na(files_to_download$local_file_size[i]) && + files_to_download$local_file_size[i] == expb + ) { + files_to_download$complete_download[i] <- TRUE + cum_bytes <- cum_bytes + expb + files_counted <- files_counted + 1L + to_download[i] <- FALSE + } else if (file.exists(dest) && file.info(dest)$size == expb) { + files_to_download$downloaded[i] <- TRUE + files_to_download$complete_download[i] <- TRUE + files_to_download$local_file_size[i] <- expb + cum_bytes <- cum_bytes + expb + files_counted <- files_counted + 1L + to_download[i] <- FALSE + } else { + to_download[i] <- TRUE + } + } + + # ETA formatter + format_eta <- function(eta) { + if (is.na(eta) || eta <= 0 || !is.finite(eta)) { + return("--") + } + if (eta > 3600) { + sprintf("%.1fh", eta / 3600) + } else if (eta > 60) { + sprintf("%.0fm", eta / 60) + } else { + sprintf("%.0fs", eta) + } + } + + # Progress bar redraw + redraw_bar <- function(bytes_done, speed_bytes = NULL) { + pct <- bytes_done / total_expected_bytes + nfill <- max(floor(pct * bar_width), 1L) + bar <- if (nfill < bar_width) { + paste0(strrep("=", nfill), ">", strrep(" ", bar_width - nfill - 1)) + } else { + strrep("=", bar_width) + } + elapsed <- as.numeric(Sys.time() - start_time, "secs") + speed_bps <- if (!is.null(speed_bytes)) { + speed_bytes + } else { + (bytes_done / max(elapsed, 0.1)) + } + speed_mb <- speed_bps / 2^20 + eta_secs <- if (speed_bps > 0) { + (total_expected_bytes - bytes_done) / speed_bps + } else { + NA + } + eta <- format_eta(eta_secs) + + msg <- sprintf( + "Downloading: [%s] %3.0f%% (%d/%d files, %.2f/%.2f GB, %.1f MB/s, ETA: %s)", + bar, + pct * 100, + files_counted, + total_files, + bytes_done / 2^30, + total_gb, + speed_mb, + eta + ) + cat(sprintf("\r%-120s", msg)) + utils::flush.console() + } + + # Speed test with smallest file, read up to test_size without saving + if (show_progress) { + start_time <- Sys.time() + redraw_bar(cum_bytes) + rem_idx <- which(to_download) + 
if (length(rem_idx) > 0) { + sizes <- files_to_download$file_size_bytes[rem_idx] + first_i <- rem_idx[which.min(sizes)] + url1 <- files_to_download$target_url[first_i] + + bytes_read <- 0L + t0 <- Sys.time() + con <- url(url1, "rb") + repeat { + to_read <- min(chunk_size, test_size - bytes_read) + if (to_read <= 0) { + break + } + chunk <- readBin(con, "raw", n = to_read) + if (length(chunk) == 0) { + break + } + bytes_read <- bytes_read + length(chunk) + } + close(con) + t1 <- Sys.time() + + dt <- as.numeric(t1 - t0, "secs") + bps <- if (dt > 0) bytes_read / dt else NA + redraw_bar(cum_bytes, speed_bytes = bps) + } + } + + # Prepare batches of all to_download (including test file) + rem <- which(to_download) + idx_batches <- split(rem, ceiling(seq_along(rem) / batch_size)) + + # Download batches in parallel via libcurl + for (batch in idx_batches) { + urls <- files_to_download$target_url[batch] + dests <- files_to_download$local_path[batch] + + res <- utils::download.file( + url = urls, + destfile = dests, + method = "libcurl", + mode = "wb", + quiet = TRUE + ) + if (length(res) == 1L) { + res <- rep(res, length(batch)) + } + + # Retry on size mismatch or initial failure + for (k in seq_along(batch)) { + i <- batch[k] + expected <- files_to_download$file_size_bytes[i] + dest <- dests[k] + # if initial status not ok or size mismatch + if ( + res[k] != 0L || !file.exists(dest) || file.info(dest)$size != expected + ) { + attempts <- 1L + while (attempts < max_retries) { + attempts <- attempts + 1L + status2 <- utils::download.file( + url = urls[k], + destfile = dest, + method = "libcurl", + mode = "wb", + quiet = TRUE + ) + actual <- if (file.exists(dest)) file.info(dest)$size else NA_integer_ + if (status2 == 0L && identical(actual, expected)) { + res[k] <- 0L + break + } + } + } + } + + for (k in seq_along(batch)) { + i <- batch[k] + if (res[k] == 0L && file.exists(dests[k])) { + sz <- file.info(dests[k])$size + files_to_download$downloaded[i] <- TRUE + 
files_to_download$complete_download[i] <- TRUE + files_to_download$local_file_size[i] <- sz + cum_bytes <- cum_bytes + sz + } else { + warning(sprintf( + "Failed to download %s after %d attempts (got %s bytes, expected %s)", + urls[k], + max_retries, + if (file.exists(dests[k])) file.info(dests[k])$size else NA, + files_to_download$file_size_bytes[i] + )) + } + files_counted <- files_counted + 1L + if (show_progress) redraw_bar(cum_bytes) + } + } + + if (show_progress) { + cat("\nAll downloads complete.\n") + } + files_to_download +} diff --git a/R/get-zones.R b/R/get-zones.R index 77207be2..f4920926 100644 --- a/R/get-zones.R +++ b/R/get-zones.R @@ -1,15 +1,15 @@ #' Get zones -#' +#' #' @description -#' +#' #' `r lifecycle::badge("stable")` -#' +#' #' Get spatial zones for the specified data version. Supports both v1 (2020-2021) and v2 (2022 onwards) data. -#' +#' #' @inheritParams spod_download #' @inheritParams spod_available_data #' @return An `sf` object (Simple Feature collection). -#' +#' #' The columns for v1 (2020-2021) data include: #' \describe{ #' \item{id}{A character vector containing the unique identifier for each district, assigned by the data provider. This `id` matches the `id_origin`, `id_destination`, and `id` in district-level origin-destination and number of trips data.} @@ -20,7 +20,7 @@ #' \item{district_ids_in_v2/municipality_ids_in_v2}{A string with semicolon-separated district identifiers (from the v2 version of this data) corresponding to each district `id` in v1.} #' \item{geometry}{A `MULTIPOLYGON` column containing the spatial geometry of each district, stored as an sf object. 
The geometry is projected in the ETRS89 / UTM zone 30N coordinate reference system (CRS), with XY dimensions.} #' } -#' +#' #' The columns for v2 (2022 onwards) data include: #' \describe{ #' \item{id}{A character vector containing the unique identifier for each zone, assigned by the data provider.} @@ -34,50 +34,80 @@ #' \item{district_ids_in_v1/municipality_ids_in_v1}{A string with semicolon-separated district identifiers from v1 data corresponding to each district in v2. If no match exists, it is marked as `NA`.} #' \item{geometry}{A `MULTIPOLYGON` column containing the spatial geometry of each district, stored as an sf object. The geometry is projected in the ETRS89 / UTM zone 30N coordinate reference system (CRS), with XY dimensions.} #' } -#' +#' #' @export #' @examplesIf interactive() #' \donttest{ #' # get polygons for municipalities for the v2 data #' municip_v2 <- spod_get_zones(zones = "municipalities", ver = 2) -#' +#' #' # get polygons for the districts for the v1 data #' distr_v1 <- spod_get_zones(zones = "districts", ver = 1) #' } -#' +#' spod_get_zones <- function( zones = c( - "districts", "dist", "distr", "distritos", - "municipalities", "muni", "municip", "municipios", - "lua", "large_urban_areas", "gau", "grandes_areas_urbanas" + "districts", + "dist", + "distr", + "distritos", + "municipalities", + "muni", + "municip", + "municipios", + "lua", + "large_urban_areas", + "gau", + "grandes_areas_urbanas" ), ver = NULL, data_dir = spod_get_data_dir(), quiet = FALSE ) { # Validate inputs - checkmate::assert_choice(zones, choices = c( - "districts", "dist", "distr", "distritos", - "municipalities", "muni", "municip", "municipios", - "lua", "large_urban_areas", "gau", "grandes_areas_urbanas" - )) + checkmate::assert_choice( + zones, + choices = c( + "districts", + "dist", + "distr", + "distritos", + "municipalities", + "muni", + "municip", + "municipios", + "lua", + "large_urban_areas", + "gau", + "grandes_areas_urbanas" + ) + ) 
checkmate::assertIntegerish(ver, max.len = 1) if (!ver %in% c(1, 2)) { - stop("Invalid version number. Must be 1 (for v1 2020-2021 data) or 2 (for v2 2022 onwards).") + stop( + "Invalid version number. Must be 1 (for v1 2020-2021 data) or 2 (for v2 2022 onwards)." + ) } - - + checkmate::assert_directory_exists(data_dir, access = "rw") checkmate::assert_flag(quiet) - + # normalise zones zones <- spod_zone_names_en2es(zones) - + if (ver == 1) { - zones_sf <- spod_get_zones_v1(zones = zones, data_dir = data_dir, quiet = quiet) + zones_sf <- spod_get_zones_v1( + zones = zones, + data_dir = data_dir, + quiet = quiet + ) } else if (ver == 2) { - zones_sf <- spod_get_zones_v2(zones = zones, data_dir = data_dir, quiet = quiet) + zones_sf <- spod_get_zones_v2( + zones = zones, + data_dir = data_dir, + quiet = quiet + ) } return(zones_sf) @@ -99,46 +129,65 @@ spod_get_zones <- function( #' } #' @keywords internal spod_get_zones_v1 <- function( - zones = c("districts", "dist", "distr", "distritos", "municipalities", "muni", "municip", "municipios"), + zones = c( + "districts", + "dist", + "distr", + "distritos", + "municipalities", + "muni", + "municip", + "municipios" + ), data_dir = spod_get_data_dir(), quiet = FALSE ) { -zones <- match.arg(zones) -zones <- spod_zone_names_en2es(zones) - -metadata <- spod_available_data(ver = 1, data_dir = data_dir, check_local_files = FALSE) - -# Ensure the raw data is downloaded and extracted -spod_download_zones_v1(zones, data_dir, quiet) - -# check if gpkg files are already saved and load them if available -expected_gpkg_path <- fs::path( - data_dir, - glue::glue(spod_subfolder_clean_data_cache(ver = 1), "/zones/{zones}_mitma.gpkg") -) -if (fs::file_exists(expected_gpkg_path)) { - if (isFALSE(quiet)) { - message("Loading .gpkg file that already exists in data dir: ", expected_gpkg_path) + zones <- match.arg(zones) + zones <- spod_zone_names_en2es(zones) + + metadata <- spod_available_data( + ver = 1, + data_dir = data_dir, + 
check_local_files = FALSE + ) + + # Ensure the raw data is downloaded and extracted + spod_download_zones_v1(zones, data_dir, quiet) + + # check if gpkg files are already saved and load them if available + expected_gpkg_path <- fs::path( + data_dir, + glue::glue( + spod_subfolder_clean_data_cache(ver = 1), + "/zones/{zones}_mitma.gpkg" + ) + ) + if (fs::file_exists(expected_gpkg_path)) { + if (isFALSE(quiet)) { + message( + "Loading .gpkg file that already exists in data dir: ", + expected_gpkg_path + ) + } + return(sf::read_sf(expected_gpkg_path)) } - return(sf::read_sf(expected_gpkg_path)) -} -zones_path <- fs::dir_ls( - path = fs::path(data_dir, spod_subfolder_raw_data_cache(ver = 1)), - glob = glue::glue("*v1**{zones}/*.shp"), - recurse = TRUE -) - -zones_sf <- spod_clean_zones_v1(zones_path, zones = zones) -fs::dir_create(fs::path_dir(expected_gpkg_path), recurse = TRUE) -sf::st_write( - zones_sf, - expected_gpkg_path, - delete_dsn = TRUE, - delete_layer = TRUE -) - -return(zones_sf) + zones_path <- fs::dir_ls( + path = fs::path(data_dir, spod_subfolder_raw_data_cache(ver = 1)), + glob = glue::glue("*v1**{zones}/*.shp"), + recurse = TRUE + ) + + zones_sf <- spod_clean_zones_v1(zones_path, zones = zones) + fs::dir_create(fs::path_dir(expected_gpkg_path), recurse = TRUE) + sf::st_write( + zones_sf, + expected_gpkg_path, + delete_dsn = TRUE, + delete_layer = TRUE + ) + + return(zones_sf) } @@ -148,13 +197,12 @@ return(zones_sf) #' #' @param zones_path The path to the zones spatial data file. #' @inheritParams spod_get_zones -#' @return A spatial object containing the cleaned zones data. +#' @return A spatial object containing the cleaned zones data. 
#' @keywords internal #' @importFrom rlang .data #' spod_clean_zones_v1 <- function(zones_path, zones) { - - if(fs::file_exists(zones_path) == FALSE) { + if (fs::file_exists(zones_path) == FALSE) { stop("File does not exist: ", zones_path) } suppressWarnings({ @@ -169,26 +217,50 @@ spod_clean_zones_v1 <- function(zones_path, zones) { # load and prepare id relations for districts relations_districts <- readr::read_delim( - file = paste0(spod_get_data_dir(), "/", + file = paste0( + spod_get_data_dir(), + "/", spod_subfolder_raw_data_cache(1), - "relaciones_distrito_mitma.csv"), - delim = "|", show_col_types = FALSE + "relaciones_distrito_mitma.csv" + ), + delim = "|", + show_col_types = FALSE ) relations_districts_col_names <- names(relations_districts) - relations_districts_col_names <- gsub("distrito", "district", relations_districts_col_names) - relations_districts_col_names <- gsub("municipio", "municipality", relations_districts_col_names) - relations_districts_col_names <- gsub("^district$", "census_district", relations_districts_col_names) + relations_districts_col_names <- gsub( + "distrito", + "district", + relations_districts_col_names + ) + relations_districts_col_names <- gsub( + "municipio", + "municipality", + relations_districts_col_names + ) + relations_districts_col_names <- gsub( + "^district$", + "census_district", + relations_districts_col_names + ) names(relations_districts) <- relations_districts_col_names - + # load and prepare id relations for municipalities relations_municipalities <- readr::read_delim( - file = paste0(spod_get_data_dir(), "/", + file = paste0( + spod_get_data_dir(), + "/", spod_subfolder_raw_data_cache(1), - "relaciones_municipio_mitma.csv"), - delim = "|", show_col_types = FALSE + "relaciones_municipio_mitma.csv" + ), + delim = "|", + show_col_types = FALSE ) relations_municipalities_col_names <- names(relations_municipalities) - relations_municipalities_col_names <- gsub("municipio", "municipality", 
relations_municipalities_col_names) + relations_municipalities_col_names <- gsub( + "municipio", + "municipality", + relations_municipalities_col_names + ) names(relations_municipalities) <- relations_municipalities_col_names # summarise districts relations including municipality data @@ -199,8 +271,9 @@ spod_clean_zones_v1 <- function(zones_path, zones) { dplyr::summarize( municipalities = paste(.data$municipality, collapse = "; ") ), - by = "municipality_mitma") |> - dplyr::group_by(.data$district_mitma) |> + by = "municipality_mitma" + ) |> + dplyr::group_by(.data$district_mitma) |> dplyr::summarize( census_districts = paste(.data$census_district, collapse = "; "), municipalities_mitma = paste(.data$municipality_mitma, collapse = "; "), @@ -210,14 +283,15 @@ spod_clean_zones_v1 <- function(zones_path, zones) { # summarise municipalities relations relations_municipalities_aggregated <- relations_municipalities |> dplyr::left_join( - relations_districts |> + relations_districts |> dplyr::group_by(.data$municipality_mitma) |> dplyr::summarize( census_districts = paste(.data$census_district, collapse = "; "), districts_mitma = paste(.data$district_mitma, collapse = "; ") - ) - , by = "municipality_mitma") |> - dplyr::group_by(.data$municipality_mitma) |> + ), + by = "municipality_mitma" + ) |> + dplyr::group_by(.data$municipality_mitma) |> dplyr::summarize( municipalities = paste(.data$municipality, collapse = "; "), districts_mitma = paste(.data$districts_mitma, collapse = "; "), @@ -225,60 +299,71 @@ spod_clean_zones_v1 <- function(zones_path, zones) { ) # cleanup duplacate ids in municipalities - relations_municipalities_aggregated <- relations_municipalities_aggregated |> + relations_municipalities_aggregated <- relations_municipalities_aggregated |> dplyr::mutate( dplyr::across( c(.data$municipalities, .data$districts_mitma, .data$census_districts), spod_unique_separated_ids ) ) - 
names(relations_municipalities_aggregated)[names(relations_municipalities_aggregated) == "municipality_mitma"] <- "id" - + names(relations_municipalities_aggregated)[ + names(relations_municipalities_aggregated) == "municipality_mitma" + ] <- "id" + # cleanup duplicate ids in districts - relations_districts_aggregated <- relations_districts_aggregated |> + relations_districts_aggregated <- relations_districts_aggregated |> dplyr::mutate( dplyr::across( - c(.data$census_districts, .data$municipalities_mitma), spod_unique_separated_ids + c(.data$census_districts, .data$municipalities_mitma), + spod_unique_separated_ids ) ) - names(relations_districts_aggregated)[names(relations_districts_aggregated) == "district_mitma"] <- "id" + names(relations_districts_aggregated)[ + names(relations_districts_aggregated) == "district_mitma" + ] <- "id" if (zones == "distritos") { - zones_sf <- zones_sf |> + zones_sf <- zones_sf |> dplyr::left_join(relations_districts_aggregated, by = "id") |> dplyr::relocate(.data$geometry, .after = dplyr::last_col()) } else if (zones == "municipios") { - zones_sf <- zones_sf |> + zones_sf <- zones_sf |> dplyr::left_join(relations_municipalities_aggregated, by = "id") |> dplyr::relocate(.data$geometry, .after = dplyr::last_col()) } # add metadata from v2 zones zones_v2_sf <- spod_get_zones_v2(zones = zones) - zones_v2_sf <- zones_v2_sf[,c("id", "name")] + zones_v2_sf <- zones_v2_sf[, c("id", "name")] names(zones_v2_sf)[names(zones_v2_sf) == "id"] <- "id_in_v2" names(zones_v2_sf)[names(zones_v2_sf) == "name"] <- "name_in_v2" suppressWarnings( zones_v2_sf_centroids <- zones_v2_sf |> sf::st_point_on_surface() ) - v2_to_v1 <- sf::st_join(zones_sf, zones_v2_sf_centroids, left = TRUE) |> - sf::st_drop_geometry() + v2_to_v1 <- sf::st_join(zones_sf, zones_v2_sf_centroids, left = TRUE) |> + sf::st_drop_geometry() v2_v_1ref <- v2_to_v1 |> - dplyr::group_by(.data$id) |> - dplyr::summarize( + dplyr::group_by(.data$id) |> + dplyr::summarize( names_in_v2_data = 
paste(.data$name_in_v2, collapse = "; "), ids_in_v2_data = paste(.data$id_in_v2, collapse = "; ") ) - eng_zones <- dplyr::if_else(zones == "distritos", true = "district", false = "municipality") - names(v2_v_1ref)[names(v2_v_1ref) == "names_in_v2_data"] <- glue::glue("{eng_zones}_names_in_v2") - names(v2_v_1ref)[names(v2_v_1ref) == "ids_in_v2_data"] <- glue::glue("{eng_zones}_ids_in_v2") - + eng_zones <- dplyr::if_else( + zones == "distritos", + true = "district", + false = "municipality" + ) + names(v2_v_1ref)[names(v2_v_1ref) == "names_in_v2_data"] <- glue::glue( + "{eng_zones}_names_in_v2" + ) + names(v2_v_1ref)[names(v2_v_1ref) == "ids_in_v2_data"] <- glue::glue( + "{eng_zones}_ids_in_v2" + ) - zones_sf <- zones_sf |> - dplyr::left_join(v2_v_1ref, by = "id") |> + zones_sf <- zones_sf |> + dplyr::left_join(v2_v_1ref, by = "id") |> dplyr::relocate(.data$geometry, .after = dplyr::last_col()) - return(zones_sf) } @@ -293,50 +378,85 @@ spod_clean_zones_v1 <- function(zones_path, zones) { #' @return A `character` string containing the path to the downloaded and extracted file. 
#' @keywords internal spod_download_zones_v1 <- function( - zones = c("districts", "dist", "distr", "distritos", "municipalities", "muni", "municip", "municipios"), + zones = c( + "districts", + "dist", + "distr", + "distritos", + "municipalities", + "muni", + "municip", + "municipios" + ), data_dir = spod_get_data_dir(), quiet = FALSE ) { -zones <- match.arg(zones) -zones <- spod_zone_names_en2es(zones) + zones <- match.arg(zones) + zones <- spod_zone_names_en2es(zones) -metadata <- spod_available_data(ver = 1, data_dir = data_dir, check_local_files = FALSE) + metadata <- spod_available_data( + ver = 1, + data_dir = data_dir, + use_s3 = TRUE, + check_local_files = FALSE + ) -# download id relation files if missing -relation_files <- metadata[grepl("relaciones_(distrito|municipio)_mitma.csv", metadata$target_url),] -if (any(!fs::file_exists(relation_files$local_path))) { - fs::dir_create(unique(fs::path_dir(relation_files$local_path)), recurse = TRUE) - invisible(curl::multi_download(urls = relation_files$target_url, destfile = relation_files$local_path, resume = FALSE, progress = TRUE)) -} + # download id relation files if missing + relation_files <- metadata[ + grepl("relaciones_(distrito|municipio)_mitma.csv", metadata$target_url), + ] + if (any(!fs::file_exists(relation_files$local_path))) { + fs::dir_create( + unique(fs::path_dir(relation_files$local_path)), + recurse = TRUE + ) + # disable curl::multi_download() for now + # invisible(curl::multi_download(urls = relation_files$target_url, destfile = relation_files$local_path, resume = FALSE, progress = TRUE)) + # relation_files <- spod_multi_download_with_progress(relation_files) + relation_files <- spod_download_in_batches(relation_files) + } -regex <- glue::glue("zonificacion_{zones}\\.") -sel_zones <- stringr::str_detect(metadata$target_url, regex) -metadata_zones <- metadata[sel_zones, ] -dir_name <- fs::path_dir(metadata_zones$local_path[1]) -if (!dir.exists(dir_name)) { - fs::dir_create(dir_name, recurse 
= TRUE) -} + regex <- glue::glue("zonificacion_{zones}\\.") + sel_zones <- stringr::str_detect(metadata$target_url, regex) + metadata_zones <- metadata[sel_zones, ] + dir_name <- fs::path_dir(metadata_zones$local_path[1]) + if (!dir.exists(dir_name)) { + fs::dir_create(dir_name, recurse = TRUE) + } -if (!fs::file_exists(metadata_zones$local_path)) { - if (isFALSE(quiet)) message("Downloading the file to: ", metadata_zones$local_path) - downloaded_file <- curl::multi_download(metadata_zones$target_url, destfiles = metadata_zones$local_path, resume = TRUE, progress = TRUE) - downloaded_file <- downloaded_file$destfile -} else { - if (isFALSE(quiet)) message("File already exists: ", metadata_zones$local_path) - downloaded_file <- metadata_zones$local_path -} + if (!fs::file_exists(metadata_zones$local_path)) { + if (isFALSE(quiet)) + message("Downloading the file to: ", metadata_zones$local_path) + downloaded_file <- spod_download_in_batches(metadata_zones) + # downloaded_file <- spod_multi_download_with_progress(metadata_zones) + # disable curl::multi_download() for now + # downloaded_file <- curl::multi_download( + # metadata_zones$target_url, + # destfiles = metadata_zones$local_path, + # resume = TRUE, + # progress = TRUE + # ) + downloaded_file <- downloaded_file$local_path + } else { + if (isFALSE(quiet)) + message("File already exists: ", metadata_zones$local_path) + downloaded_file <- metadata_zones$local_path + } -if (isFALSE(quiet)) message("Unzipping the file: ", downloaded_file) -if (!dir.exists(fs::path_dir(downloaded_file))){ - fs::dir_create(fs::path_dir(downloaded_file), recurse = TRUE) -} -utils::unzip(downloaded_file, exdir = paste0(fs::path_dir(downloaded_file), "/")) + if (isFALSE(quiet)) message("Unzipping the file: ", downloaded_file) + if (!dir.exists(fs::path_dir(downloaded_file))) { + fs::dir_create(fs::path_dir(downloaded_file), recurse = TRUE) + } + utils::unzip( + downloaded_file, + exdir = paste0(fs::path_dir(downloaded_file), "/") + ) -# 
remove artifacts (remove __MACOSX if exists) -junk_path <- paste0(fs::path_dir(downloaded_file), "/__MACOSX") -if (dir.exists(junk_path)) fs::dir_delete(junk_path) + # remove artifacts (remove __MACOSX if exists) + junk_path <- paste0(fs::path_dir(downloaded_file), "/__MACOSX") + if (dir.exists(junk_path)) fs::dir_delete(junk_path) -return(metadata_zones$local_path) + return(metadata_zones$local_path) } @@ -359,9 +479,18 @@ return(metadata_zones$local_path) #' @keywords internal spod_get_zones_v2 <- function( zones = c( - "districts", "dist", "distr", "distritos", - "municipalities", "muni", "municip", "municipios", - "lua", "large_urban_areas", "gau", "grandes_areas_urbanas" + "districts", + "dist", + "distr", + "distritos", + "municipalities", + "muni", + "municip", + "municipios", + "lua", + "large_urban_areas", + "gau", + "grandes_areas_urbanas" ), data_dir = spod_get_data_dir(), quiet = FALSE @@ -372,24 +501,32 @@ spod_get_zones_v2 <- function( # check if gpkg files are already saved and load them if available expected_gpkg_path <- fs::path( data_dir, - glue::glue(spod_subfolder_clean_data_cache(ver = 2), + glue::glue( + spod_subfolder_clean_data_cache(ver = 2), "/zones/{zones}_mitma.gpkg" ) ) if (fs::file_exists(expected_gpkg_path)) { if (isFALSE(quiet)) { - message("Loading .gpkg file that already exists in data dir: ", expected_gpkg_path) + message( + "Loading .gpkg file that already exists in data dir: ", + expected_gpkg_path + ) } return(sf::read_sf(expected_gpkg_path)) } - + # if no existing gpkg found above, continue here with download and data cleanup metadata <- spod_available_data_v2(data_dir, check_local_files = TRUE) - zones_regex <- glue::glue("(zonificacion_{zones}\\.*)|(poblacion\\.csv)|(relacion_ine_zonificacionMitma\\.csv)") + zones_regex <- glue::glue( + "(zonificacion_{zones}\\.*)|(poblacion\\.csv)|(relacion_ine_zonificacionMitma\\.csv)" + ) sel_zones <- stringr::str_detect(metadata$local_path, zones_regex) metadata_zones <- 
metadata[sel_zones, ] - metadata_zones_for_download <- metadata_zones[metadata_zones$downloaded == FALSE, ] - if (nrow(metadata_zones_for_download) > 0){ + metadata_zones_for_download <- metadata_zones[ + metadata_zones$downloaded == FALSE, + ] + if (nrow(metadata_zones_for_download) > 0) { dir_names <- unique(fs::path_dir(metadata_zones_for_download$local_path)) if (any(!dir.exists(dir_names))) { fs::dir_create(dir_names, recurse = TRUE) @@ -397,14 +534,21 @@ spod_get_zones_v2 <- function( if (isFALSE(quiet)) { message("Downloading missing zones data...") } - curl::multi_download( - urls = metadata_zones_for_download$target_url, - destfiles = metadata_zones_for_download$local_path, - resume = TRUE, - progress = TRUE + # disable curl::multi_download() for now + # curl::multi_download( + # urls = metadata_zones_for_download$target_url, + # destfiles = metadata_zones_for_download$local_path, + # resume = TRUE, + # progress = TRUE + # ) + metadata_zones_for_download <- spod_download_in_batches( + metadata_zones_for_download ) + # metadata_zones_for_download <- spod_multi_download_with_progress( + # metadata_zones_for_download + # ) } - + zones_path <- fs::dir_ls( path = fs::path(data_dir, spod_subfolder_raw_data_cache(ver = 2)), regexp = glue::glue("zonificacion_{tolower(zones)}s?\\.shp$"), @@ -421,14 +565,14 @@ spod_get_zones_v2 <- function( ) return(zones_sf) - } +} #' Fixes common issues in the zones data and cleans up variable names #' #' This function fixes any invalid geometries in the zones data and renames the "ID" column to "id". It also attacches the population counts and zone names provided in the csv files supplied by the original data provider. #' #' @param zones_path The path to the zones spatial data file. -#' @return A spatial `sf`` object containing the cleaned zones data. +#' @return A spatial `sf`` object containing the cleaned zones data. 
#' @importFrom stats median #' @keywords internal #' @@ -436,7 +580,7 @@ spod_clean_zones_v2 <- function(zones_path) { # detect what kind of zones find out if it is distritos, municipios or GAU zones <- stringr::str_extract(zones_path, "distritos|municipios|gaus") - if(fs::file_exists(zones_path) == FALSE) { + if (fs::file_exists(zones_path) == FALSE) { stop("File does not exist: ", zones_path) } suppressWarnings({ @@ -460,7 +604,7 @@ spod_clean_zones_v2 <- function(zones_path) { col_types = c("c", "i") ) - if (zones %in% c("distritos","gaus")) { + if (zones %in% c("distritos", "gaus")) { zone_names <- readr::read_delim( glue::glue(fs::path_dir(zones_path), "/nombres_{zones}.csv"), skip = 1, @@ -475,19 +619,24 @@ spod_clean_zones_v2 <- function(zones_path) { delim = "|", col_names = c("row", "id", "name"), col_types = c("i", "c", "i") - ) |> + ) |> dplyr::select(-"row") } - + # zones reference zones_ref <- readr::read_delim( - glue::glue(spod_get_data_dir(quiet = TRUE), "/", spod_subfolder_raw_data_cache(ver = 2), "zonificacion/relacion_ine_zonificacionMitma.csv"), + glue::glue( + spod_get_data_dir(quiet = TRUE), + "/", + spod_subfolder_raw_data_cache(ver = 2), + "zonificacion/relacion_ine_zonificacionMitma.csv" + ), delim = "|", col_types = rep("c", 6) ) zone_mitma <- glue::glue("{gsub('s$', '', zones)}_mitma") - + zones_ref_renamed <- zones_ref |> dplyr::rename( census_sections = "seccion_ine", @@ -498,15 +647,16 @@ spod_clean_zones_v2 <- function(zones_path) { luas_mitma = "gau_mitma", id = zone_mitma ) - + zones_ref_aggregated <- zones_ref_renamed |> - dplyr::group_by(.data$id) |> - dplyr::summarise( - dplyr::across( - .cols = dplyr::everything(), - .fns = ~ paste(.x, collapse = "; "), - .names = "{.col}" - )) + dplyr::group_by(.data$id) |> + dplyr::summarise( + dplyr::across( + .cols = dplyr::everything(), + .fns = ~ paste(.x, collapse = "; "), + .names = "{.col}" + ) + ) # cleanup duplacate ids in zones_ref_aggregated zones_ref_aggregated <- 
zones_ref_aggregated |> @@ -519,17 +669,19 @@ spod_clean_zones_v2 <- function(zones_path) { # combine zones with population, names, and zones reference zones_sf <- zones_sf |> - dplyr::left_join(zone_names, by = "id") |> - dplyr::left_join(population, by = "id") |> + dplyr::left_join(zone_names, by = "id") |> + dplyr::left_join(population, by = "id") |> dplyr::left_join(zones_ref_aggregated, by = "id") |> dplyr::relocate(.data$geometry, .after = dplyr::last_col()) - # load v1 zones to join ids, unless it's gau zones - if(zones != "gaus") { + if (zones != "gaus") { spod_download_zones_v1(zones = zones, quiet = TRUE) zones_v1_path <- fs::dir_ls( - path = fs::path(spod_get_data_dir(), spod_subfolder_raw_data_cache(ver = 1)), + path = fs::path( + spod_get_data_dir(), + spod_subfolder_raw_data_cache(ver = 1) + ), glob = glue::glue("*v1**{zones}/*.shp"), recurse = TRUE ) @@ -541,26 +693,32 @@ spod_clean_zones_v2 <- function(zones_path) { fixed_zones_v1_sf <- sf::st_make_valid(zones_v1_sf[invalid_geometries, ]) zones_v1_sf <- rbind(zones_v1_sf[!invalid_geometries, ], fixed_zones_sf) } - + names(zones_v1_sf)[names(zones_v1_sf) == "ID"] <- "id_in_v1" suppressWarnings( zones_v2_sf_centroids <- zones_sf |> sf::st_point_on_surface() ) - v2_to_v1 <- sf::st_join(zones_v1_sf, zones_v2_sf_centroids, left = TRUE) |> + v2_to_v1 <- sf::st_join(zones_v1_sf, zones_v2_sf_centroids, left = TRUE) |> sf::st_drop_geometry() v2_v_1ref <- v2_to_v1 |> - dplyr::group_by(.data$id) |> - dplyr::summarize( + dplyr::group_by(.data$id) |> + dplyr::summarize( ids_in_v1_data = paste(.data$id_in_v1, collapse = "; ") ) - eng_zones <- dplyr::if_else(zones == "distritos", true = "district", false = "municipality") - names(v2_v_1ref)[names(v2_v_1ref) == "ids_in_v1_data"] <- glue::glue("{eng_zones}_ids_in_v1") + eng_zones <- dplyr::if_else( + zones == "distritos", + true = "district", + false = "municipality" + ) + names(v2_v_1ref)[names(v2_v_1ref) == "ids_in_v1_data"] <- glue::glue( + 
"{eng_zones}_ids_in_v1" + ) - zones_sf <- zones_sf |> - dplyr::left_join(v2_v_1ref, by = "id") |> + zones_sf <- zones_sf |> + dplyr::left_join(v2_v_1ref, by = "id") |> dplyr::relocate(.data$geometry, .after = dplyr::last_col()) } - + return(zones_sf) } diff --git a/R/get.R b/R/get.R index 55135123..20a0fa33 100644 --- a/R/get.R +++ b/R/get.R @@ -1,16 +1,16 @@ #' Get tabular mobility data -#' -#' +#' +#' #' @description -#' +#' #' `r lifecycle::badge("stable")` -#' -#' This function creates a DuckDB lazy table connection object from the specified type and zones. It checks for missing data and downloads it if necessary. The connnection is made to the raw CSV files in gzip archives, so analysing the data through this connection may be slow if you select more than a few days. You can manipulate this object using `dplyr` functions such as \link[dplyr]{select}, \link[dplyr]{filter}, \link[dplyr]{mutate}, \link[dplyr]{group_by}, \link[dplyr]{summarise}, etc. In the end of any sequence of commands you will need to add \link[dplyr]{collect} to execute the whole chain of data manipulations and load the results into memory in an R `data.frame`/`tibble`. See codebooks for v1 and v2 data in vignettes with \link{spod_codebook}(1) and \link{spod_codebook}(2). -#' -#' If you want to analyse longer periods of time (especiially several months or even the whole data over several years), consider using the \link{spod_convert} and then \link{spod_connect}. -#' -#' If you want to quickly get the origin-destination data with flows aggregated for a single day at municipal level and without any extra socio-economic variables, consider using the \link[spanishoddata]{spod_quick_get_od} function. -#' +#' +#' This function creates a DuckDB lazy table connection object from the specified type and zones. It checks for missing data and downloads it if necessary. 
The connection is made to the raw CSV files in gzip archives, so analysing the data through this connection may be slow if you select more than a few days. You can manipulate this object using `dplyr` functions such as \code{\link[dplyr]{select}}, \code{\link[dplyr]{filter}}, \code{\link[dplyr]{mutate}}, \code{\link[dplyr]{group_by}}, \code{\link[dplyr]{summarise}}, etc. In the end of any sequence of commands you will need to add \code{\link[dplyr]{collect}} to execute the whole chain of data manipulations and load the results into memory in an R `data.frame`/`tibble`. See codebooks for v1 and v2 data in vignettes with \code{\link{spod_codebook}(1)} and \code{\link{spod_codebook}(2)}.
+#'
+#' If you want to analyse longer periods of time (especially several months or even the whole data over several years), consider using the \code{\link{spod_convert}} and then \code{\link{spod_connect}}.
+#'
+#' If you want to quickly get the origin-destination data with flows aggregated for a single day at municipal level and without any extra socio-economic variables, consider using the \code{\link{spod_quick_get_od}} function.
+#'
 #' @param duckdb_target (Optional) The path to the duckdb file to save the data to, if a convertation from CSV is reuqested by the `spod_convert` function. If not specified, it will be set to ":memory:" and the data will be stored in memory. 
#' @inheritParams spod_download #' @inheritParams spod_duckdb_limit_resources @@ -20,7 +20,7 @@ #' @export #' @examplesIf interactive() #' \donttest{ -#' +#' #' # create a connection to the v1 data #' spod_set_data_dir(tempdir()) #' dates <- c("2020-02-14", "2020-03-14", "2021-02-14", "2021-02-14", "2021-02-15") @@ -32,21 +32,33 @@ #' # access the source connection with all dates #' # list tables #' DBI::dbListTables(nt_dist$src$con) -#' +#' #' # disconnect #' spod_disconnect(nt_dist) #' } -#' +#' spod_get <- function( type = c( - "od", "origin-destination", - "os", "overnight_stays", - "nt", "number_of_trips" + "od", + "origin-destination", + "os", + "overnight_stays", + "nt", + "number_of_trips" ), zones = c( - "districts", "dist", "distr", "distritos", - "municipalities", "muni", "municip", "municipios", - "lua", "large_urban_areas", "gau", "grandes_areas_urbanas" + "districts", + "dist", + "distr", + "distritos", + "municipalities", + "muni", + "municip", + "municipios", + "lua", + "large_urban_areas", + "gau", + "grandes_areas_urbanas" ), dates = NULL, data_dir = spod_get_data_dir(), @@ -58,14 +70,35 @@ spod_get <- function( temp_path = spod_get_temp_dir(), ignore_missing_dates = FALSE ) { - # Validate inputs - checkmate::assert_choice(type, choices = c("od", "origin-destination", "os", "overnight_stays", "nt", "number_of_trips")) - checkmate::assert_choice(zones, choices = c( - "districts", "dist", "distr", "distritos", - "municipalities", "muni", "municip", "municipios", - "lua", "large_urban_areas", "gau", "grandes_areas_urbanas" - )) + checkmate::assert_choice( + type, + choices = c( + "od", + "origin-destination", + "os", + "overnight_stays", + "nt", + "number_of_trips" + ) + ) + checkmate::assert_choice( + zones, + choices = c( + "districts", + "dist", + "distr", + "distritos", + "municipalities", + "muni", + "municip", + "municipios", + "lua", + "large_urban_areas", + "gau", + "grandes_areas_urbanas" + ) + ) checkmate::assert_flag(quiet) 
checkmate::assert_number(max_mem_gb, lower = 1) checkmate::assert_integerish(max_n_cpu, lower = 1) @@ -77,9 +110,11 @@ spod_get <- function( # simple null check is enough here, as spod_dates_arugument_to_dates_seq will do additional checks anyway if (is.null(dates)) { - message("`dates` argument is undefined. Please set `dates='cached_v1'` or `dates='cached_v2'` to convert all data that was previously downloaded. Alternatively, specify at least one date between 2020-02-14 and 2021-05-09 (for v1 data) or between 2022-01-01 onwards (for v2). Any missing data will be downloaded before conversion. For more details on the dates argument, see ?spod_get.") + message( + "`dates` argument is undefined. Please set `dates='cached_v1'` or `dates='cached_v2'` to convert all data that was previously downloaded. Alternatively, specify at least one date between 2020-02-14 and 2021-05-09 (for v1 data) or between 2022-01-01 onwards (for v2). Any missing data will be downloaded before conversion. For more details on the dates argument, see ?spod_get." 
+ ) } - + # normalise type type <- spod_match_data_type(type = type) @@ -95,16 +130,16 @@ spod_get <- function( # normalise zones zones <- spod_zone_names_en2es(zones) - + # check if user is requesting to just get all cached data cached_data_requested <- length(dates) == 1 && all(as.character(dates) %in% c("cached_v1", "cached_v2")) - - + if (isFALSE(cached_data_requested)) { dates <- spod_dates_argument_to_dates_seq(dates = dates) ver <- spod_infer_data_v_from_dates( - dates = dates, ignore_missing_dates = ignore_missing_dates + dates = dates, + ignore_missing_dates = ignore_missing_dates ) # use the spot_download_data() function to download any missing data spod_download( @@ -120,8 +155,7 @@ spod_get <- function( } else if (isTRUE(cached_data_requested)) { ver <- as.numeric(stringr::str_extract(dates, "(1|2)$")) } - - + # create in memory duckdb connection drv <- duckdb::duckdb() con <- DBI::dbConnect(drv, dbdir = duckdb_target, read_only = FALSE) @@ -156,7 +190,7 @@ spod_get <- function( data_dir = data_dir ) } - + clean_csv_view_name <- glue::glue("{type}_csv_clean") clean_filtered_csv_view_name <- glue::glue("{type}_csv_clean_filtered") @@ -175,7 +209,7 @@ spod_get <- function( # https://duckdb.org/2024/07/09/memory-management.html#intermediate-spilling # if target were set as a database file, temp would be created at the same path # however, when the working in-memory on folder of CSV files, temp is created in the root of R working directory, which may be undesirable - if ( duckdb_target == ":memory:" ) { + if (duckdb_target == ":memory:") { con <- spod_duckdb_set_temp(con, temp_path = temp_path) } diff --git a/R/internal-utils.R b/R/internal-utils.R index 5a631e5d..208ab76e 100644 --- a/R/internal-utils.R +++ b/R/internal-utils.R @@ -561,3 +561,37 @@ spod_request_length <- function(graphql_query) { return(content_length) } + +# reworked is_installed2 from https://github.com/dataheld/elf/blob/main/R/dependencies.R +#' Checks if a package is installed and 
*informs* the user if not +#' +#' This is wrapper around [rlang::check_installed]; +#' instead of erroring out if the check fails it returns `FALSE`. +#' However, unlike [rlang::is_installed], it emits a message to the user. +#' +#' @inheritParams rlang::check_installed +#' @inheritDotParams rlang::check_installed +#' @keywords internal +spod_assert_package <- function(...) { + if (rlang::is_installed(...)) { + return(TRUE) + } + + withRestarts( + tryCatch( + rlang::check_installed(...), + error = function(cnd) { + if (inherits(cnd, "rlib_error_package_not_found")) { + message("The required package is not installed.") + stop(cnd) # Re-throw the error + } + } + ), + abort = function(cnd) { + message("The required package is not installed.") + stop(cnd) # Re-throw the error + } + ) + + rlang::is_installed(...) +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 9f049d11..29fed25f 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -63,3 +63,4 @@ reference: - spod_set_data_dir - spod_get_data_dir - spod_cite + - spod_check_files diff --git a/inst/extdata/available_data_v1.rds b/inst/extdata/available_data_v1.rds new file mode 100644 index 00000000..0a1bef69 Binary files /dev/null and b/inst/extdata/available_data_v1.rds differ diff --git a/inst/extdata/url_file_sizes_v1.txt.gz b/inst/extdata/url_file_sizes_v1.txt.gz deleted file mode 100644 index 1583b31b..00000000 Binary files a/inst/extdata/url_file_sizes_v1.txt.gz and /dev/null differ diff --git a/man/spod_assert_package.Rd b/man/spod_assert_package.Rd new file mode 100644 index 00000000..dc7ac4b7 --- /dev/null +++ b/man/spod_assert_package.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/internal-utils.R +\name{spod_assert_package} +\alias{spod_assert_package} +\title{Checks if a package is installed and \emph{informs} the user if not} +\usage{ +spod_assert_package(...) 
+} +\arguments{ +\item{...}{ + Arguments passed on to \code{\link[rlang:is_installed]{rlang::check_installed}} + \describe{ + \item{\code{pkg}}{The package names. Can include version requirements, +e.g. \code{"pkg (>= 1.0.0)"}.} + \item{\code{version}}{Minimum versions for \code{pkg}. If supplied, must be the +same length as \code{pkg}. \code{NA} elements stand for any versions.} + \item{\code{compare}}{A character vector of comparison operators to use +for \code{version}. If supplied, must be the same length as +\code{version}. If \code{NULL}, \code{>=} is used as default for all +elements. \code{NA} elements in \code{compare} are also set to \code{>=} by +default.} + \item{\code{reason}}{Optional string indicating why is \code{pkg} needed. +Appears in error messages (if non-interactive) and user prompts +(if interactive).} + \item{\code{action}}{An optional function taking \code{pkg} and \code{...} +arguments. It is called by \code{check_installed()} when the user +chooses to update outdated packages. The function is passed the +missing and outdated packages as a character vector of names.} + \item{\code{call}}{The execution environment of a currently +running function, e.g. \code{caller_env()}. The function will be +mentioned in error messages as the source of the error. See the +\code{call} argument of \code{\link[rlang:abort]{abort()}} for more information.} + }} +} +\description{ +This is wrapper around \link[rlang:is_installed]{rlang::check_installed}; +instead of erroring out if the check fails it returns \code{FALSE}. +However, unlike \link[rlang:is_installed]{rlang::is_installed}, it emits a message to the user. 
+} +\keyword{internal} diff --git a/man/spod_available_data.Rd b/man/spod_available_data.Rd index 5ba51450..b1396ad2 100644 --- a/man/spod_available_data.Rd +++ b/man/spod_available_data.Rd @@ -8,17 +8,23 @@ spod_available_data( ver = 2, check_local_files = FALSE, quiet = FALSE, - data_dir = spod_get_data_dir() + data_dir = spod_get_data_dir(), + use_s3 = TRUE, + force = FALSE ) } \arguments{ -\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards.} +\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards. See more details in codebooks with \code{\link[=spod_codebook]{spod_codebook()}}.} -\item{check_local_files}{Whether to check if the local files exist. Defaults to \code{FALSE}.} +\item{check_local_files}{Logical. Whether to check if the local files exist and get the file size. Defaults to \code{FALSE}.} \item{quiet}{A \code{logical} value indicating whether to suppress messages. Default is \code{FALSE}.} \item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()}.} + +\item{use_s3}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Logical. If \code{TRUE}, use Amazon S3 to get available data list, which does not require downloading the XML file and caching it locally, which may be a bit faster. If \code{FALSE}, use the XML file to get available data list.} + +\item{force}{Logical. If \code{TRUE}, force re-download of metadata. For Amazon S3 this queries the S3 bucket for the XML file it re-downloads. If \code{FALSE}, only update the available data list if it is older than 1 day.} } \value{ A tibble with links, release dates of files in the data, dates of data coverage, local paths to files, and the download status. 
@@ -28,14 +34,14 @@ A tibble with links, release dates of files in the data, dates of data coverage, \item{file_extension}{\code{character}. The file extension of the data file (e.g., 'tar', 'gz').} \item{data_ym}{\code{Date}. The year and month of the data coverage, if available.} \item{data_ymd}{\code{Date}. The specific date of the data coverage, if available.} -\item{local_path}{\code{character}. The local file path where the data is stored.} +\item{local_path}{\code{character}. The local file path where the data is (or going to be) stored.} \item{downloaded}{\code{logical}. Indicator of whether the data file has been downloaded locally. This is only available if \code{check_local_files} is \code{TRUE}.} } } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#stable}{\figure{lifecycle-stable.svg}{options: alt='[Stable]'}}}{\strong{[Stable]}} -Get a table with links to available data files for the specified data version. Optionally check (see arguments) if certain files have already been downloaded into the cache directory specified with SPANISH_OD_DATA_DIR environment variable (set by \link{spod_set_data_dir}) or a custom path specified with \code{data_dir} argument. +Get a table with links to available data files for the specified data version. Optionally check (see arguments) the file size and availability of data files previously downloaded into the cache directory specified with SPANISH_OD_DATA_DIR environment variable (set by \code{\link[=spod_set_data_dir]{spod_set_data_dir()}}) or a custom path specified with \code{data_dir} argument. By default the data is fetched from Amazon S3 bucket where the data is stored. If that fails, the function falls back to downloading an XML file from the Spanish Ministry of Transport website. You can also control this behaviour with \code{use_s3} argument. 
} \examples{ \dontshow{if (interactive()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} diff --git a/man/spod_available_data_s3.Rd b/man/spod_available_data_s3.Rd new file mode 100644 index 00000000..924710a4 --- /dev/null +++ b/man/spod_available_data_s3.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/available-data-s3.R +\name{spod_available_data_s3} +\alias{spod_available_data_s3} +\title{Get available data list from Amazon S3 storage} +\usage{ +spod_available_data_s3( + ver = c(1, 2), + force = FALSE, + quiet = FALSE, + data_dir = spod_get_data_dir() +) +} +\arguments{ +\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards. See more details in codebooks with \code{\link[=spod_codebook]{spod_codebook()}}.} + +\item{force}{Logical. If \code{TRUE}, force re-download of metadata. For Amazon S3 this queries the S3 bucket for the XML file it re-downloads. If \code{FALSE}, only update the available data list if it is older than 1 day.} + +\item{quiet}{A \code{logical} value indicating whether to suppress messages. Default is \code{FALSE}.} + +\item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()}.} +} +\value{ +A tibble with links, release dates of files in the data, dates of data coverage, local paths to files, and the download status. +} +\description{ +Get a table with links to available data files for the specified data version from Amazon S3 storage. +} +\keyword{internal} diff --git a/man/spod_available_data_v1.Rd b/man/spod_available_data_v1.Rd index ed862c0c..8dd3aa86 100644 --- a/man/spod_available_data_v1.Rd +++ b/man/spod_available_data_v1.Rd @@ -7,13 +7,19 @@ spod_available_data_v1( data_dir = spod_get_data_dir(), check_local_files = FALSE, + use_s3 = TRUE, + force = FALSE, quiet = FALSE ) } \arguments{ \item{data_dir}{The directory where the data is stored. 
Defaults to the value returned by \code{spod_get_data_dir()}.} -\item{check_local_files}{Whether to check if the local files exist. Defaults to \code{FALSE}.} +\item{check_local_files}{Logical. Whether to check if the local files exist and get the file size. Defaults to \code{FALSE}.} + +\item{use_s3}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Logical. If \code{TRUE}, use Amazon S3 to get available data list, which does not require downloading the XML file and caching it locally, which may be a bit faster. If \code{FALSE}, use the XML file to get available data list.} + +\item{force}{Logical. If \code{TRUE}, force re-download of metadata. For Amazon S3 this queries the S3 bucket for the XML file it re-downloads. If \code{FALSE}, only update the available data list if it is older than 1 day.} \item{quiet}{A \code{logical} value indicating whether to suppress messages. Default is \code{FALSE}.} } @@ -25,7 +31,7 @@ A tibble with links, release dates of files in the data, dates of data coverage, \item{file_extension}{\code{character}. The file extension of the data file (e.g., 'tar', 'gz').} \item{data_ym}{\code{Date}. The year and month of the data coverage, if available.} \item{data_ymd}{\code{Date}. The specific date of the data coverage, if available.} -\item{local_path}{\code{character}. The local file path where the data is stored.} +\item{local_path}{\code{character}. The local file path where the data is (or going to be) stored.} \item{downloaded}{\code{logical}. Indicator of whether the data file has been downloaded locally. 
This is only available if \code{check_local_files} is \code{TRUE}.} } } diff --git a/man/spod_available_data_v2.Rd b/man/spod_available_data_v2.Rd index 64b327c1..82f746d6 100644 --- a/man/spod_available_data_v2.Rd +++ b/man/spod_available_data_v2.Rd @@ -7,13 +7,19 @@ spod_available_data_v2( data_dir = spod_get_data_dir(), check_local_files = FALSE, + use_s3 = TRUE, + force = FALSE, quiet = FALSE ) } \arguments{ \item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()}.} -\item{check_local_files}{Whether to check if the local files exist. Defaults to \code{FALSE}.} +\item{check_local_files}{Logical. Whether to check if the local files exist and get the file size. Defaults to \code{FALSE}.} + +\item{use_s3}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Logical. If \code{TRUE}, use Amazon S3 to get available data list, which does not require downloading the XML file and caching it locally, which may be a bit faster. If \code{FALSE}, use the XML file to get available data list.} + +\item{force}{Logical. If \code{TRUE}, force re-download of metadata. For Amazon S3 this queries the S3 bucket for the XML file it re-downloads. If \code{FALSE}, only update the available data list if it is older than 1 day.} \item{quiet}{A \code{logical} value indicating whether to suppress messages. Default is \code{FALSE}.} } @@ -25,7 +31,7 @@ A tibble with links, release dates of files in the data, dates of data coverage, \item{file_extension}{\code{character}. The file extension of the data file (e.g., 'tar', 'gz').} \item{data_ym}{\code{Date}. The year and month of the data coverage, if available.} \item{data_ymd}{\code{Date}. The specific date of the data coverage, if available.} -\item{local_path}{\code{character}. The local file path where the data is stored.} +\item{local_path}{\code{character}. 
The local file path where the data is (or going to be) stored.} \item{downloaded}{\code{logical}. Indicator of whether the data file has been downloaded locally. This is only available if \code{check_local_files} is \code{TRUE}.} } } diff --git a/man/spod_check_files.Rd b/man/spod_check_files.Rd new file mode 100644 index 00000000..76c53439 --- /dev/null +++ b/man/spod_check_files.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-files.R +\name{spod_check_files} +\alias{spod_check_files} +\title{Check cached files consistency against checksums from S3} +\usage{ +spod_check_files( + type = c("od", "origin-destination", "os", "overnight_stays", "nt", "number_of_trips"), + zones = c("districts", "dist", "distr", "distritos", "municipalities", "muni", + "municip", "municipios", "lua", "large_urban_areas", "gau", "grandes_areas_urbanas"), + dates = NULL, + data_dir = spod_get_data_dir(), + quiet = FALSE, + ignore_missing_dates = FALSE, + n_threads = 1 +) +} +\arguments{ +\item{type}{The type of data to download. Can be \code{"origin-destination"} (or ust \code{"od"}), or \code{"number_of_trips"} (or just \code{"nt"}) for v1 data. For v2 data \code{"overnight_stays"} (or just \code{"os"}) is also available. More data types to be supported in the future. See codebooks for v1 and v2 data in vignettes with \code{spod_codebook(1)} and \code{spod_codebook(2)} (\link{spod_codebook}).} + +\item{zones}{The zones for which to download the data. Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}, or the original Spanish \code{"distritos"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}, or the original Spanish \code{"municipios"}) for both data versions. Additionaly, these can be \code{"large_urban_areas"} (or \code{"lua"}, or the original Spanish \code{"grandes_areas_urbanas"}, or \code{"gau"}) for v2 data (2022 onwards).} + +\item{dates}{A \code{character} or \code{Date} vector of dates to process. 
Kindly keep in mind that v1 and v2 data follow different data collection methodologies and may not be directly comparable. Therefore, do not try to request data from both versions for the same date range. If you need to compare data from both versions, please refer to the respective codebooks and methodology documents. The v1 data covers the period from 2020-02-14 to 2021-05-09, and the v2 data covers the period from 2022-01-01 to the present until further notice. The true dates range is checked against the available data for each version on every function run. + +The possible values can be any of the following: +\itemize{ +\item For the \code{spod_get()} and \code{spod_convert()} functions, the \code{dates} can be set to "cached_v1" or "cached_v2" to request data from cached (already previously downloaded) v1 (2020-2021) or v2 (2022 onwards) data. In this case, the function will identify and use all data files that have been downloaded and cached locally (e.g. using an explicit run of \code{spod_download()}, or any data requests made using the \code{spod_get()} or \code{spod_convert()} functions). +\item A single date in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object. +\item A vector of dates in ISO (YYYY-MM-DD) or YYYYMMDD format. \code{character} or \code{Date} object. Can be any non-consecutive sequence of dates. +\item A date range +\itemize{ +\item either a \code{character} or \code{Date} object of length 2 with clearly named elements \code{start} and \code{end} in ISO (YYYY-MM-DD) or YYYYMMDD format. E.g. \code{c(start = "2020-02-15", end = "2020-02-17")}; +\item or a \code{character} object of the form \code{YYYY-MM-DD_YYYY-MM-DD} or \code{YYYYMMDD_YYYYMMDD}. For example, \verb{2020-02-15_2020-02-17} or \verb{20200215_20200217}. +} +\item A regular expression to match dates in the format \code{YYYYMMDD}. \code{character} object. For example, \verb{^202002} will match all dates in February 2020.
+}} + +\item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()} which returns the value of the environment variable \code{SPANISH_OD_DATA_DIR} or a temporary directory if the variable is not set. To set the data directory, use \link{spod_set_data_dir}.} + +\item{quiet}{A \code{logical} value indicating whether to suppress messages. Default is \code{FALSE}.} + +\item{ignore_missing_dates}{Logical. If \code{TRUE}, the function will not raise an error if some of the specified dates are missing. Any dates that are missing will be skipped, however the data for any valid dates will be acquired. Defaults to \code{FALSE}.} + +\item{n_threads}{Numeric. Number of threads to use for file verification. Defaults to 1. When set to 2 or more threads, uses \code{future.mirai} as a backend for parallelization, resulting in significant (~4x) speedup, unless disk read speed is a bottleneck.} +} +\value{ +A tibble similar to the output of \code{spod_available_data}, but with an extra column \code{local_file_consistent}, where \code{TRUE} indicates that the file checksum matches the expected checksum in Amazon S3. Note: some v1 (2020-2021) files were not stored correctly on S3 and their ETag checksums are incorrectly reported by Amazon S3, so their true file sizes and ETag checksums were cached inside the \code{spanishoddata} package. +} +\description{ +\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +\strong{WARNING: The checks may fail for May 2022 data and for some 2024 data, as the remote checksums that are used for checking the file consistency are incorrect.
We are working on solving this in future updates; for now, kindly rely on the built-in file size checks of \code{\link{spod_download}}, \code{\link{spod_get}}, and \code{\link{spod_convert}}.} This function checks whether downloaded data files are consistent with their checksums in Amazon S3 by computing the ETag for each file. This involves computing MD5 for each part of the file and concatenating them and computing MD5 again on the resulting concatenated MD5s. This may take a very long time if you check all files, so use with caution. +} +\examples{ +\dontshow{if (interactive()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +\donttest{ +spod_set_data_dir(tempdir()) +spod_download( + type = "number_of_trips", + zones = "distr", + dates = "2020-03-14" +) + +# now check the consistency +check_results <- spod_check_files( + type = "number_of_trips", + zones = "distr", + dates = "2020-03-14" +) +all(check_results$local_file_consistent) +} +\dontshow{\}) # examplesIf} +} diff --git a/man/spod_compute_s3_etag.Rd b/man/spod_compute_s3_etag.Rd new file mode 100644 index 00000000..6269c8bd --- /dev/null +++ b/man/spod_compute_s3_etag.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check-files.R +\name{spod_compute_s3_etag} +\alias{spod_compute_s3_etag} +\title{Compute ETag for a file} +\usage{ +spod_compute_s3_etag(file_path, part_size = 8 * 1024^2) +} +\arguments{ +\item{file_path}{Character. The path to the file.} + +\item{part_size}{Numeric. The size of each part in bytes. Do not change, as this is the default for S3 ETag.} +} +\value{ +Character. The ETag for the file.
+} +\description{ +Compute ETag for a file +} +\keyword{internal} diff --git a/man/spod_download.Rd b/man/spod_download.Rd index fe4262b1..4d1d8531 100644 --- a/man/spod_download.Rd +++ b/man/spod_download.Rd @@ -13,7 +13,8 @@ spod_download( data_dir = spod_get_data_dir(), quiet = FALSE, return_local_file_paths = FALSE, - ignore_missing_dates = FALSE + ignore_missing_dates = FALSE, + check_local_files = TRUE ) } \arguments{ @@ -45,6 +46,8 @@ The possible values can be any of the following: \item{return_local_file_paths}{Logical. If \code{TRUE}, the function returns a character vector of the paths to the downloaded files. If \code{FALSE}, the function returns \code{NULL}.} \item{ignore_missing_dates}{Logical. If \code{TRUE}, the function will not raise an error if the some of the specified dates are missing. Any dates that are missing will be skipped, however the data for any valid dates will be acquired. Defaults to \code{FALSE}.} + +\item{check_local_files}{Logical. Whether to check the file size of local files against known remote file sizes on the Amazon S3 storage. Defaults to \code{TRUE}, which fetches the metadata from Amazon S3. This setting ensures your downloaded files are not broken, so it is recommended to keep it \code{TRUE}.} } \value{ Nothing. If \code{return_local_file_paths = TRUE}, a \code{character} vector of the paths to the downloaded files. 
diff --git a/man/spod_download_in_batches.Rd b/man/spod_download_in_batches.Rd new file mode 100644 index 00000000..497ab133 --- /dev/null +++ b/man/spod_download_in_batches.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/download_data.R +\name{spod_download_in_batches} +\alias{spod_download_in_batches} +\title{Download multiple files with progress bar in parallel} +\usage{ +spod_download_in_batches( + files_to_download, + batch_size = 5, + bar_width = 20, + chunk_size = 1024 * 1024, + test_size = 10 * 1024 * 1024, + max_retries = 3L, + timeout = 900, + show_progress = interactive() && !isTRUE(getOption("knitr.in.progress")) +) +} +\arguments{ +\item{files_to_download}{A data frame with columns \code{target_url}, \code{local_path} and \code{file_size_bytes}.} + +\item{batch_size}{Numeric. Number of files to download at a time.} + +\item{bar_width}{Numeric. Width of the progress bar.} + +\item{chunk_size}{Numeric. Number of bytes to download at a time for speed test.} + +\item{test_size}{Numeric. Total number of bytes to download for the initial speed test.} + +\item{max_retries}{Integer. Maximum number of retries for failed downloads.} + +\item{timeout}{Numeric. Timeout in seconds for each download.} + +\item{show_progress}{Logical. Whether to show the progress bar.} +} +\value{ +A data frame with columns \code{target_url}, \code{local_path}, \code{file_size_bytes} and \code{local_file_size}. +} +\description{ +Download multiple files with a progress bar. Retries failed downloads up to 3 times. Downloads are in parallel and in batches to show progress. First 10 MB of a file is downloaded to check the speed. +} +\keyword{internal} diff --git a/man/spod_duckdb_number_of_trips.Rd b/man/spod_duckdb_number_of_trips.Rd index 749b7d5b..cb2c2d75 100644 --- a/man/spod_duckdb_number_of_trips.Rd +++ b/man/spod_duckdb_number_of_trips.Rd @@ -17,7 +17,7 @@ spod_duckdb_number_of_trips( \item{zones}{The zones for which to download the data.
Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}, or the original Spanish \code{"distritos"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}, or the original Spanish \code{"municipios"}) for both data versions. Additionaly, these can be \code{"large_urban_areas"} (or \code{"lua"}, or the original Spanish \code{"grandes_areas_urbanas"}, or \code{"gau"}) for v2 data (2022 onwards).} -\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards.} +\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards. See more details in codebooks with \code{\link[=spod_codebook]{spod_codebook()}}.} \item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()}.} } diff --git a/man/spod_duckdb_od.Rd b/man/spod_duckdb_od.Rd index 195bdcf1..ca057468 100644 --- a/man/spod_duckdb_od.Rd +++ b/man/spod_duckdb_od.Rd @@ -17,7 +17,7 @@ spod_duckdb_od( \item{zones}{The zones for which to download the data. Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}, or the original Spanish \code{"distritos"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}, or the original Spanish \code{"municipios"}) for both data versions. Additionaly, these can be \code{"large_urban_areas"} (or \code{"lua"}, or the original Spanish \code{"grandes_areas_urbanas"}, or \code{"gau"}) for v2 data (2022 onwards).} -\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards.} +\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards. See more details in codebooks with \code{\link[=spod_codebook]{spod_codebook()}}.} \item{data_dir}{The directory where the data is stored. 
Defaults to the value returned by \code{spod_get_data_dir()}.} } diff --git a/man/spod_duckdb_overnight_stays.Rd b/man/spod_duckdb_overnight_stays.Rd index e7b1471b..73018c40 100644 --- a/man/spod_duckdb_overnight_stays.Rd +++ b/man/spod_duckdb_overnight_stays.Rd @@ -17,7 +17,7 @@ spod_duckdb_overnight_stays( \item{zones}{The zones for which to download the data. Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}, or the original Spanish \code{"distritos"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}, or the original Spanish \code{"municipios"}) for both data versions. Additionaly, these can be \code{"large_urban_areas"} (or \code{"lua"}, or the original Spanish \code{"grandes_areas_urbanas"}, or \code{"gau"}) for v2 data (2022 onwards).} -\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards.} +\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards. See more details in codebooks with \code{\link[=spod_codebook]{spod_codebook()}}.} \item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()}.} } diff --git a/man/spod_get.Rd b/man/spod_get.Rd index 639b5f11..77f10c8e 100644 --- a/man/spod_get.Rd +++ b/man/spod_get.Rd @@ -61,11 +61,11 @@ A DuckDB lazy table connection object of class \code{tbl_duckdb_connection}. \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#stable}{\figure{lifecycle-stable.svg}{options: alt='[Stable]'}}}{\strong{[Stable]}} -This function creates a DuckDB lazy table connection object from the specified type and zones. It checks for missing data and downloads it if necessary. The connnection is made to the raw CSV files in gzip archives, so analysing the data through this connection may be slow if you select more than a few days. 
You can manipulate this object using \code{dplyr} functions such as \link[dplyr]{select}, \link[dplyr]{filter}, \link[dplyr]{mutate}, \link[dplyr]{group_by}, \link[dplyr]{summarise}, etc. In the end of any sequence of commands you will need to add \link[dplyr]{collect} to execute the whole chain of data manipulations and load the results into memory in an R \code{data.frame}/\code{tibble}. See codebooks for v1 and v2 data in vignettes with \link{spod_codebook}(1) and \link{spod_codebook}(2). +This function creates a DuckDB lazy table connection object from the specified type and zones. It checks for missing data and downloads it if necessary. The connection is made to the raw CSV files in gzip archives, so analysing the data through this connection may be slow if you select more than a few days. You can manipulate this object using \code{dplyr} functions such as \code{\link[dplyr]{select}}, \code{\link[dplyr]{filter}}, \code{\link[dplyr]{mutate}}, \code{\link[dplyr]{group_by}}, \code{\link[dplyr]{summarise}}, etc. In the end of any sequence of commands you will need to add \code{\link[dplyr]{collect}} to execute the whole chain of data manipulations and load the results into memory in an R \code{data.frame}/\code{tibble}. See codebooks for v1 and v2 data in vignettes with \code{\link{spod_codebook}(1)} and \code{\link{spod_codebook}(2)}. -If you want to analyse longer periods of time (especiially several months or even the whole data over several years), consider using the \link{spod_convert} and then \link{spod_connect}. +If you want to analyse longer periods of time (especially several months or even the whole data over several years), consider using the \code{\link{spod_convert}} and then \code{\link{spod_connect}}. -If you want to quickly get the origin-destination data with flows aggregated for a single day at municipal level and without any extra socio-economic variables, consider using the \link[spanishoddata]{spod_quick_get_od} function.
+If you want to quickly get the origin-destination data with flows aggregated for a single day at municipal level and without any extra socio-economic variables, consider using the \code{\link{spod_quick_get_od}} function. } \examples{ \dontshow{if (interactive()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} diff --git a/man/spod_get_valid_dates.Rd b/man/spod_get_valid_dates.Rd index f4d20072..270f699a 100644 --- a/man/spod_get_valid_dates.Rd +++ b/man/spod_get_valid_dates.Rd @@ -7,7 +7,7 @@ spod_get_valid_dates(ver = NULL) } \arguments{ -\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards.} +\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards. See more details in codebooks with \code{\link[=spod_codebook]{spod_codebook()}}.} } \value{ A vector of type \code{Date} with all possible valid dates for the specified data version (v1 for 2020-2021 and v2 for 2020 onwards). diff --git a/man/spod_get_zones.Rd b/man/spod_get_zones.Rd index 81674c8c..22f903fc 100644 --- a/man/spod_get_zones.Rd +++ b/man/spod_get_zones.Rd @@ -15,7 +15,7 @@ spod_get_zones( \arguments{ \item{zones}{The zones for which to download the data. Can be \code{"districts"} (or \code{"dist"}, \code{"distr"}, or the original Spanish \code{"distritos"}) or \code{"municipalities"} (or \code{"muni"}, \code{"municip"}, or the original Spanish \code{"municipios"}) for both data versions. Additionaly, these can be \code{"large_urban_areas"} (or \code{"lua"}, or the original Spanish \code{"grandes_areas_urbanas"}, or \code{"gau"}) for v2 data (2022 onwards).} -\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards.} +\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards. 
See more details in codebooks with \code{\link[=spod_codebook]{spod_codebook()}}.} \item{data_dir}{The directory where the data is stored. Defaults to the value returned by \code{spod_get_data_dir()} which returns the value of the environment variable \code{SPANISH_OD_DATA_DIR} or a temporary directory if the variable is not set. To set the data directory, use \link{spod_set_data_dir}.} diff --git a/man/spod_match_data_type_for_local_folders.Rd b/man/spod_match_data_type_for_local_folders.Rd index dc1323a5..54f7fb51 100644 --- a/man/spod_match_data_type_for_local_folders.Rd +++ b/man/spod_match_data_type_for_local_folders.Rd @@ -10,7 +10,7 @@ spod_match_data_type_for_local_folders( ) } \arguments{ -\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards.} +\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards. See more details in codebooks with \code{\link[=spod_codebook]{spod_codebook()}}.} } \value{ A \code{character} string with the folder name for the specified data type. Or \code{NULL} if the data type is not recognized. 
diff --git a/man/spod_multi_download_with_progress.Rd b/man/spod_multi_download_with_progress.Rd new file mode 100644 index 00000000..96bc9b8f --- /dev/null +++ b/man/spod_multi_download_with_progress.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/download_data.R +\name{spod_multi_download_with_progress} +\alias{spod_multi_download_with_progress} +\title{Download multiple files with progress bar sequentially} +\usage{ +spod_multi_download_with_progress( + files_to_download, + chunk_size = 1024 * 1024, + bar_width = 20, + show_progress = interactive() && !isTRUE(getOption("knitr.in.progress")) +) +} +\arguments{ +\item{files_to_download}{A data frame with columns \code{target_url}, \code{local_path} and \code{file_size_bytes}.} + +\item{chunk_size}{Number of bytes to download at a time.} + +\item{bar_width}{Width of the progress bar.} + +\item{show_progress}{Whether to show the progress bar.} +} +\value{ +A data frame with columns \code{target_url}, \code{local_path}, \code{file_size_bytes} and \code{local_file_size}. +} +\description{ +Download multiple files with a progress bar. Retries failed downloads up to 3 times. +} +\keyword{internal} diff --git a/man/spod_store_etags.Rd b/man/spod_store_etags.Rd new file mode 100644 index 00000000..29d63cd7 --- /dev/null +++ b/man/spod_store_etags.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dev-tools.R +\name{spod_store_etags} +\alias{spod_store_etags} +\title{Get ETags for locally saved v1 data files and save them into an RDS file in the inst/extdata folder.} +\usage{ +spod_store_etags() +} +\value{ +Returns a tibble with the local path, local ETag and remote ETag. +} +\description{ +Get ETags for locally saved v1 data files and save them into an RDS file in the inst/extdata folder.
+} +\keyword{internal} diff --git a/man/spod_subfolder_clean_data_cache.Rd b/man/spod_subfolder_clean_data_cache.Rd index 623c21f2..8ee66b7a 100644 --- a/man/spod_subfolder_clean_data_cache.Rd +++ b/man/spod_subfolder_clean_data_cache.Rd @@ -7,7 +7,7 @@ spod_subfolder_clean_data_cache(ver = 1) } \arguments{ -\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards.} +\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards. See more details in codebooks with \code{\link[=spod_codebook]{spod_codebook()}}.} } \value{ A \code{character} string with the subfolder name for the clean data cache. diff --git a/man/spod_subfolder_raw_data_cache.Rd b/man/spod_subfolder_raw_data_cache.Rd index 4044d0e6..b2e99971 100644 --- a/man/spod_subfolder_raw_data_cache.Rd +++ b/man/spod_subfolder_raw_data_cache.Rd @@ -7,7 +7,7 @@ spod_subfolder_raw_data_cache(ver = 1) } \arguments{ -\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards.} +\item{ver}{Integer. Can be 1 or 2. The version of the data to use. v1 spans 2020-2021, v2 covers 2022 and onwards. See more details in codebooks with \code{\link[=spod_codebook]{spod_codebook()}}.} } \value{ A \code{character} string with the subfolder name for the raw data cache. 
diff --git a/tests/testthat/test-quick-get.R b/tests/testthat/test-quick-get.R index da1ae9aa..b0d6a655 100644 --- a/tests/testthat/test-quick-get.R +++ b/tests/testthat/test-quick-get.R @@ -1,4 +1,3 @@ -# some tests are disabled as the API endpoint is not working because of the new restrictions see https://github.com/rOpenSpain/spanishoddata/issues/162 test_that("spod_quick_get_od fails out of range dates", { skip_on_ci() skip_on_cran() @@ -10,6 +9,7 @@ test_that("spod_quick_get_od fails out of range dates", { ) }) + test_that("spod_quick_get_od fails on invalid date format", { expect_error( spod_quick_get_od(