Merged

52 commits
6ea2b56
use abs path instead of real path to avoid issues with network drives
e-kotov May 16, 2025
f302f24
abs instead of real path in get data dir
e-kotov May 16, 2025
da696ac
get v1 available data from s3 and fetch file sizes
e-kotov May 16, 2025
da79487
v2 available data from S3 with file sizes and etags
e-kotov May 17, 2025
e40d6bc
reformat spod_download with air
e-kotov May 17, 2025
99cee6d
check_local_files based on the file size check
e-kotov May 17, 2025
c9fe3fe
s3 is default and is cached on disk
e-kotov May 17, 2025
14bac0e
minor fixes to scoping
e-kotov May 17, 2025
9afcd72
disable quick get live test
e-kotov May 17, 2025
dce2147
remove non ascii chars, scope tail to utils
e-kotov May 17, 2025
6461eeb
spod_download defaults to s3 metadata and checks local file sizes
e-kotov May 17, 2025
11e1bc8
update docs for download and available_data
e-kotov May 17, 2025
bdccdf9
xml as failsafe for S3 and memoised xml load
e-kotov May 17, 2025
6fb127c
depend on paws instead of aws.s3 pkg, fix minor bugs in available dat…
e-kotov May 18, 2025
933ae6c
depend on paws.storage explicitly to prevent too many deps of paws pa…
e-kotov May 18, 2025
fdb506d
disable region and url style env vars for S3, fixes in metadata fetch…
e-kotov May 18, 2025
436f83d
air format internal utils
e-kotov May 18, 2025
66b614b
custom multi file downloader because curl multi down fails a lot on s…
e-kotov May 18, 2025
9ce412d
drop curl dependency, only use base r downloads and custom function
e-kotov May 18, 2025
0f0d5dd
store v1 data metadata esp. true remote file sizes with the package
e-kotov May 18, 2025
61ce5c5
quick fix for requested_files
e-kotov May 18, 2025
3f65500
make avail data s3 internal
e-kotov May 18, 2025
4d4b126
down speed and eta in progress bar
e-kotov May 18, 2025
74fd2fc
fix spod get zones v1 error
e-kotov May 19, 2025
3aa7bda
fix progress ETA display
e-kotov May 19, 2025
4fb8894
show file names for files without data_ymd column
e-kotov May 19, 2025
748e03d
file checker function
e-kotov May 19, 2025
4b3e6e5
parallel downloader with base R download.file with libcurl backend
e-kotov May 19, 2025
7a7b3d1
fixes in download data batches
e-kotov May 19, 2025
0f82114
add timeout
e-kotov May 19, 2025
3e68b93
proper batching with retries
e-kotov May 19, 2025
ee622d7
add internal dev helper to store etags for downloaded v1 files
e-kotov May 19, 2025
6072d65
update news
e-kotov May 19, 2025
1d4b31a
spod_store_etags returns tbl
e-kotov May 19, 2025
ab3c8c6
Update man/spod_download.Rd
e-kotov May 19, 2025
146682b
fix typo
e-kotov May 20, 2025
8d55176
add true etags to v1 data
e-kotov May 21, 2025
ef15db4
Merge branch 'main' into s3-metadata
e-kotov May 21, 2025
b6c778c
fix for check all cached files
e-kotov May 21, 2025
9dd8ae4
internal spod check parallel for testing
e-kotov May 21, 2025
f135a7d
switch to mirai multisession
e-kotov May 21, 2025
237f4a0
add parallel as an option to exported check files function
e-kotov May 21, 2025
a7f079e
fixes n_threads arg name in file check
e-kotov May 21, 2025
7b9f61f
delete old v1 data size cache
e-kotov May 21, 2025
be64225
improve messaging of the check function
e-kotov May 21, 2025
7f5f87c
fix docs for spod_store_etags
e-kotov May 22, 2025
77cb0f3
add file classification columns in available data
e-kotov Jun 10, 2025
f1fd5b1
improve error message for download larger than limit
e-kotov Jun 13, 2025
de0cfe2
add .data in spod_store_etags to prevent r cmd check notes
e-kotov Jun 13, 2025
c47ad3a
update docs for spod_store_etags
e-kotov Jun 13, 2025
44b24d2
add warning to spod_check_files
e-kotov Jun 13, 2025
dfdc1ca
docs cleanup
e-kotov Jun 13, 2025
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -29,7 +29,6 @@ Depends:
R (>= 4.1.0)
Imports:
checkmate,
-    curl (>= 5.0.0),
DBI,
digest,
dplyr,
@@ -45,6 +44,7 @@ Imports:
memuse,
openssl,
parallelly,
+    paws.storage (>= 0.4.0),
purrr,
readr,
rlang,
@@ -58,6 +58,7 @@ Suggests:
flowmapper (>= 0.1.2),
furrr,
future,
+    future.mirai,
hexSticker,
mapSpain,
quarto,
1 change: 1 addition & 0 deletions NAMESPACE
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(spod_available_data)
+ export(spod_check_files)
export(spod_cite)
export(spod_codebook)
export(spod_connect)
14 changes: 11 additions & 3 deletions NEWS.md
@@ -2,13 +2,21 @@

## New features

- * `spod_quick_get_zones()` is a new function to quickly get municipality geometries to match with the data retrieved with `spod_quick_get_od()` [#163](https://github.com/rOpenSpain/spanishoddata/pull/163). Requests to get geometries are cached in memory of the current R session with the `memoise` package.
+ * `spod_quick_get_zones()` is a new function to quickly get municipality geometries to match with the data retrieved with `spod_quick_get_od()` [#163](https://github.com/rOpenSpain/spanishoddata/pull/163). Requests to get geometries are cached in memory of the current R session with the `memoise` package. This function is experimental, just like `spod_quick_get_od()`, as the API of the Spanish Ministry of Transport may change in the future. It is only intended for quick analysis for educational or other demonstration purposes, as it downloads very little data compared to the regular `spod_get_od()`, `spod_download()` and `spod_convert()` functions.

+ * `spod_check_files()` is a new function that checks the consistency of downloaded files against Amazon S3 checksums (PR [#165](https://github.com/rOpenSpain/spanishoddata/pull/165)). ETags for v1 data are stored with the package; for v2 data they are fetched from Amazon S3. This function is experimental.

## Improvements

+ * Metadata is now fetched from the Amazon S3 storage of the original data files, which allows validation of downloaded files ([#126](https://github.com/rOpenSpain/spanishoddata/issues/126)) by both size and checksum. PR [#165](https://github.com/rOpenSpain/spanishoddata/pull/165).

## Bug fixes

- * `spod_quick_get_od()` is working again. We fixed it to work with the updated API of the Spanish Ministry of Transport (PR [#163](https://github.com/rOpenSpain/spanishoddata/pull/163), issue [#162](https://github.com/rOpenSpain/spanishoddata/issues/162)). It will remain experimental, as the API may change in the future.
+ * More reliable, yet still multi-threaded, data file downloads using base R `utils::download.file()` instead of `curl::multi_download()`, which failed on some connections ([#127](https://github.com/rOpenSpain/spanishoddata/issues/127)), so the `curl` dependency is no longer required. PR [#165](https://github.com/rOpenSpain/spanishoddata/pull/165).

+ * `spod_quick_get_od()` is working again. We fixed it to work with the updated API of the Spanish Ministry of Transport (PR [#163](https://github.com/rOpenSpain/spanishoddata/pull/163), issue [#162](https://github.com/rOpenSpain/spanishoddata/issues/162)). This function will remain experimental, just like `spod_quick_get_zones()`, as the API of the Spanish Ministry of Transport may change in the future. It is only intended for quick analysis for educational or other demonstration purposes, as it downloads very little data compared to the regular `spod_get_od()`, `spod_download()` and `spod_convert()` functions.

- * `spod_convert()` can now accept `overwrite = 'update'` with `save_format = 'parquet'` ([#161](https://github.com/rOpenSpain/spanishoddata/pull/161)) previously it failed because of the incorrect check that asserted only `TRUE` or `FALSE` ([#160](https://github.com/rOpenSpain/spanishoddata/issues/160))
+ * `spod_convert()` now accepts `overwrite = 'update'` with `save_format = 'parquet'` ([#161](https://github.com/rOpenSpain/spanishoddata/pull/161)); previously it failed because of an incorrect check that allowed only `TRUE` or `FALSE` ([#160](https://github.com/rOpenSpain/spanishoddata/issues/160))

# spanishoddata 0.1.1

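The NEWS entries above describe validating downloaded files by both size and checksum. The size part of such a check reduces to a comparison like the one below; this is a self-contained sketch with a hypothetical name, not the package's actual `check_local_files` implementation:

```r
# Sketch: a file passes the size check only if it exists on disk and its
# size in bytes exactly matches the size reported by the S3 metadata.
check_sizes <- function(paths, expected_size_bytes) {
  actual <- file.size(paths) # returns NA for missing files
  !is.na(actual) & actual == expected_size_bytes
}
```

Missing files come back as `FALSE` rather than `NA`, so the result can be used directly to pick files for re-download.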
187 changes: 187 additions & 0 deletions R/available-data-s3.R
@@ -0,0 +1,187 @@
#' Get available data list from Amazon S3 storage
#'
#' @description
#'
#' Get a table with links to available data files for the specified data version from Amazon S3 storage.
#'
#' @inheritParams spod_available_data
#' @inheritParams global_quiet_param
#' @return A tibble with links, release dates of files in the data, dates of data coverage, local paths to files, and the download status.
#'
#' @keywords internal
spod_available_data_s3 <- function(
ver = c(1, 2),
force = FALSE,
quiet = FALSE,
data_dir = spod_get_data_dir()
) {
ver <- as.character(ver)
ver <- match.arg(ver)
metadata_folder <- glue::glue("{data_dir}/{spod_subfolder_metadata_cache()}")

# if forcing, evict the in-session cache now
if (isTRUE(force)) {
memoise::forget(spod_available_data_s3_memoised)
}

# shortcut: if we already have it memoised, return immediately
if (!force && memoise::has_cache(spod_available_data_s3_memoised)(ver)) {
if (!quiet) message("Using memory-cached available data from S3")
return(spod_available_data_s3_memoised(ver))
}

# no in-session cache, check the on-disk RDS cache
pattern <- glue::glue("metadata_s3_v{ver}_\\d{{4}}-\\d{{2}}-\\d{{2}}\\.rds$")
rds_files <- fs::dir_ls(
path = metadata_folder,
type = "file",
regexp = pattern
) |>
sort()

latest_file <- utils::tail(rds_files, 1)
latest_date <- if (length(latest_file) == 1) {
stringr::str_extract(basename(latest_file), "\\d{4}-\\d{2}-\\d{2}") |>
as.Date()
} else {
NA
}

needs_update <- isTRUE(force) ||
length(rds_files) == 0 ||
(!is.na(latest_date) && latest_date < Sys.Date())

if (!needs_update) {
if (!quiet) message("Using existing disk cache: ", latest_file)
return(readRDS(latest_file))
}

# if forcing, also clear old disk files
if (isTRUE(force) && length(rds_files) > 0) {
fs::file_delete(rds_files)
}

# fetch via the memoised function (this will re-hit S3 if we forgot it)
if (!quiet) message("Fetching latest metadata from Amazon S3 (v", ver, ")...")
dat <- spod_available_data_s3_memoised(ver)

# write a new RDS stamped with today's date
file_date <- format(Sys.Date(), "%Y-%m-%d")
out_path <- file.path(
metadata_folder,
glue::glue("metadata_s3_v{ver}_{file_date}.rds")
)
saveRDS(dat, out_path)
if (!quiet) message("Cached new data to: ", out_path)

dat
}
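The function above layers an in-memory memoised cache over date-stamped RDS files on disk. Stripped of the S3 specifics, the disk layer follows a pattern like this; a generic sketch with hypothetical names, not the package's code:

```r
# Sketch: reuse today's cached RDS if it exists; otherwise call the
# (expensive) fetch function once and stamp the result with today's date.
fetch_with_disk_cache <- function(fetch, cache_dir, prefix = "metadata") {
  dir.create(cache_dir, showWarnings = FALSE, recursive = TRUE)
  path <- file.path(
    cache_dir,
    paste0(prefix, "_", format(Sys.Date(), "%Y-%m-%d"), ".rds")
  )
  if (file.exists(path)) {
    return(readRDS(path))
  }
  dat <- fetch()
  saveRDS(dat, path)
  dat
}
```

Because the file name carries the date, a cache written yesterday no longer matches today's path and is silently superseded, which is the same freshness rule `spod_available_data_s3()` applies with `latest_date < Sys.Date()`.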


spod_available_data_s3_function <- function(
ver = c(1, 2)
) {
ver <- as.character(ver)
ver <- match.arg(ver)

bucket <- paste0("mitma-movilidad-v", ver)

# original_aws_region <- Sys.getenv("AWS_DEFAULT_REGION")
# original_aws_url_style <- Sys.getenv("AWS_S3_URL_STYLE")
# on.exit({
# Sys.setenv(
# AWS_DEFAULT_REGION = original_aws_region,
# AWS_S3_URL_STYLE = original_aws_url_style
# )
# })
# Sys.setenv(
# AWS_DEFAULT_REGION = "eu-west-1",
# AWS_S3_URL_STYLE = "virtual"
# )

if (ver == 1) {
url_prefix <- "https://opendata-movilidad.mitma.es/"
} else {
url_prefix <- "https://movilidad-opendata.mitma.es/"
}

s3 <- paws.storage::s3(
config = list(
credentials = list(
anonymous = TRUE
)
)
)

all_objects <- list_objects_v2_all(s3, bucket)

# all_objects <- aws.s3::get_bucket_df(
# bucket = bucket,
# prefix = "", # root of bucket
# max = Inf # fetch beyond the default 1000
# )

all_objects <- all_objects |>
dplyr::as_tibble() |>
dplyr::mutate(
target_url = paste0(url_prefix, .data$Key),
pub_ts = as.POSIXct(
.data$LastModified,
format = "%Y-%m-%dT%H:%M:%OSZ",
tz = "UTC"
),
file_size_bytes = as.numeric(.data$Size),
etag = gsub('\\"', '', .data$ETag)
) |>
dplyr::select(
.data$target_url,
Collaborator:

Why not

Suggested change:
-      .data$target_url,
+      target_url,

out of interest?

Collaborator:

This use case seems different: in our case target_url is the hardcoded column name, not a text string, right?

for (var in names(mtcars)) {
  mtcars %>% count(.data[[var]]) %>% print()
}

Member Author (e-kotov, Jun 13, 2025):

I recall R CMD check complaining about undefined variables in functions and pointing to those tidy-eval column names, unless they are prefixed with .data from rlang.

@Robinlovelace, for example, I forgot to fix the function below:

spod_store_etags <- function() {
  available_data <- spod_available_data(1, check_local_files = TRUE)
  available_data <- available_data |>
    dplyr::filter(downloaded == TRUE)
  local_etags <- available_data$local_path |>
    purrr::map_chr(~ spod_compute_s3_etag(.x), .progress = TRUE)
  available_data <- available_data |>
    dplyr::mutate(local_etag = local_etags) |>
    dplyr::as_tibble()
  return(available_data)
}

I get:

❯ checking R code for possible problems ... NOTE
  spod_store_etags: no visible binding for global variable ‘downloaded’
  Undefined global functions or variables:
    downloaded

Then if I prefix downloaded with bangs (is that what they're called?)/exclamation marks:

dplyr::filter(!!downloaded == TRUE)

Same NOTE:

❯ checking R code for possible problems ... NOTE
  spod_store_etags: no visible binding for global variable ‘downloaded’
  Undefined global functions or variables:
    downloaded

Then replacing the problematic line with:

dplyr::filter(.data$downloaded == TRUE)

No notes anymore 🤷‍♂️

Collaborator:

Yeah, that's a fair reason for using the .data syntax.

I think messages like this

  spod_store_etags: no visible binding for global variable ‘downloaded’

can be resolved with utils::globalVariables("downloaded") somewhere in the package code, as outlined here https://forum.posit.co/t/how-to-solve-no-visible-binding-for-global-variable-note/28887/2 but in the very next post the .data syntax is recommended. Was just trying to understand the reasoning.
.data$pub_ts,
.data$file_size_bytes,
.data$etag
)

return(all_objects)
}
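The review thread above turns on how the `.data` pronoun silences the R CMD check NOTE. A minimal, self-contained illustration, assuming `dplyr` is installed; the data frame is made up:

```r
# With the .data pronoun (re-exported by dplyr from rlang), column names
# are resolved inside the data frame, so R CMD check has no undefined
# global variable to complain about.
files <- data.frame(
  local_path = c("a.csv.gz", "b.csv.gz"),
  downloaded = c(TRUE, FALSE)
)
done <- dplyr::filter(files, .data$downloaded == TRUE)
```

Writing `dplyr::filter(files, downloaded == TRUE)` behaves identically at runtime but triggers the "no visible binding for global variable" NOTE discussed in the thread.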

spod_available_data_s3_memoised <- memoise::memoise(
spod_available_data_s3_function
)

list_objects_v2_all <- function(s3, bucket, prefix = "", max_keys = 10000) {
pages <- paws.storage::paginate(
s3$list_objects_v2(
Bucket = bucket,
Prefix = prefix,
MaxKeys = max_keys
),
PageSize = max_keys
)

all_objects <- unlist(
lapply(pages, `[[`, "Contents"),
recursive = FALSE
)

metadata <- dplyr::tibble(
Key = vapply(all_objects, `[[`, character(1), "Key"),
LastModified = as.POSIXct(
vapply(all_objects, `[[`, numeric(1), "LastModified"),
origin = "1970-01-01",
tz = "UTC"
),
Size = vapply(all_objects, `[[`, numeric(1), "Size"),
ETag = vapply(all_objects, `[[`, character(1), "ETag")
)

# generate presigned S3 download URLs (commented out, kept for reference)
# urls <- metadata$Key |>
# purrr::map(
# ~ s3$generate_presigned_url(
# client_method = "get_object",
# params = list(Bucket = "mitma-movilidad-v1", Key = .x)
# ),
# .progress = TRUE
# )

return(metadata)
}
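The ETags returned by `list_objects_v2_all()` can be compared with local files: for objects uploaded in a single part, the S3 ETag is simply the hex MD5 of the file contents, while multipart uploads produce ETags of the form `<md5>-<n>` computed over the per-part hashes. A sketch of the single-part case with a hypothetical helper; the package's own `spod_compute_s3_etag()` is referenced elsewhere but not shown here:

```r
# Sketch: verify local files against S3 ETags. Only single-part ETags
# (a plain 32-character MD5) are checked; multipart ETags ("<md5>-<n>")
# are skipped and reported as NA.
check_etags <- function(paths, etags) {
  multipart <- grepl("-", etags, fixed = TRUE)
  local_md5 <- unname(tools::md5sum(paths))
  ifelse(multipart, NA, local_md5 == etags)
}
```

Reporting `NA` for multipart ETags keeps the result honest: those files are neither confirmed nor rejected by a plain MD5 comparison.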