-
Notifications
You must be signed in to change notification settings - Fork 5
S3 metadata and file size checks #165
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6ea2b56
f302f24
da696ac
da79487
e40d6bc
99cee6d
c9fe3fe
14bac0e
9afcd72
dce2147
6461eeb
11e1bc8
bdccdf9
6fb127c
933ae6c
fdb506d
436f83d
66b614b
9ce412d
0f0d5dd
61ce5c5
3f65500
4d4b126
74fd2fc
3aa7bda
4fb8894
748e03d
4b3e6e5
7a7b3d1
0f82114
3e68b93
ee622d7
6072d65
1d4b31a
ab3c8c6
146682b
8d55176
ef15db4
b6c778c
9dd8ae4
f135a7d
237f4a0
a7f079e
7b9f61f
be64225
7f5f87c
77cb0f3
f1fd5b1
de0cfe2
c47ad3a
44b24d2
dfdc1ca
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,187 @@ | ||||||
| #' Get available data list from Amazon S3 storage | ||||||
| #' | ||||||
| #' @description | ||||||
| #' | ||||||
| #' Get a table with links to available data files for the specified data version from Amazon S3 storage. | ||||||
| #' | ||||||
| #' @inheritParams spod_available_data | ||||||
| #' @inheritParams global_quiet_param | ||||||
| #' @return A tibble with links, release dates of files in the data, dates of data coverage, local paths to files, and the download status. | ||||||
| #' | ||||||
| #' @keywords internal | ||||||
| spod_available_data_s3 <- function( | ||||||
| ver = c(1, 2), | ||||||
| force = FALSE, | ||||||
| quiet = FALSE, | ||||||
| data_dir = spod_get_data_dir() | ||||||
| ) { | ||||||
| ver <- as.character(ver) | ||||||
| ver <- match.arg(ver) | ||||||
| metadata_folder <- glue::glue("{data_dir}/{spod_subfolder_metadata_cache()}") | ||||||
|
|
||||||
| # if forcing, evict the in-session cache now | ||||||
| if (isTRUE(force)) { | ||||||
| memoise::forget(spod_available_data_s3_memoised) | ||||||
| } | ||||||
|
|
||||||
| # shortcut: if we already have it memoised, return immediately | ||||||
| if (!force && memoise::has_cache(spod_available_data_s3_memoised)(ver)) { | ||||||
| if (!quiet) message("Using memory-cached available data from S3") | ||||||
| return(spod_available_data_s3_memoised(ver)) | ||||||
| } | ||||||
|
|
||||||
| # no in-session data, check your on-disk RDS pool | ||||||
| pattern <- glue::glue("metadata_s3_v{ver}_\\d{{4}}-\\d{{2}}-\\d{{2}}\\.rds$") | ||||||
| rds_files <- fs::dir_ls( | ||||||
| path = metadata_folder, | ||||||
| type = "file", | ||||||
| regexp = pattern | ||||||
| ) |> | ||||||
| sort() | ||||||
|
|
||||||
| latest_file <- utils::tail(rds_files, 1) | ||||||
| latest_date <- if (length(latest_file) == 1) { | ||||||
| stringr::str_extract(basename(latest_file), "\\d{4}-\\d{2}-\\d{2}") |> | ||||||
| as.Date() | ||||||
| } else { | ||||||
| NA | ||||||
| } | ||||||
|
|
||||||
| needs_update <- isTRUE(force) || | ||||||
| length(rds_files) == 0 || | ||||||
| (!is.na(latest_date) && latest_date < Sys.Date()) | ||||||
|
|
||||||
| if (!needs_update) { | ||||||
| if (!quiet) message("Using existing disk cache: ", latest_file) | ||||||
| return(readRDS(latest_file)) | ||||||
| } | ||||||
|
|
||||||
| # if forcing, also clear old disk files | ||||||
| if (isTRUE(force) && length(rds_files) > 0) { | ||||||
| fs::file_delete(rds_files) | ||||||
| } | ||||||
|
|
||||||
| # fetch via the memoised function (this will re-hit S3 if we forgot it) | ||||||
| if (!quiet) message("Fetching latest metadata from AmazonS3 (v", ver, ")...") | ||||||
| dat <- spod_available_data_s3_memoised(ver) | ||||||
|
|
||||||
| # write a new RDS stamped with today's date | ||||||
| file_date <- format(Sys.Date(), "%Y-%m-%d") | ||||||
| out_path <- file.path( | ||||||
| metadata_folder, | ||||||
| glue::glue("metadata_s3_v{ver}_{file_date}.rds") | ||||||
| ) | ||||||
| saveRDS(dat, out_path) | ||||||
| if (!quiet) message("Cached new data to: ", out_path) | ||||||
|
|
||||||
| dat | ||||||
| } | ||||||
|
|
||||||
|
|
||||||
| spod_available_data_s3_function <- function( | ||||||
| ver = c(1, 2) | ||||||
| ) { | ||||||
| ver <- as.character(ver) | ||||||
| ver <- match.arg(ver) | ||||||
|
|
||||||
| bucket <- paste0("mitma-movilidad-v", ver) | ||||||
|
|
||||||
| # original_aws_region <- Sys.getenv("AWS_DEFAULT_REGION") | ||||||
| # original_aws_url_style <- Sys.getenv("AWS_S3_URL_STYLE") | ||||||
| # on.exit({ | ||||||
| # Sys.setenv( | ||||||
| # AWS_DEFAULT_REGION = original_aws_region, | ||||||
| # AWS_S3_URL_STYLE = original_aws_url_style | ||||||
| # ) | ||||||
| # }) | ||||||
| # Sys.setenv( | ||||||
| # AWS_DEFAULT_REGION = "eu-west-1", | ||||||
| # AWS_S3_URL_STYLE = "virtual" | ||||||
| # ) | ||||||
|
|
||||||
| if (ver == 1) { | ||||||
| url_prefix <- "https://opendata-movilidad.mitma.es/" | ||||||
| } else { | ||||||
| url_prefix <- "https://movilidad-opendata.mitma.es/" | ||||||
| } | ||||||
|
|
||||||
| s3 <- paws.storage::s3( | ||||||
| config = list( | ||||||
| credentials = list( | ||||||
| anonymous = TRUE | ||||||
| ) | ||||||
| ) | ||||||
| ) | ||||||
|
|
||||||
| all_objects <- list_objects_v2_all(s3, bucket) | ||||||
|
|
||||||
| # all_objects <- aws.s3::get_bucket_df( | ||||||
| # bucket = bucket, | ||||||
| # prefix = "", # root of bucket | ||||||
| # max = Inf # fetch beyond the default 1000 | ||||||
| # ) | ||||||
|
|
||||||
| all_objects <- all_objects |> | ||||||
| dplyr::as_tibble() |> | ||||||
| dplyr::mutate( | ||||||
| target_url = paste0(url_prefix, .data$Key), | ||||||
| pub_ts = as.POSIXct( | ||||||
| .data$LastModified, | ||||||
| format = "%Y-%m-%dT%H:%M:%OSZ", | ||||||
| tz = "UTC" | ||||||
| ), | ||||||
| file_size_bytes = as.numeric(.data$Size), | ||||||
| etag = gsub('\\"', '', .data$ETag) | ||||||
| ) |> | ||||||
| dplyr::select( | ||||||
| .data$target_url, | ||||||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not
Suggested change
out of interest?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I recall R CMD check complaining about undefined variables in functions and pointing to those tidy-eval column names, unless they are prepended by .data from rlang.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for example, I forgot to fix the function below spod_store_etags <- function() {
available_data <- spod_available_data(1, check_local_files = TRUE)
available_data <- available_data |>
dplyr::filter(downloaded == TRUE)
local_etags <- available_data$local_path |>
purrr::map_chr(~ spod_compute_s3_etag(.x), .progress = TRUE)
available_data <- available_data |>
dplyr::mutate(local_etag = local_etags) |>
dplyr::as_tibble()
return(available_data)
}I get: Then if I prepend dplyr::filter(!!downloaded == TRUE)Same NOTE: Then replacing the problematic line with: dplyr::filter(.data$downloaded == TRUE)No notes anymore 🤷♂️
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, that's a fair reason for using the I think messages like this Can be resolved with |
||||||
| .data$pub_ts, | ||||||
| .data$file_size_bytes, | ||||||
| .data$etag | ||||||
| ) | ||||||
|
|
||||||
| return(all_objects) | ||||||
| } | ||||||
|
|
||||||
| spod_available_data_s3_memoised <- memoise::memoise( | ||||||
| spod_available_data_s3_function | ||||||
| ) | ||||||
|
|
||||||
| list_objects_v2_all <- function(s3, bucket, prefix = "", max_keys = 10000) { | ||||||
| pages <- paws.storage::paginate( | ||||||
e-kotov marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
| s3$list_objects_v2( | ||||||
| Bucket = bucket, | ||||||
| Prefix = prefix, | ||||||
| MaxKeys = max_keys | ||||||
| ), | ||||||
| PageSize = max_keys | ||||||
| ) | ||||||
|
|
||||||
| all_objects <- unlist( | ||||||
| lapply(pages, `[[`, "Contents"), | ||||||
| recursive = FALSE | ||||||
| ) | ||||||
|
|
||||||
| metadata <- dplyr::tibble( | ||||||
| Key = vapply(all_objects, `[[`, character(1), "Key"), | ||||||
| LastModified = as.POSIXct( | ||||||
| vapply(all_objects, `[[`, numeric(1), "LastModified"), | ||||||
| origin = "1970-01-01", | ||||||
| tz = "UTC" | ||||||
| ), | ||||||
| Size = vapply(all_objects, `[[`, numeric(1), "Size"), | ||||||
| ETag = vapply(all_objects, `[[`, character(1), "ETag") | ||||||
| ) | ||||||
|
|
||||||
| # S3 generate download urls | ||||||
| # urls <- metadata$Key |> | ||||||
| # purrr::map( | ||||||
| # ~ s3$generate_presigned_url( | ||||||
| # client_method = "get_object", | ||||||
| # params = list(Bucket = "mitma-movilidad-v1", Key = .x) | ||||||
| # ), | ||||||
| # .progress = TRUE | ||||||
| # ) | ||||||
|
|
||||||
| return(metadata) | ||||||
| } | ||||||
Uh oh!
There was an error while loading. Please reload this page.