|
| 1 | +#' Download and cache NHANES data locally |
| 2 | +#' |
| 3 | +#' Downloads all tables within the specified NHANES data groups for one or |
| 4 | +#' more survey years, merges them into a single data frame per year, and |
| 5 | +#' saves it as an RDS file. The resulting cache is used automatically by |
| 6 | +#' [sample_covariates_nhanes()]. |
| 7 | +#' |
| 8 | +#' @param groups character vector of NHANES data groups to download. Valid |
| 9 | +#' values: `"DEMO"` (Demographics), `"LAB"` (Laboratory), `"EXAM"` |
| 10 | +#' (Examination), `"Q"` (Questionnaire), `"DIET"` (Dietary). |
| 11 | +#' Defaults to `c("DEMO", "LAB", "EXAM")`. |
| 12 | +#' @param years character vector of NHANES survey cycles, e.g. |
| 13 | +#' `c("2015-2016", "2017-2018")`. Defaults to `"2017-2018"`. |
| 14 | +#' @param path directory where merged RDS files will be saved. Created |
| 15 | +#' automatically if it does not exist. Defaults to the package-level cache |
| 16 | +#' directory returned by `nhanes_default_cache_dir()`. |
| 17 | +#' @param overwrite logical. If `FALSE` (default), a year that already has a |
| 18 | +#' merged RDS in `path` is skipped. Set to `TRUE` to re-download and |
| 19 | +#' overwrite. |
| 20 | +#' @param ... additional arguments (currently unused) |
| 21 | +#' |
| 22 | +#' @details |
| 23 | +#' Each survey year is saved as a single file `nhanes_<year>.rds` |
| 24 | +#' (e.g. `nhanes_2017-2018.rds`) containing all variables from all downloaded |
| 25 | +#' tables, merged on the SEQN respondent sequence number. Tables with |
| 26 | +#' multiple rows per subject (e.g. dietary recall) are skipped automatically. |
| 27 | +#' |
| 28 | +#' Requires the `nhanesA` package. |
| 29 | +#' |
| 30 | +#' @returns the `path` directory, invisibly. |
| 31 | +#' |
| 32 | +#' @export |
| 33 | +download_nhanes_cache <- function( |
| 34 | + groups = c("DEMO", "LAB", "EXAM"), |
| 35 | + years = "2017-2018", |
| 36 | + path = nhanes_default_cache_dir(), |
| 37 | + overwrite = FALSE, |
| 38 | + ... |
| 39 | +) { |
| 40 | + if (!requireNamespace("nhanesA", quietly = TRUE)) { |
| 41 | + stop( |
| 42 | + "Package 'nhanesA' is required to download NHANES data. ", |
| 43 | + "Install it with: install.packages('nhanesA')", |
| 44 | + call. = FALSE |
| 45 | + ) |
| 46 | + } |
| 47 | + |
| 48 | + if (!dir.exists(path)) { |
| 49 | + dir.create(path, recursive = TRUE) |
| 50 | + } |
| 51 | + |
| 52 | + for (year in years) { |
| 53 | + nhanes_year_suffix(year) # validates year; errors on unsupported values |
| 54 | + out_file <- file.path(path, paste0("nhanes_", year, ".rds")) |
| 55 | + |
| 56 | + if (file.exists(out_file) && !overwrite) { |
| 57 | + message("Skipping ", year, " (cache exists; use overwrite = TRUE to re-download)") |
| 58 | + next |
| 59 | + } |
| 60 | + |
| 61 | + year_end <- as.integer(sub(".*-", "", year)) |
| 62 | + table_list <- list() |
| 63 | + |
| 64 | + for (group in groups) { |
| 65 | + message("Fetching table list for group ", group, ", year ", year, " ...") |
| 66 | + tbl_info <- nhanesA::nhanesTables(group, year_end) |
| 67 | + # nhanesTables() returns a data.frame; table names are in Data.File.Name |
| 68 | + tbl_names <- tbl_info[["Data.File.Name"]] |
| 69 | + |
| 70 | + for (tbl_name in tbl_names) { |
| 71 | + message(" Downloading ", tbl_name, " ...") |
| 72 | + tbl_data <- tryCatch( |
| 73 | + nhanesA::nhanes(tbl_name), |
| 74 | + error = function(e) { |
| 75 | + message(" Skipping ", tbl_name, ": ", conditionMessage(e)) |
| 76 | + NULL |
| 77 | + } |
| 78 | + ) |
| 79 | + if (is.null(tbl_data) || !"SEQN" %in% names(tbl_data)) next |
| 80 | + if (anyDuplicated(tbl_data[["SEQN"]]) > 0) { |
| 81 | + message(" Skipping ", tbl_name, " (multiple rows per subject)") |
| 82 | + next |
| 83 | + } |
| 84 | + table_list[[tbl_name]] <- tbl_data |
| 85 | + } |
| 86 | + } |
| 87 | + |
| 88 | + if (length(table_list) == 0) { |
| 89 | + warning("No tables downloaded for year ", year, "; skipping.") |
| 90 | + next |
| 91 | + } |
| 92 | + |
| 93 | + message("Merging ", length(table_list), " tables for ", year, " ...") |
| 94 | + merged <- Reduce( |
| 95 | + function(a, b) dplyr::full_join(a, b, by = "SEQN"), |
| 96 | + table_list |
| 97 | + ) |
| 98 | + |
| 99 | + saveRDS(merged, out_file) |
| 100 | + message("Saved merged NHANES data (", nrow(merged), " subjects, ", |
| 101 | + ncol(merged), " variables) to ", out_file) |
| 102 | + } |
| 103 | + |
| 104 | + invisible(path) |
| 105 | +} |
0 commit comments