diff --git a/NEWS.md b/NEWS.md index 42a231a4..bc68458a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -51,6 +51,13 @@ data. The `wpp2017` package is now a suggested dependency rather than a required import (#258). +* Reduced cyclomatic complexity of `try_merge_additional_files()` by extracting + helper functions (#289). + +* Fixed unmatched-merge warning count when merging files with duplicate keys; + previously, the count could be wrong (or negative) due to counting join pairs + rather than distinct matched rows (#289). + * `get_survey()`, `download_survey()`, `list_surveys()`, `get_citation()`, and `survey_countries()` now warn unconditionally when called. These functions were soft-deprecated in 0.5.0 and users should switch to the diff --git a/R/globals.R b/R/globals.R index b58a03d0..ca9aa85e 100644 --- a/R/globals.R +++ b/R/globals.R @@ -88,7 +88,7 @@ utils::globalVariables(c( "title", # <.list_surveys> "creator", # <.list_surveys> "identifier.2", # <.list_surveys> - "..merge_id", # + "..merge_id", # "..main_id", # "..original.lower.age.limit", # "..segment", # diff --git a/R/load-survey-utils.R b/R/load-survey-utils.R index fcfd5521..0baa7787 100644 --- a/R/load-survey-utils.R +++ b/R/load-survey-utils.R @@ -112,6 +112,244 @@ join_compatible_files <- function(survey_files, contact_data) { ) } +#' Identify which survey files share columns with a main table +#' @noRd +get_mergeable_files <- function(survey_files, contact_data, main_cols) { + can_merge <- vapply( + survey_files, + function(x) { + any(colnames(contact_data[[x]]) %in% main_cols) + }, + logical(1) + ) + names(can_merge[can_merge]) +} + +#' Resolve the unique key for a merged data.table with duplicates +#' +#' Validates a user-provided participant_key or auto-detects one via +#' find_unique_key(). +#' @noRd +resolve_longitudinal_key <- function( + merged, + participant_key = NULL, + call = rlang::caller_env() +) { + if (!is.null(participant_key)) { + missing_cols <- setdiff(participant_key, names(merged)) + if ( + length(missing_cols) == 0 && + anyDuplicated(merged, by = participant_key) == 0L + ) { + return(participant_key) + } + if (length(missing_cols) > 0) { + cli::cli_warn( + "Provided {.arg participant_key} contains column{?s} not found in \\ + merged data: {.val {missing_cols}}; auto-detecting a key instead.", + call = call + ) + } else { + cli::cli_warn( + "Provided {.arg participant_key} {.val {participant_key}} did not \\ + uniquely identify rows; auto-detecting a key instead.", + call = call + ) + } + } + find_unique_key(merged, "part_id") +} + +#' Try merging a single additional file into a main survey table +#' +#' @return A list with components: merged (data.table or NULL) and detected_key +#' (character vector or NULL). +#' @autoglobal +#' @noRd +try_merge_one_file <- function( + file, + type, + main_survey, + contact_data, + participant_key = NULL, + call = rlang::caller_env() +) { + null_result <- list(merged = NULL, detected_key = NULL) + + contact_data[[file]] <- contact_data[[file]][, + ("..merge_id") := seq_len(.N) + ] + common_id <- intersect( + colnames(contact_data[[file]]), + colnames(main_survey) + ) + merged <- tryCatch( + { + merge( + main_survey, + contact_data[[file]], + by = common_id, + all.x = TRUE + ) + }, + error = function(cond) { + if (!grepl("cartesian", cond$message, fixed = TRUE)) { + cli::cli_abort( + "Merge failed for {.file {basename(file)}} on \\ + {.val {common_id}}: {cond$message}", + call = call + ) + } + NULL + } + ) + + if (is.null(merged)) { + return(null_result) + } + + has_duplicates <- anyDuplicated(merged[, "..main_id", with = FALSE]) > 0 + detected_key <- NULL + + if (has_duplicates) { + if (type == "contact") { + return(null_result) + } + detected_key <- resolve_longitudinal_key(merged, participant_key, call) + if (is.null(detected_key)) { + return(null_result) + } + merged[, ("..main_id") := seq_len(.N)] + } + + warn_merge_quality(merged, contact_data[[file]], common_id, file, type, call) + merged[, ("..merge_id") := NULL] + + list(merged = merged, detected_key = detected_key) +} + +#' Warn about unmatched rows after a merge +#' @noRd +warn_merge_quality <- function( + merged, + file_data, + common_id, + file, + type, + call +) { + matched_main <- sum(!is.na(merged[["..merge_id"]])) + unmatched_main <- nrow(merged) - matched_main + if (unmatched_main > 0) { + cli::cli_warn( + "Only {matched_main} matching value{?s} in {.val {common_id}} \\ + column{?s} when pulling {.file {basename(file)}} into \\ + {.val {type}} survey.", + call = call + ) + } + matched_merge <- uniqueN(merged[["..merge_id"]], na.rm = TRUE) + unmatched_merge <- nrow(file_data) - matched_merge + if (unmatched_merge > 0) { + cli::cli_warn( + "{unmatched_merge} row{?s} could not be matched when pulling \\ + {.file {basename(file)}} into {.val {type}} survey.", + call = call + ) + } +} + +#' Try merging all compatible files into a single main survey table +#' +#' Iteratively merges files that share columns with the main table, repeating +#' until no further merges are possible. +#' @return A list with merged (the updated main survey), detected_key, and +#' remaining survey_files. +#' @noRd +merge_all_files <- function( + type, + main_survey, + survey_files, + contact_data, + participant_key = NULL, + call = rlang::caller_env() +) { + detected_key <- NULL + merge_files <- get_mergeable_files( + survey_files, + contact_data, + colnames(main_survey) + ) + + while (length(merge_files) > 0) { + merged_files <- character(0) + for (file in merge_files) { + result <- try_merge_one_file( + file, + type, + main_survey, + contact_data, + participant_key = participant_key, + call = call + ) + if (!is.null(result$merged)) { + main_survey <- result$merged + merged_files <- c(merged_files, file) + } + if (!is.null(result$detected_key)) { + detected_key <- result$detected_key + } + } + survey_files <- setdiff(survey_files, merged_files) + if (length(merged_files) == 0) { + break + } + merge_files <- get_mergeable_files( + survey_files, + contact_data, + colnames(main_survey) + ) + } + + list( + merged = main_survey, + detected_key = detected_key, + survey_files = survey_files + ) +} + +#' Inform user about detected longitudinal data +#' @noRd +inform_longitudinal_key <- function( + detected_key, + participant_key = NULL, + call = rlang::caller_env() +) { + if (is.null(detected_key)) { + return(invisible(NULL)) + } + user_key_matches <- !is.null(participant_key) && + setequal(detected_key, participant_key) + if (user_key_matches) { + return(invisible(NULL)) + } + + key_code <- paste0( + "c(", + paste0('"', detected_key, '"', collapse = ", "), + ")" + ) + cli::cli_inform( + c( + "Detected longitudinal data with unique key: {.val {detected_key}}.", + "*" = "Will treat individuals with the same {.val part_id} as unique.", + i = "To suppress this message, use: \\ + {.code load_survey(..., participant_key = {key_code})}" + ), + call = call + ) +} + ## lastly, merge in any additional files that can be merged #' @autoglobal try_merge_additional_files <- function( @@ -122,178 +360,35 @@ try_merge_additional_files <- function( participant_key = NULL, call = rlang::caller_env() ) { - # Track the observation key for participants (returned to caller) observation_key <- NULL for (type in main_types) { - # Track final detected key for this type (to show one message at end) - final_detected_key <- NULL - - main_cols <- colnames(main_surveys[[type]]) - can_merge <- vapply( + result <- merge_all_files( + type, + main_surveys[[type]], survey_files, - function(x) { - any(colnames(contact_data[[x]]) %in% main_cols) - }, - TRUE + contact_data, + participant_key = participant_key, + call = call ) - merge_files <- survey_files[can_merge] - while (length(merge_files) > 0) { - merged_files <- NULL - for (file in merge_files) { - contact_data[[file]] <- contact_data[[file]][, - ..merge_id := seq_len(.N) - ] - common_id <- intersect( - colnames(contact_data[[file]]), - colnames(main_surveys[[type]]) - ) - merged <- tryCatch( - { - merge( - main_surveys[[type]], - contact_data[[file]], - by = common_id, - all.x = TRUE - ) - }, - error = function(cond) { - if (!grepl("cartesian", cond$message, fixed = TRUE)) { - cli::cli_abort( - "Merge failed for {.file {basename(file)}} on \\ - {.val {common_id}}: {cond$message}", - call = call - ) - } - NULL - } - ) + main_surveys[[type]] <- result$merged[, ("..main_id") := NULL] + survey_files <- result$survey_files - if (is.null(merged)) { - next - } + inform_longitudinal_key(result$detected_key, participant_key, call) - # Check if merge created duplicates (longitudinal data case) - has_duplicates <- anyDuplicated(merged[, "..main_id", with = FALSE]) > 0 - - # If duplicates exist, check if there's a valid unique key - # (this handles longitudinal surveys where sday files create multiple - # rows per participant) - accept_merge <- !has_duplicates - if (has_duplicates && type == "contact") { - # No methodology for longitudinal contacts - reject merge with - # duplicates - next - } - if (has_duplicates) { - # Only participants reach here (contacts exit early above) - # Use user-specified key if valid, else auto-detect - if (!is.null(participant_key)) { - # Check if all key columns exist in merged data - missing_cols <- setdiff(participant_key, names(merged)) - if ( - length(missing_cols) == 0 && - anyDuplicated(merged, by = participant_key) == 0L - ) { - # User's key works - unique_key <- participant_key - } else { - # Key doesn't work or columns missing - auto-detect - unique_key <- find_unique_key(merged, "part_id") - } - } else { - unique_key <- find_unique_key(merged, "part_id") - } - - if (!is.null(unique_key)) { - accept_merge <- TRUE - # Update ..main_id to reflect the new unique key - merged[, ("..main_id") := seq_len(.N)] - final_detected_key <- unique_key - } - } - - if (accept_merge) { - ## we're keeping the merge; now check for any warnings to issue - matched_main <- sum(!is.na(merged[["..merge_id"]])) - unmatched_main <- nrow(merged) - matched_main - if (unmatched_main > 0) { - cli::cli_warn( - "Only {matched_main} matching value{?s} in {.val {common_id}} \\ - column{?s} when pulling {.file {basename(file)}} into \\ - {.val {type}} survey.", - call = call - ) - } - unmatched_merge <- nrow(contact_data[[file]]) - matched_main - if (unmatched_merge > 0) { - cli::cli_warn( - "{unmatched_merge} row{?s} could not be matched when pulling \\ - {.file {basename(file)}} into {.val {type}} survey.", - call = call - ) - } - merged[, ("..merge_id") := NULL] - main_surveys[[type]] <- merged - merged_files <- c(merged_files, file) - } - } - survey_files <- setdiff(survey_files, merged_files) - main_cols <- colnames(main_surveys[[type]]) - can_merge <- vapply( - survey_files, - function(x) { - any(colnames(contact_data[[x]]) %in% main_cols) - }, - TRUE - ) - if (is.null(merged_files)) { - merge_files <- NULL - } else { - merge_files <- names(can_merge[can_merge]) - } - } - - # Show one message about detected longitudinal data (if not suppressed) - # Show if: we detected a key AND user's key doesn't match - user_key_matches <- !is.null(participant_key) && - setequal(final_detected_key, participant_key) - if (!is.null(final_detected_key) && !user_key_matches) { - key_code <- paste0( - "c(", - paste0("\"", final_detected_key, "\"", collapse = ", "), - ")" - ) - cli::cli_inform( - c( - "Detected longitudinal data with unique key: {.val {final_detected_key}}.", - "*" = "Will treat individuals with the same {.val part_id} as unique.", - i = "To suppress this message, use: \\ - {.code load_survey(..., participant_key = {key_code})}" - ), - call = call - ) - } - - # Store the observation key for participants (excluding part_id since - # that's always the participant identifier after internal renaming) - if (type == "participant" && !is.null(final_detected_key)) { - obs_cols <- setdiff(final_detected_key, "part_id") + if (type == "participant" && !is.null(result$detected_key)) { + obs_cols <- setdiff(result$detected_key, "part_id") if (length(obs_cols) > 0) { observation_key <- obs_cols } } - - main_surveys[[type]] <- main_surveys[[type]][, ..main_id := NULL] } - if (length(survey_files) > 0) { - for (file in survey_files) { - cli::cli_warn( - message = "Could not merge {.file {file}}.", - call = call - ) - } + for (file in survey_files) { + cli::cli_warn( + message = "Could not merge {.file {basename(file)}}.", + call = call + ) } list(