|
| 1 | +#' Archer-Lemeshow Goodness-of-Fit Test for Survey Models |
| 2 | +#' |
| 3 | +#' @description |
| 4 | +#' Performs an Archer-Lemeshow goodness-of-fit (GOF) test for logistic |
| 5 | +#' regression models fitted with complex survey data. This test is an extension |
| 6 | +#' of the Hosmer-Lemeshow test for survey designs. |
| 7 | +#' |
| 8 | +#' @details |
| 9 | +#' The function automates the process of calculating residuals and fitted values, |
| 10 | +#' creating groups (deciles by default) based on fitted probabilities, |
| 11 | +#' building a new survey design with these variables, and running a final |
| 12 | +#' Wald test. A non-significant p-value (e.g., p > 0.05) suggests no evidence |
| 13 | +#' of a poor fit. |
| 14 | +#' |
| 15 | +#' @param fit A fitted model object of class `svyglm`. |
| 16 | +#' @param design A survey design object of class `svydesign` or `svyrep.design` |
| 17 | +#' that was used to fit the model. |
| 18 | +#' @param G An integer specifying the number of groups to create based on |
| 19 | +#' fitted probabilities. Defaults to 10 (deciles). |
| 20 | +#' |
| 21 | +#' @return |
| 22 | +#' A `data.frame` containing the F-statistic, the numerator (df1) and |
| 23 | +#' denominator (df2) degrees of freedom, and the p-value for the test. |
| 24 | +#' |
| 25 | +#' @source |
| 26 | +#' The implementation is a formalized function based on the script and discussion |
| 27 | +#' in the R-help mailing list archives: \url{https://stat.ethz.ch/pipermail/r-help/2016-November/443223.html} |
| 28 | +#' |
| 29 | +#' @importFrom survey svydesign svyglm regTermTest |
| 30 | +#' @importFrom stats residuals fitted quantile |
| 31 | +#' |
| 32 | +#' @export |
| 33 | +#' |
| 34 | +#' @examples |
| 35 | +#' \dontrun{ |
| 36 | +#' # Ensure required packages are loaded |
| 37 | +#' if (requireNamespace("survey", quietly = TRUE) && |
| 38 | +#' requireNamespace("NHANES", quietly = TRUE) && |
| 39 | +#' requireNamespace("dplyr", quietly = TRUE)) { |
| 40 | +#' |
| 41 | +#' # 1. Prepare Data |
| 42 | +#' data(NHANESraw, package = "NHANES") |
| 43 | +#' nhanes_data <- NHANESraw %>% |
| 44 | +#' dplyr::filter(Age >= 20) %>% |
| 45 | +#' dplyr::mutate(ObeseStatus = factor(ifelse(BMI >= 30, "Obese", "Not Obese"), |
| 46 | +#' levels = c("Not Obese", "Obese"))) %>% |
| 47 | +#' dplyr::filter(complete.cases(ObeseStatus, Age, Gender, Race1, |
| 48 | +#' WTMEC2YR, SDMVPSU, SDMVSTRA)) |
| 49 | +#' |
| 50 | +#' # 2. Create a replicate design object |
| 51 | +#' std_design <- survey::svydesign( |
| 52 | +#' ids = ~SDMVPSU, |
| 53 | +#' strata = ~SDMVSTRA, |
| 54 | +#' weights = ~WTMEC2YR, |
| 55 | +#' nest = TRUE, |
| 56 | +#' data = nhanes_data |
| 57 | +#' ) |
| 58 | +#' rep_design <- survey::as.svrepdesign(std_design) |
| 59 | +#' |
| 60 | +#' # 3. Fit a survey logistic regression model using the replicate design |
| 61 | +#' fit_obesity_rep <- survey::svyglm( |
| 62 | +#' ObeseStatus ~ Age + Gender + Race1, |
| 63 | +#' design = rep_design, |
| 64 | +#' family = quasibinomial() |
| 65 | +#' ) |
| 66 | +#' |
| 67 | +#' # 4. Calculate the design-correct AUC |
| 68 | +#' auc_results <- svyAUC(fit_obesity_rep, rep_design) |
| 69 | +#' print(auc_results) |
| 70 | +#' } |
| 71 | +#' } |
| 72 | +svygof <- function(fit, design, G = 10) { |
| 73 | + |
| 74 | + # Get residuals and fitted values from the model |
| 75 | + resids <- stats::residuals(fit, type = "response") |
| 76 | + fitted_vals <- stats::fitted(fit) |
| 77 | + |
| 78 | + # Create a data frame of model results, using row names to link back |
| 79 | + model_data <- data.frame( |
| 80 | + .id = names(resids), |
| 81 | + r = resids, |
| 82 | + f = fitted_vals |
| 83 | + ) |
| 84 | + |
| 85 | + # Use the data directly from the design object, which is the most reliable source |
| 86 | + data_with_res <- design$variables |
| 87 | + data_with_res$.id <- rownames(data_with_res) |
| 88 | + data_with_res <- merge(data_with_res, model_data, by = ".id", all.x = TRUE) |
| 89 | + |
| 90 | + # Create G groups based on fitted values |
| 91 | + breaks <- stats::quantile(data_with_res$f, probs = seq(0, 1, 1 / G), na.rm = TRUE) |
| 92 | + unique_breaks <- unique(breaks) |
| 93 | + data_with_res$g <- cut(data_with_res$f, breaks = unique_breaks, include.lowest = TRUE) |
| 94 | + |
| 95 | + # Rebuild the design object using its internal components |
| 96 | + new_design <- survey::svydesign( |
| 97 | + ids = design$cluster, |
| 98 | + strata = design$strata, |
| 99 | + weights = design$weights, |
| 100 | + data = data_with_res, |
| 101 | + nest = isTRUE(design$nest) |
| 102 | + ) |
| 103 | + |
| 104 | + # Run the test |
| 105 | + decile_model <- survey::svyglm(r ~ g, design = new_design, na.action = na.omit) |
| 106 | + test_result <- survey::regTermTest(decile_model, ~g) |
| 107 | + |
| 108 | + # Return a tidy data frame |
| 109 | + output <- data.frame( |
| 110 | + F_statistic = test_result$Ftest[1], |
| 111 | + df1 = test_result$df, |
| 112 | + df2 = test_result$ddf, |
| 113 | + p_value = test_result$p |
| 114 | + ) |
| 115 | + |
| 116 | + return(output) |
| 117 | +} |
0 commit comments