|
| 1 | +#' Perform Reliability Diagnostics on Survey Regression Models |
| 2 | +#' |
| 3 | +#' @description |
| 4 | +#' This function takes a fitted survey regression model object (e.g., from `svyglm`) |
| 5 | +#' and produces a tibble with key reliability and diagnostic metrics for each |
| 6 | +#' coefficient. |
| 7 | +#' |
| 8 | +#' @details |
| 9 | +#' The output provides a comprehensive overview to help assess the stability and |
| 10 | +#' precision of each regression coefficient. The metrics include: |
| 11 | +#' \itemize{ |
| 12 | +#' \item \strong{Standard Error (SE)}: A measure of the estimate's precision. Smaller is better. |
| 13 | +#' \item \strong{p-value}: The probability of observing the data if the coefficient were zero. |
| 14 | +#' \item \strong{Confidence Interval (CI) Width}: A wide CI indicates greater uncertainty. |
| 15 | +#' \item \strong{Relative Standard Error (RSE)}: Calculated as `(SE / |Estimate|) * 100`. |
| 16 | +#' } |
| 17 | +#' |
| 18 | +#' \strong{Note on RSE}: While included for comparative purposes, the use of RSE to |
| 19 | +#' evaluate the reliability of regression coefficients is not recommended by |
| 20 | +#' agencies like NCHS/CDC. Coefficients near zero can have an extremely large RSE |
| 21 | +#' even if precisely estimated. It is better to rely on the standard error, |
| 22 | +#' p-value, and confidence interval width for reliability assessment. |
| 23 | +#' |
| 24 | +#' @param fit A fitted model object, typically of class `svyglm`. |
| 25 | +#' @param p_threshold A numeric value (between 0 and 1) for the significance threshold. Defaults to `0.05`. |
| 26 | +#' @param rse_threshold A numeric value for flagging high Relative Standard Error (RSE). Defaults to `30`. |
| 27 | +#' |
| 28 | +#' @return |
| 29 | +#' A `tibble` containing the following columns: |
| 30 | +#' \itemize{ |
| 31 | +#' \item \code{Term}: The name of the regression coefficient. |
| 32 | +#' \item \code{Estimate}: The coefficient's point estimate (e.g., on the log-odds scale for logistic models). |
| 33 | +#' \item \code{SE}: The standard error of the estimate. |
| 34 | +#' \item \code{p.value}: The p-value for the coefficient. |
| 35 | +#' \item \code{is_significant}: A logical flag, `TRUE` if `p.value` is less than `p_threshold`. |
| 36 | +#' \item \code{CI_Lower}: The lower bound of the 95% confidence interval. |
| 37 | +#' \item \code{CI_Upper}: The upper bound of the 95% confidence interval. |
| 38 | +#' \item \code{CI_Width}: The absolute width of the confidence interval (`CI_Upper - CI_Lower`). |
| 39 | +#' \item \code{RSE_percent}: The Relative Standard Error, as a percentage. |
| 40 | +#' \item \code{is_rse_high}: A logical flag, `TRUE` if `RSE_percent` is greater than or equal to `rse_threshold`. |
| 41 | +#' } |
| 42 | +#' |
| 43 | +#' @importFrom dplyr mutate bind_cols select |
| 44 | +#' @importFrom tibble as_tibble |
| 45 | +#' @importFrom stats confint setNames |
| 46 | +#' |
| 47 | +#' @export |
| 48 | +#' |
| 49 | +#' @examples |
| 50 | +#' # Ensure required packages are loaded |
| 51 | +#' if (requireNamespace("survey", quietly = TRUE) && |
| 52 | +#' requireNamespace("NHANES", quietly = TRUE) && |
| 53 | +#' requireNamespace("dplyr", quietly = TRUE)) { |
| 54 | +#' |
| 55 | +#' # 1. Prepare Data using the NHANES example |
| 56 | +#' data(NHANESraw, package = "NHANES") |
| 57 | +#' nhanes_adults_with_na <- NHANESraw %>% |
| 58 | +#' dplyr::filter(Age >= 20) %>% |
| 59 | +#' dplyr::mutate( |
| 60 | +#' ObeseStatus = factor(ifelse(BMI >= 30, "Obese", "Not Obese"), |
| 61 | +#' levels = c("Not Obese", "Obese")), |
| 62 | +#' Race1 = factor(Race1) |
| 63 | +#' ) |
| 64 | +#' |
| 65 | +#' # Create a complete-case design object for the regression model |
| 66 | +#' nhanes_complete <- nhanes_adults_with_na[complete.cases( |
| 67 | +#' nhanes_adults_with_na[, c("ObeseStatus", "Age", "Race1")] |
| 68 | +#' ), ] |
| 69 | +#' |
| 70 | +#' adult_design_complete <- survey::svydesign( |
| 71 | +#' id = ~SDMVPSU, |
| 72 | +#' strata = ~SDMVSTRA, |
| 73 | +#' weights = ~WTMEC2YR, |
| 74 | +#' nest = TRUE, |
| 75 | +#' data = nhanes_complete |
| 76 | +#' ) |
| 77 | +#' |
| 78 | +#' # 2. Fit a survey-weighted logistic regression model |
| 79 | +#' fit <- survey::svyglm( |
| 80 | +#' ObeseStatus ~ Age + Race1, |
| 81 | +#' design = adult_design_complete, |
| 82 | +#' family = quasibinomial() |
| 83 | +#' ) |
| 84 | +#' |
| 85 | +#' # 3. Get the reliability diagnostics table |
| 86 | +#' diagnostics_table <- svyglmdiag(fit) |
| 87 | +#' |
| 88 | +#' # Print the resulting table |
| 89 | +#' print(diagnostics_table) |
| 90 | +#' |
| 91 | +#' # For a publication-ready table, pipe the result to kable() |
| 92 | +#' if (requireNamespace("knitr", quietly = TRUE)) { |
| 93 | +#' knitr::kable(diagnostics_table, |
| 94 | +#' caption = "Reliability Diagnostics for NHANES Obesity Model", |
| 95 | +#' digits = 3) |
| 96 | +#' } |
| 97 | +#' } |
| 98 | + |
| 99 | +svyglmdiag <- function(fit, p_threshold = 0.05, rse_threshold = 30) { |
| 100 | + |
| 101 | + # --- Input validation --- |
| 102 | + if (!inherits(fit, "svyglm")) { |
| 103 | + warning("This function is designed for 'svyglm' objects. Results may be unexpected.") |
| 104 | + } |
| 105 | + |
| 106 | + # 1. Get the standard model summary and confidence intervals |
| 107 | + summary_fit <- summary(fit) |
| 108 | + conf_int_fit <- stats::confint(fit) |
| 109 | + |
| 110 | + # 2. Combine these into a single, informative table |
| 111 | + reliability_df <- tibble::as_tibble(summary_fit$coefficients, rownames = "Term") |
| 112 | + names(reliability_df) <- c("Term", "Estimate", "SE", "t.value", "p.value") |
| 113 | + |
| 114 | + # 3. Add CIs, calculate metrics, and add flags |
| 115 | + reliability_df <- reliability_df %>% |
| 116 | + dplyr::bind_cols(tibble::as_tibble(conf_int_fit) %>% |
| 117 | + stats::setNames(c("CI_Lower", "CI_Upper"))) %>% |
| 118 | + dplyr::mutate( |
| 119 | + RSE_percent = (SE / abs(Estimate)) * 100, |
| 120 | + CI_Width = CI_Upper - CI_Lower, |
| 121 | + is_significant = p.value < p_threshold, |
| 122 | + is_rse_high = RSE_percent >= rse_threshold |
| 123 | + ) %>% |
| 124 | + # Reorder and select the final columns for a clean output |
| 125 | + dplyr::select( |
| 126 | + Term, |
| 127 | + Estimate, |
| 128 | + SE, |
| 129 | + p.value, |
| 130 | + is_significant, |
| 131 | + CI_Lower, |
| 132 | + CI_Upper, |
| 133 | + CI_Width, |
| 134 | + RSE_percent, |
| 135 | + is_rse_high |
| 136 | + ) |
| 137 | + |
| 138 | + return(reliability_df) |
| 139 | +} |
0 commit comments