22# '
33# ' @description
44# ' Generates a "Table 1" of descriptive statistics for complex survey data,
5- # ' stratified by an outcome variable. It supports different reporting modes and
6- # ' optional comma formatting for large numbers.
5+ # ' stratified by an outcome variable. It supports different reporting modes,
6+ # ' optional comma formatting, and optional reliability checks based on NCHS
7+ # ' Data Presentation Standards.
78# '
8- # ' @param design A survey design object created by the ` survey` package.
9+ # ' @param design A survey design object created by the survey package.
910# ' @param strata_var A string with the name of the stratification variable.
1011# ' @param table_vars A character vector of variable names to summarize.
1112# ' @param mode A string specifying the output type: "mixed" (default), "weighted", or "unweighted".
1213# ' @param commas Logical; if TRUE (default), large numbers in counts are formatted with commas.
14+ # ' @param reliability_checks Logical; if TRUE, performs reliability checks. Unreliable
15+ # ' estimates are replaced with an asterisk (*). The rules are based on NCHS guidelines.
16+ # ' \itemize{
17+ # ' \item{\strong{For proportions (categorical variables)}, an estimate is suppressed if it meets ANY of the following criteria:
18+ # ' \itemize{
19+ # ' \item{Unweighted Sample Size < 30: The actual number of survey participants in the cell is too small to produce a stable estimate.}
20+ # ' \item{Effective Sample Size < 30: After accounting for the complex survey design, the estimate's precision is equivalent to that from a simple random sample of fewer than 30 people.}
21+ # ' \item{Degrees of Freedom < 8: The variance estimate is itself unstable, making the entire estimate unreliable.}
22+ # ' \item{Confidence Interval Width >= 30\%: The 95\% confidence interval is 30 percentage points wide or wider, indicating extreme imprecision.}
23+ # ' \item{Conditional Relative CI Width > 130\%: If the confidence interval width is between 5\% and 30\%, it is also checked to see if it is excessively large \emph{relative} to the estimate itself.}
24+ # ' }
25+ # ' }
26+ # ' \item{\strong{For means (numeric variables)}, an estimate is suppressed if:
27+ # ' \itemize{
28+ # ' \item{Relative Standard Error (RSE) >= 30\%: The standard error is 30\% or more of the mean's value, indicating high imprecision.}
29+ # ' }
30+ # ' }
31+ # ' }
32+ # ' Defaults to FALSE.
33+ # ' @param return_metrics Logical; if TRUE and `reliability_checks` is also TRUE, the function
34+ # ' returns a list containing the formatted table and a detailed data frame of the
35+ # ' reliability metrics. This includes the RSE and specific TRUE/FALSE columns
36+ # ' (e.g., `fail_n_30`, `fail_eff_n_30`) for each suppression rule. Defaults to FALSE.
1337# '
14- # ' @return A data.frame formatted as a descriptive statistics table.
38+ # ' @return If `return_metrics` is FALSE (default), returns a data.frame. If TRUE,
39+ # ' returns a list with two elements: `formatted_table` and `reliability_metrics`.
1540# '
16- # ' @importFrom survey svytable svymean svyby svyvar
41+ # ' @importFrom survey svytable svymean svyby svyvar svyciprop degf SE
1742# ' @import stats
1843# '
1944# ' @export
2045svytable1 <- function (design , strata_var , table_vars ,
21- mode = " mixed" , commas = TRUE ) {
46+ mode = " mixed" , commas = TRUE ,
47+ reliability_checks = FALSE , return_metrics = FALSE ) {
2248
2349 # --- 1. Input Validation ---
2450 df <- design $ variables
@@ -28,10 +54,10 @@ svytable1 <- function(design, strata_var, table_vars,
2854 stop(paste(" The following variables were not found in the data:" ,
2955 paste(missing_vars , collapse = " , " )))
3056 }
57+ w_counts_strata <- NULL
3158
3259 # --- Helper function for formatting ---
3360 format_num <- function (n , is_weighted ) {
34- # --- FINAL FIX: Corrected typo from is.weighted to is_weighted ---
3561 if (is_weighted ) n <- round(n )
3662 if (commas ) return (format(n , big.mark = " ," ))
3763 return (as.character(n ))
@@ -70,6 +96,10 @@ svytable1 <- function(design, strata_var, table_vars,
7096
7197 results_list <- list (header_row )
7298
99+ if (reliability_checks && return_metrics ) {
100+ metrics_list <- list ()
101+ }
102+
73103 for (current_var_name in table_vars ) {
74104 var_formula <- as.formula(paste0(" ~" , current_var_name ))
75105 strata_formula <- as.formula(paste0(" ~" , strata_var ))
@@ -80,7 +110,7 @@ svytable1 <- function(design, strata_var, table_vars,
80110 current_var_vector <- design $ variables [[current_var_name ]]
81111
82112 if (is.factor(current_var_vector )) {
83-
113+ # --- Code for factor variables ---
84114 if (any(is.na(current_var_vector ))) {
85115 original_levels <- levels(current_var_vector )
86116 var_as_char <- as.character(current_var_vector )
@@ -91,61 +121,170 @@ svytable1 <- function(design, strata_var, table_vars,
91121 )
92122 }
93123
124+ if (reliability_checks ) {
125+ reliability_list <- list ()
126+
127+ for (s_lvl in strata_levels ) {
128+ sub_design <- subset(design , get(strata_var ) == s_lvl )
129+ if (nrow(sub_design ) == 0 ) next
130+
131+ prop_res <- try(svymean(var_formula , sub_design , na.rm = FALSE , deff = TRUE ), silent = TRUE )
132+ if (inherits(prop_res , " try-error" )) next
133+
134+ var_levels <- levels(sub_design $ variables [[current_var_name ]])
135+ ci_low_vals <- c(); ci_high_vals <- c()
136+ for (v_lvl in var_levels ){
137+ prop_formula <- as.formula(paste0(" ~I(`" , current_var_name , " ` == '" , v_lvl , " ')" ))
138+ ci_level <- svyciprop(prop_formula , sub_design , method = " beta" , na.rm = TRUE )
139+ ci_low_vals <- c(ci_low_vals , attr(ci_level , " ci" )[1 ])
140+ ci_high_vals <- c(ci_high_vals , attr(ci_level , " ci" )[2 ])
141+ }
142+
143+ reliability_list [[s_lvl ]] <- list (
144+ prop = as.numeric(prop_res ), se = as.numeric(SE(prop_res )),
145+ n = as.numeric(table(sub_design $ variables [[current_var_name ]])),
146+ deff = diag(attr(prop_res , " deff" )), ci_low = ci_low_vals ,
147+ ci_high = ci_high_vals , df = degf(sub_design )
148+ )
149+ }
150+ }
151+
94152 uw_counts_overall <- table(design $ variables [[current_var_name ]])
95153 uw_counts_strata <- table(design $ variables [[current_var_name ]], design $ variables [[strata_var ]])
96154 w_counts_overall <- svytable(var_formula , design , round = TRUE )
97- w_counts_strata <- svytable(as.formula(paste0(" ~" , current_var_name , " +" , strata_var )), design , round = TRUE )
98- w_pcts_overall <- svymean(var_formula , design , na.rm = FALSE )
99- w_pcts_strata <- svyby(var_formula , strata_formula , design , svymean , na.rm = FALSE )
100155
101156 for (lvl in levels(design $ variables [[current_var_name ]])) {
102157 row_data <- data.frame (Variable = " " , Level = lvl , stringsAsFactors = FALSE )
103158 col_name_pct <- paste0(current_var_name , lvl )
104159
160+ w_pcts_overall <- svymean(var_formula , design , na.rm = FALSE )
105161 if (mode == " mixed" ) {
106162 row_data $ Overall <- sprintf(" %s (%.1f%%)" , format_num(uw_counts_overall [lvl ], FALSE ), w_pcts_overall [col_name_pct ] * 100 )
107163 } else if (mode == " weighted" ) {
108164 row_data $ Overall <- sprintf(" %s (%.1f%%)" , format_num(w_counts_overall [lvl ], TRUE ), w_pcts_overall [col_name_pct ] * 100 )
109- } else { # unweighted
165+ } else {
110166 unweighted_pct <- 100 * uw_counts_overall [lvl ] / sum(uw_counts_overall )
111167 row_data $ Overall <- sprintf(" %s (%.1f%%)" , format_num(uw_counts_overall [lvl ], FALSE ), unweighted_pct )
112168 }
113169
114170 for (s_lvl in strata_levels ) {
115- pct_val <- w_pcts_strata [w_pcts_strata [, 1 ] == s_lvl , col_name_pct ]
116- if (length(pct_val ) == 0 || is.na(pct_val )) pct_val <- 0
117-
118- if (mode == " mixed" ) {
119- uw_count <- uw_counts_strata [lvl , s_lvl ]
120- row_data [[s_lvl ]] <- sprintf(" %s (%.1f%%)" , format_num(uw_count , FALSE ), pct_val * 100 )
121- } else if (mode == " weighted" ) {
122- w_count <- w_counts_strata [lvl , s_lvl ]
123- row_data [[s_lvl ]] <- sprintf(" %s (%.1f%%)" , format_num(w_count , TRUE ), pct_val * 100 )
124- } else { # unweighted
125- uw_count <- uw_counts_strata [lvl , s_lvl ]
126- unweighted_pct_strata <- 100 * uw_count / sum(uw_counts_strata [, s_lvl ])
127- if (is.nan(unweighted_pct_strata )) unweighted_pct_strata <- 0
128- row_data [[s_lvl ]] <- sprintf(" %s (%.1f%%)" , format_num(uw_count , FALSE ), unweighted_pct_strata )
171+ cell_value <- " " ; pct_val <- NA
172+ if (reliability_checks && ! is.null(reliability_list [[s_lvl ]])) {
173+ level_index <- which(levels(design $ variables [[current_var_name ]]) == lvl )
174+
175+ if (level_index > length(reliability_list [[s_lvl ]]$ n )) {
176+ uw_count <- uw_counts_strata [lvl , s_lvl ]
177+ cell_value <- if (uw_count > 0 ) sprintf(" %s (0.0%%)" , format_num(uw_count , FALSE )) else " 0 (0.0%)"
178+ } else {
179+ metrics <- reliability_list [[s_lvl ]]; n <- metrics $ n [level_index ]
180+ deff <- metrics $ deff [level_index ]; df <- metrics $ df
181+ ci_low <- metrics $ ci_low [level_index ]; ci_high <- metrics $ ci_high [level_index ]
182+ pct_val <- metrics $ prop [level_index ]; se <- metrics $ se [level_index ]
183+
184+ effective_n <- if (! is.na(deff ) && deff > 0 ) n / deff else 0
185+ ciw <- ci_high - ci_low
186+ rciw <- if (! is.na(pct_val ) && pct_val > 0 ) (ciw / pct_val ) * 100 else Inf
187+ rse <- if (! is.na(pct_val ) && pct_val > 0 ) (se / pct_val ) * 100 else Inf
188+
189+ fail_n_30 <- is.na(n ) || n < 30
190+ fail_eff_n_30 <- ! is.na(effective_n ) && effective_n < 30
191+ fail_df_8 <- ! is.na(df ) && df < 8
192+ fail_ciw_30 <- ! is.na(ciw ) && ciw > = 0.30
193+ fail_rciw_130 <- ! is.na(ciw ) && ciw > 0.05 && ! is.na(rciw ) && rciw > 130
194+ fail_rse_30 <- ! is.na(rse ) && rse > = 30
195+
196+ suppress <- fail_n_30 || fail_eff_n_30 || fail_df_8 || fail_ciw_30 || fail_rciw_130
197+ if (suppress ) cell_value <- " *"
198+
199+ if (return_metrics ) {
200+ metrics_list [[length(metrics_list ) + 1 ]] <- data.frame (
201+ stratum = s_lvl , variable = current_var_name , level = lvl ,
202+ n = n , df = df , deff = deff , effective_n = effective_n ,
203+ ci_low = ci_low , ci_high = ci_high , rse = rse ,
204+ suppressed = suppress , fail_n_30 = fail_n_30 , fail_eff_n_30 = fail_eff_n_30 ,
205+ fail_df_8 = fail_df_8 , fail_ciw_30 = fail_ciw_30 ,
206+ fail_rciw_130 = fail_rciw_130 , fail_rse_30 = fail_rse_30
207+ )
208+ }
209+ }
210+ }
211+
212+ if (cell_value == " " ) {
213+ if (is.na(pct_val )) {
214+ w_pcts_strata_lazy <- svyby(var_formula , strata_formula , design , svymean , na.rm = FALSE )
215+ pct_val <- w_pcts_strata_lazy [w_pcts_strata_lazy [, 1 ] == s_lvl , col_name_pct ]
216+ if (length(pct_val ) == 0 || is.na(pct_val )) pct_val <- 0
217+ }
218+ if (mode == " mixed" ) {
219+ uw_count <- uw_counts_strata [lvl , s_lvl ]
220+ cell_value <- sprintf(" %s (%.1f%%)" , format_num(uw_count , FALSE ), pct_val * 100 )
221+ } else if (mode == " weighted" ) {
222+ w_count <- w_counts_strata [lvl , s_lvl ]
223+ cell_value <- sprintf(" %s (%.1f%%)" , format_num(w_count , TRUE ), pct_val * 100 )
224+ } else {
225+ uw_count <- uw_counts_strata [lvl , s_lvl ]
226+ unweighted_pct_strata <- 100 * uw_count / sum(uw_counts_strata [, s_lvl ])
227+ if (is.nan(unweighted_pct_strata )) unweighted_pct_strata <- 0
228+ cell_value <- sprintf(" %s (%.1f%%)" , format_num(uw_count , FALSE ), unweighted_pct_strata )
229+ }
129230 }
231+ row_data [[s_lvl ]] <- cell_value
130232 }
131233 results_list [[length(results_list ) + 1 ]] <- row_data
132234 }
133235 } else if (is.numeric(current_var_vector )) {
236+ # --- Code for numeric variables ---
134237 row_data <- data.frame (Variable = " " , Level = " Mean (SD)" , stringsAsFactors = FALSE )
135238
136239 if (mode %in% c(" mixed" , " weighted" )) {
137240 mean_overall_w <- svymean(var_formula , design , na.rm = TRUE )
138241 var_overall_w <- svyvar(var_formula , design , na.rm = TRUE )
139- row_data $ Overall <- sprintf(" %.2f (%.2f)" , mean_overall_w [1 ], sqrt(var_overall_w [1 ]))
242+
243+ suppress_overall <- FALSE
244+ if (reliability_checks ) {
245+ se_overall <- SE(mean_overall_w )[1 ]
246+ mean_val_overall <- mean_overall_w [1 ]
247+ rse_overall <- if (! is.na(mean_val_overall ) && mean_val_overall != 0 ) (se_overall / abs(mean_val_overall )) * 100 else Inf
248+ fail_rse_30_overall <- ! is.na(rse_overall ) && rse_overall > = 30
249+ suppress_overall <- fail_rse_30_overall
250+
251+ if (return_metrics ) {
252+ metrics_list [[length(metrics_list ) + 1 ]] <- data.frame (
253+ stratum = " Overall" , variable = current_var_name , level = " Mean (SD)" ,
254+ n = NA , df = NA , deff = NA , effective_n = NA , ci_low = NA , ci_high = NA , rse = rse_overall ,
255+ suppressed = suppress_overall , fail_n_30 = NA , fail_eff_n_30 = NA , fail_df_8 = NA ,
256+ fail_ciw_30 = NA , fail_rciw_130 = NA , fail_rse_30 = fail_rse_30_overall )
257+ }
258+ }
259+ row_data $ Overall <- if (suppress_overall ) " *" else sprintf(" %.2f (%.2f)" , mean_overall_w [1 ], sqrt(var_overall_w [1 ]))
140260
141261 means_by_strata_w <- svyby(var_formula , strata_formula , design , svymean , na.rm = TRUE )
142262 vars_by_strata_w <- svyby(var_formula , strata_formula , design , svyvar , na.rm = TRUE )
143263
144264 for (i in 1 : nrow(means_by_strata_w )){
145265 s_lvl <- as.character(means_by_strata_w [i , 1 ])
146- row_data [[s_lvl ]] <- sprintf(" %.2f (%.2f)" , means_by_strata_w [i , 2 ], sqrt(vars_by_strata_w [i , 2 ]))
266+ mean_val <- means_by_strata_w [i , 2 ]
267+ sd_val <- sqrt(vars_by_strata_w [i , 2 ])
268+
269+ suppress_strata <- FALSE
270+ if (reliability_checks ) {
271+ se_val <- SE(means_by_strata_w )[i ]
272+ rse <- if (! is.na(mean_val ) && mean_val != 0 ) (se_val / abs(mean_val )) * 100 else Inf
273+ fail_rse_30 <- ! is.na(rse ) && rse > = 30
274+ suppress_strata <- fail_rse_30
275+
276+ if (return_metrics ) {
277+ metrics_list [[length(metrics_list ) + 1 ]] <- data.frame (
278+ stratum = s_lvl , variable = current_var_name , level = " Mean (SD)" ,
279+ n = NA , df = NA , deff = NA , effective_n = NA , ci_low = NA , ci_high = NA , rse = rse ,
280+ suppressed = suppress_strata , fail_n_30 = NA , fail_eff_n_30 = NA , fail_df_8 = NA ,
281+ fail_ciw_30 = NA , fail_rciw_130 = NA , fail_rse_30 = fail_rse_30 )
282+ }
283+ }
284+ row_data [[s_lvl ]] <- if (suppress_strata ) " *" else sprintf(" %.2f (%.2f)" , mean_val , sd_val )
147285 }
148- } else { # unweighted
286+
287+ } else {
149288 unweighted_mean_overall <- mean(current_var_vector , na.rm = TRUE )
150289 unweighted_sd_overall <- sd(current_var_vector , na.rm = TRUE )
151290 row_data $ Overall <- sprintf(" %.2f (%.2f)" , unweighted_mean_overall , unweighted_sd_overall )
@@ -181,5 +320,20 @@ svytable1 <- function(design, strata_var, table_vars,
181320
182321 final_table <- do.call(rbind , results_list )
183322 rownames(final_table ) <- NULL
184- return (final_table )
323+
324+ if (reliability_checks && return_metrics ) {
325+ if (length(metrics_list ) > 0 ) {
326+ metrics_df <- do.call(rbind , metrics_list )
327+
328+ # --- NEW: Round numeric columns to 2 decimal places ---
329+ cols_to_format <- sapply(metrics_df , is.numeric )
330+ metrics_df [cols_to_format ] <- lapply(metrics_df [cols_to_format ], function (x ) round(x , 2 ))
331+
332+ } else {
333+ metrics_df <- data.frame ()
334+ }
335+ return (list (formatted_table = final_table , reliability_metrics = metrics_df ))
336+ } else {
337+ return (final_table )
338+ }
185339}
0 commit comments