Skip to content

Commit 954586e

Browse files
committed
v0.2
1 parent 15cfe74 commit 954586e

File tree

7 files changed

+943
-858
lines changed

7 files changed

+943
-858
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ Imports:
1313
stats
1414
Suggests:
1515
knitr,
16-
rmarkdown,
1716
dplyr,
17+
rmarkdown,
1818
NHANES,
1919
tidyr
2020
VignetteBuilder: knitr

NAMESPACE

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,10 @@
22

33
export(svytable1)
44
import(stats)
5+
importFrom(survey,SE)
6+
importFrom(survey,degf)
57
importFrom(survey,svyby)
8+
importFrom(survey,svyciprop)
69
importFrom(survey,svymean)
710
importFrom(survey,svytable)
811
importFrom(survey,svyvar)

R/svytable1.R

Lines changed: 184 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2,23 +2,49 @@
22
#'
33
#' @description
44
#' Generates a "Table 1" of descriptive statistics for complex survey data,
5-
#' stratified by an outcome variable. It supports different reporting modes and
6-
#' optional comma formatting for large numbers.
5+
#' stratified by an outcome variable. It supports different reporting modes,
6+
#' optional comma formatting, and optional reliability checks based on NCHS
7+
#' Data Presentation Standards.
78
#'
8-
#' @param design A survey design object created by the `survey` package.
9+
#' @param design A survey design object created by the survey package.
910
#' @param strata_var A string with the name of the stratification variable.
1011
#' @param table_vars A character vector of variable names to summarize.
1112
#' @param mode A string specifying the output type: "mixed" (default), "weighted", or "unweighted".
1213
#' @param commas Logical; if TRUE (default), large numbers in counts are formatted with commas.
14+
#' @param reliability_checks Logical; if TRUE, performs reliability checks. Unreliable
15+
#' estimates are replaced with an asterisk (*). The rules are based on NCHS guidelines.
16+
#' \itemize{
17+
#' \item{\strong{For proportions (categorical variables)}, an estimate is suppressed if it meets ANY of the following criteria:
18+
#' \itemize{
19+
#' \item{Unweighted Sample Size < 30: The actual number of survey participants in the cell is too small to produce a stable estimate.}
20+
#' \item{Effective Sample Size < 30: After accounting for the complex survey design, the estimate's precision is equivalent to that from a simple random sample of fewer than 30 people.}
21+
#' \item{Degrees of Freedom < 8: The variance estimate is itself unstable, making the entire estimate unreliable.}
22+
#' \item{Confidence Interval Width >= 30\%: The 95\% confidence interval is 30 percentage points wide or wider, indicating extreme imprecision.}
23+
#' \item{Conditional Relative CI Width > 130\%: If the confidence interval width is between 5\% and 30\%, it is also checked to see if it is excessively large \emph{relative} to the estimate itself.}
24+
#' }
25+
#' }
26+
#' \item{\strong{For means (numeric variables)}, an estimate is suppressed if:
27+
#' \itemize{
28+
#' \item{Relative Standard Error (RSE) >= 30\%: The standard error is 30\% or more of the mean's value, indicating high imprecision.}
29+
#' }
30+
#' }
31+
#' }
32+
#' Defaults to FALSE.
33+
#' @param return_metrics Logical; if TRUE and `reliability_checks` is also TRUE, the function
34+
#' returns a list containing the formatted table and a detailed data frame of the
35+
#' reliability metrics. This includes the RSE and specific TRUE/FALSE columns
36+
#' (e.g., `fail_n_30`, `fail_eff_n_30`) for each suppression rule. Defaults to FALSE.
1337
#'
14-
#' @return A data.frame formatted as a descriptive statistics table.
38+
#' @return If `return_metrics` is FALSE (default), returns a data.frame. If TRUE,
39+
#' returns a list with two elements: `formatted_table` and `reliability_metrics`.
1540
#'
16-
#' @importFrom survey svytable svymean svyby svyvar
41+
#' @importFrom survey svytable svymean svyby svyvar svyciprop degf SE
1742
#' @import stats
1843
#'
1944
#' @export
2045
svytable1 <- function(design, strata_var, table_vars,
21-
mode = "mixed", commas = TRUE) {
46+
mode = "mixed", commas = TRUE,
47+
reliability_checks = FALSE, return_metrics = FALSE) {
2248

2349
# --- 1. Input Validation ---
2450
df <- design$variables
@@ -28,10 +54,10 @@ svytable1 <- function(design, strata_var, table_vars,
2854
stop(paste("The following variables were not found in the data:",
2955
paste(missing_vars, collapse = ", ")))
3056
}
57+
w_counts_strata <- NULL
3158

3259
# --- Helper function for formatting ---
3360
format_num <- function(n, is_weighted) {
34-
# --- FINAL FIX: Corrected typo from is.weighted to is_weighted ---
3561
if (is_weighted) n <- round(n)
3662
if (commas) return(format(n, big.mark = ","))
3763
return(as.character(n))
@@ -70,6 +96,10 @@ svytable1 <- function(design, strata_var, table_vars,
7096

7197
results_list <- list(header_row)
7298

99+
if (reliability_checks && return_metrics) {
100+
metrics_list <- list()
101+
}
102+
73103
for (current_var_name in table_vars) {
74104
var_formula <- as.formula(paste0("~", current_var_name))
75105
strata_formula <- as.formula(paste0("~", strata_var))
@@ -80,7 +110,7 @@ svytable1 <- function(design, strata_var, table_vars,
80110
current_var_vector <- design$variables[[current_var_name]]
81111

82112
if (is.factor(current_var_vector)) {
83-
113+
# --- Code for factor variables ---
84114
if (any(is.na(current_var_vector))) {
85115
original_levels <- levels(current_var_vector)
86116
var_as_char <- as.character(current_var_vector)
@@ -91,61 +121,170 @@ svytable1 <- function(design, strata_var, table_vars,
91121
)
92122
}
93123

124+
if (reliability_checks) {
125+
reliability_list <- list()
126+
127+
for (s_lvl in strata_levels) {
128+
sub_design <- subset(design, get(strata_var) == s_lvl)
129+
if(nrow(sub_design) == 0) next
130+
131+
prop_res <- try(svymean(var_formula, sub_design, na.rm = FALSE, deff = TRUE), silent = TRUE)
132+
if (inherits(prop_res, "try-error")) next
133+
134+
var_levels <- levels(sub_design$variables[[current_var_name]])
135+
ci_low_vals <- c(); ci_high_vals <- c()
136+
for(v_lvl in var_levels){
137+
prop_formula <- as.formula(paste0("~I(`", current_var_name, "` == '", v_lvl, "')"))
138+
ci_level <- svyciprop(prop_formula, sub_design, method = "beta", na.rm = TRUE)
139+
ci_low_vals <- c(ci_low_vals, attr(ci_level, "ci")[1])
140+
ci_high_vals <- c(ci_high_vals, attr(ci_level, "ci")[2])
141+
}
142+
143+
reliability_list[[s_lvl]] <- list(
144+
prop = as.numeric(prop_res), se = as.numeric(SE(prop_res)),
145+
n = as.numeric(table(sub_design$variables[[current_var_name]])),
146+
deff = diag(attr(prop_res, "deff")), ci_low = ci_low_vals,
147+
ci_high = ci_high_vals, df = degf(sub_design)
148+
)
149+
}
150+
}
151+
94152
uw_counts_overall <- table(design$variables[[current_var_name]])
95153
uw_counts_strata <- table(design$variables[[current_var_name]], design$variables[[strata_var]])
96154
w_counts_overall <- svytable(var_formula, design, round = TRUE)
97-
w_counts_strata <- svytable(as.formula(paste0("~", current_var_name, "+", strata_var)), design, round = TRUE)
98-
w_pcts_overall <- svymean(var_formula, design, na.rm = FALSE)
99-
w_pcts_strata <- svyby(var_formula, strata_formula, design, svymean, na.rm = FALSE)
100155

101156
for (lvl in levels(design$variables[[current_var_name]])) {
102157
row_data <- data.frame(Variable = "", Level = lvl, stringsAsFactors = FALSE)
103158
col_name_pct <- paste0(current_var_name, lvl)
104159

160+
w_pcts_overall <- svymean(var_formula, design, na.rm = FALSE)
105161
if (mode == "mixed") {
106162
row_data$Overall <- sprintf("%s (%.1f%%)", format_num(uw_counts_overall[lvl], FALSE), w_pcts_overall[col_name_pct] * 100)
107163
} else if (mode == "weighted") {
108164
row_data$Overall <- sprintf("%s (%.1f%%)", format_num(w_counts_overall[lvl], TRUE), w_pcts_overall[col_name_pct] * 100)
109-
} else { # unweighted
165+
} else {
110166
unweighted_pct <- 100 * uw_counts_overall[lvl] / sum(uw_counts_overall)
111167
row_data$Overall <- sprintf("%s (%.1f%%)", format_num(uw_counts_overall[lvl], FALSE), unweighted_pct)
112168
}
113169

114170
for (s_lvl in strata_levels) {
115-
pct_val <- w_pcts_strata[w_pcts_strata[, 1] == s_lvl, col_name_pct]
116-
if(length(pct_val) == 0 || is.na(pct_val)) pct_val <- 0
117-
118-
if (mode == "mixed") {
119-
uw_count <- uw_counts_strata[lvl, s_lvl]
120-
row_data[[s_lvl]] <- sprintf("%s (%.1f%%)", format_num(uw_count, FALSE), pct_val * 100)
121-
} else if (mode == "weighted") {
122-
w_count <- w_counts_strata[lvl, s_lvl]
123-
row_data[[s_lvl]] <- sprintf("%s (%.1f%%)", format_num(w_count, TRUE), pct_val * 100)
124-
} else { # unweighted
125-
uw_count <- uw_counts_strata[lvl, s_lvl]
126-
unweighted_pct_strata <- 100 * uw_count / sum(uw_counts_strata[, s_lvl])
127-
if (is.nan(unweighted_pct_strata)) unweighted_pct_strata <- 0
128-
row_data[[s_lvl]] <- sprintf("%s (%.1f%%)", format_num(uw_count, FALSE), unweighted_pct_strata)
171+
cell_value <- ""; pct_val <- NA
172+
if (reliability_checks && !is.null(reliability_list[[s_lvl]])) {
173+
level_index <- which(levels(design$variables[[current_var_name]]) == lvl)
174+
175+
if(level_index > length(reliability_list[[s_lvl]]$n)) {
176+
uw_count <- uw_counts_strata[lvl, s_lvl]
177+
cell_value <- if (uw_count > 0) sprintf("%s (0.0%%)", format_num(uw_count, FALSE)) else "0 (0.0%)"
178+
} else {
179+
metrics <- reliability_list[[s_lvl]]; n <- metrics$n[level_index]
180+
deff <- metrics$deff[level_index]; df <- metrics$df
181+
ci_low <- metrics$ci_low[level_index]; ci_high <- metrics$ci_high[level_index]
182+
pct_val <- metrics$prop[level_index]; se <- metrics$se[level_index]
183+
184+
effective_n <- if(!is.na(deff) && deff > 0) n / deff else 0
185+
ciw <- ci_high - ci_low
186+
rciw <- if(!is.na(pct_val) && pct_val > 0) (ciw / pct_val) * 100 else Inf
187+
rse <- if(!is.na(pct_val) && pct_val > 0) (se / pct_val) * 100 else Inf
188+
189+
fail_n_30 <- is.na(n) || n < 30
190+
fail_eff_n_30 <- !is.na(effective_n) && effective_n < 30
191+
fail_df_8 <- !is.na(df) && df < 8
192+
fail_ciw_30 <- !is.na(ciw) && ciw >= 0.30
193+
fail_rciw_130 <- !is.na(ciw) && ciw > 0.05 && !is.na(rciw) && rciw > 130
194+
fail_rse_30 <- !is.na(rse) && rse >= 30
195+
196+
suppress <- fail_n_30 || fail_eff_n_30 || fail_df_8 || fail_ciw_30 || fail_rciw_130
197+
if (suppress) cell_value <- "*"
198+
199+
if (return_metrics) {
200+
metrics_list[[length(metrics_list) + 1]] <- data.frame(
201+
stratum = s_lvl, variable = current_var_name, level = lvl,
202+
n = n, df = df, deff = deff, effective_n = effective_n,
203+
ci_low = ci_low, ci_high = ci_high, rse = rse,
204+
suppressed = suppress, fail_n_30 = fail_n_30, fail_eff_n_30 = fail_eff_n_30,
205+
fail_df_8 = fail_df_8, fail_ciw_30 = fail_ciw_30,
206+
fail_rciw_130 = fail_rciw_130, fail_rse_30 = fail_rse_30
207+
)
208+
}
209+
}
210+
}
211+
212+
if (cell_value == "") {
213+
if (is.na(pct_val)) {
214+
w_pcts_strata_lazy <- svyby(var_formula, strata_formula, design, svymean, na.rm = FALSE)
215+
pct_val <- w_pcts_strata_lazy[w_pcts_strata_lazy[, 1] == s_lvl, col_name_pct]
216+
if(length(pct_val) == 0 || is.na(pct_val)) pct_val <- 0
217+
}
218+
if (mode == "mixed") {
219+
uw_count <- uw_counts_strata[lvl, s_lvl]
220+
cell_value <- sprintf("%s (%.1f%%)", format_num(uw_count, FALSE), pct_val * 100)
221+
} else if (mode == "weighted") {
222+
w_count <- w_counts_strata[lvl, s_lvl]
223+
cell_value <- sprintf("%s (%.1f%%)", format_num(w_count, TRUE), pct_val * 100)
224+
} else {
225+
uw_count <- uw_counts_strata[lvl, s_lvl]
226+
unweighted_pct_strata <- 100 * uw_count / sum(uw_counts_strata[, s_lvl])
227+
if (is.nan(unweighted_pct_strata)) unweighted_pct_strata <- 0
228+
cell_value <- sprintf("%s (%.1f%%)", format_num(uw_count, FALSE), unweighted_pct_strata)
229+
}
129230
}
231+
row_data[[s_lvl]] <- cell_value
130232
}
131233
results_list[[length(results_list) + 1]] <- row_data
132234
}
133235
} else if (is.numeric(current_var_vector)) {
236+
# --- Code for numeric variables ---
134237
row_data <- data.frame(Variable = "", Level = "Mean (SD)", stringsAsFactors = FALSE)
135238

136239
if (mode %in% c("mixed", "weighted")) {
137240
mean_overall_w <- svymean(var_formula, design, na.rm = TRUE)
138241
var_overall_w <- svyvar(var_formula, design, na.rm = TRUE)
139-
row_data$Overall <- sprintf("%.2f (%.2f)", mean_overall_w[1], sqrt(var_overall_w[1]))
242+
243+
suppress_overall <- FALSE
244+
if (reliability_checks) {
245+
se_overall <- SE(mean_overall_w)[1]
246+
mean_val_overall <- mean_overall_w[1]
247+
rse_overall <- if (!is.na(mean_val_overall) && mean_val_overall != 0) (se_overall / abs(mean_val_overall)) * 100 else Inf
248+
fail_rse_30_overall <- !is.na(rse_overall) && rse_overall >= 30
249+
suppress_overall <- fail_rse_30_overall
250+
251+
if (return_metrics) {
252+
metrics_list[[length(metrics_list) + 1]] <- data.frame(
253+
stratum = "Overall", variable = current_var_name, level = "Mean (SD)",
254+
n = NA, df = NA, deff = NA, effective_n = NA, ci_low = NA, ci_high = NA, rse = rse_overall,
255+
suppressed = suppress_overall, fail_n_30 = NA, fail_eff_n_30 = NA, fail_df_8 = NA,
256+
fail_ciw_30 = NA, fail_rciw_130 = NA, fail_rse_30 = fail_rse_30_overall)
257+
}
258+
}
259+
row_data$Overall <- if(suppress_overall) "*" else sprintf("%.2f (%.2f)", mean_overall_w[1], sqrt(var_overall_w[1]))
140260

141261
means_by_strata_w <- svyby(var_formula, strata_formula, design, svymean, na.rm = TRUE)
142262
vars_by_strata_w <- svyby(var_formula, strata_formula, design, svyvar, na.rm = TRUE)
143263

144264
for(i in 1:nrow(means_by_strata_w)){
145265
s_lvl <- as.character(means_by_strata_w[i, 1])
146-
row_data[[s_lvl]] <- sprintf("%.2f (%.2f)", means_by_strata_w[i, 2], sqrt(vars_by_strata_w[i, 2]))
266+
mean_val <- means_by_strata_w[i, 2]
267+
sd_val <- sqrt(vars_by_strata_w[i, 2])
268+
269+
suppress_strata <- FALSE
270+
if (reliability_checks) {
271+
se_val <- SE(means_by_strata_w)[i]
272+
rse <- if (!is.na(mean_val) && mean_val != 0) (se_val / abs(mean_val)) * 100 else Inf
273+
fail_rse_30 <- !is.na(rse) && rse >= 30
274+
suppress_strata <- fail_rse_30
275+
276+
if (return_metrics) {
277+
metrics_list[[length(metrics_list) + 1]] <- data.frame(
278+
stratum = s_lvl, variable = current_var_name, level = "Mean (SD)",
279+
n = NA, df = NA, deff = NA, effective_n = NA, ci_low = NA, ci_high = NA, rse = rse,
280+
suppressed = suppress_strata, fail_n_30 = NA, fail_eff_n_30 = NA, fail_df_8 = NA,
281+
fail_ciw_30 = NA, fail_rciw_130 = NA, fail_rse_30 = fail_rse_30)
282+
}
283+
}
284+
row_data[[s_lvl]] <- if(suppress_strata) "*" else sprintf("%.2f (%.2f)", mean_val, sd_val)
147285
}
148-
} else { # unweighted
286+
287+
} else {
149288
unweighted_mean_overall <- mean(current_var_vector, na.rm = TRUE)
150289
unweighted_sd_overall <- sd(current_var_vector, na.rm = TRUE)
151290
row_data$Overall <- sprintf("%.2f (%.2f)", unweighted_mean_overall, unweighted_sd_overall)
@@ -181,5 +320,20 @@ svytable1 <- function(design, strata_var, table_vars,
181320

182321
final_table <- do.call(rbind, results_list)
183322
rownames(final_table) <- NULL
184-
return(final_table)
323+
324+
if (reliability_checks && return_metrics) {
325+
if (length(metrics_list) > 0) {
326+
metrics_df <- do.call(rbind, metrics_list)
327+
328+
# --- NEW: Round numeric columns to 2 decimal places ---
329+
cols_to_format <- sapply(metrics_df, is.numeric)
330+
metrics_df[cols_to_format] <- lapply(metrics_df[cols_to_format], function(x) round(x, 2))
331+
332+
} else {
333+
metrics_df <- data.frame()
334+
}
335+
return(list(formatted_table = final_table, reliability_metrics = metrics_df))
336+
} else {
337+
return(final_table)
338+
}
185339
}

0 commit comments

Comments
 (0)