Skip to content

Commit bbc551b

Browse files
committed
re write doc and factor function
1 parent 4961dc2 commit bbc551b

File tree

2 files changed

+167
-109
lines changed

2 files changed

+167
-109
lines changed

R/check_group_variation.R

Lines changed: 126 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -9,53 +9,56 @@
99
#' `by`).
1010
#' @param by Character vector (or formula) with the name of the variable that
1111
#' indicates the group- or cluster-ID. For cross-classified or nested designs,
12-
#' `by` can also identify two or more variables as group- or cluster-IDs. If
13-
#' the data is nested and should be treated as such, set `nested = TRUE`. Else,
14-
#' if `by` defines two or more variables and `nested = FALSE`, a cross-classified
15-
#' design is assumed.
16-
#'
17-
#' For nested designs, `by` can be:
18-
#' - a character vector with the name of the variable that indicates the
19-
#' levels, ordered from *highest* level to *lowest* (e.g.
20-
#' `by = c("L4", "L3", "L2")`.
21-
#' - a character vector with variable names in the format `by = "L4/L3/L2"`,
22-
#' where the levels are separated by `/`.
23-
#' @param only_balanced ...
24-
#' @param tolerance The amount of variation (calculated by `var()`, i.e. the
12+
#' `by` can also identify two or more variables as group- or cluster-IDs.
13+
#' @param include_by When there is more than one grouping variable, should they
14+
#' be check against each other?
15+
#' @param numeric_as_factor Should numeric variables be tested as factors?
16+
#' @param num_tolerance The amount of variation (calculated by `var()`, i.e. the
2517
#' variance of a variable) that is tolerated to indicate no within- or
2618
#' between-effect.
19+
#' @param fct_tolerance How should a non-numeric variable be identified as
20+
#' varying only "within" a grouping variable? Options are:
21+
#' - `"crossed"` - if all groups have all unique values of X.
22+
#' - `"balanced"` - if all groups have all unique values of X, _with equal frequency_.
23+
2724
#'
2825
#' @details
29-
#' This function calls [`datawizard::demean()`] to calculate the within- and
30-
#' between-effects of variables specified in `select`, based on the groups
31-
#' indicated in `by`. Then, the variance for each variable's within- and
32-
#' between-effect is calculated. If the variance is larger than `tolerance`,
33-
#' a within- or between-effect is detected.
26+
#' This function attempt to identify the hierarchical design of a dataset with
27+
#' respect to grouping variables (`by`).
28+
#'
29+
#' ## Numeric variables
30+
#' Numeric variables are portioned via [`datawizard::demean()`] to their within-
31+
#' and between-group components. Then, the variance for each variable's within-
32+
#' and between-group component is calculated. Variable with within-group
33+
#' variance larger than `num_tolerance` are labeled as _within_, variable with
34+
#' between-group variance larger than `num_tolerance` are labeled as _between_,
35+
#' and variables with both variances larger than `num_tolerance` are labeled as
36+
#' _both_.
37+
#' \cr\cr
38+
#' Setting `numeric_as_factor = TRUE` causes numeric variables to be tested
39+
#' using the following criteria.
40+
#'
41+
#' ## Non-numeric variables
42+
#' These variables can have one of the following 4 labels:
43+
#' - _between_ - the variable is fixed (has exactly one unique, constant value) for each group.
44+
#' - _nested_ - the variable varies within each group, with each group having their own set of
45+
#' unique levels of the variable.
46+
#' - _within_ - the variable is _crossed_ with the grouping variable - each value appear
47+
#' within each group. The `fct_tolerance` argument controls if full balance is also required.
48+
#' - _both_ - the variable is partially nested within the grouping variable (or, when
49+
#' `fct_tolerance = "balanced"` the variable is fully crossed, but not perfectly balanced).
3450
#'
35-
#' @return A list with at most three elements, `within`, `between`, and `both`,
36-
#' where each element contains the name of variables that have one of these
37-
#' effects.
51+
#' @return A data frame with group, variable, and type columns.
3852
#'
3953
#' @examples
40-
#' set.seed(1234)
41-
#' dat <- data.frame(
42-
#' id = rep(letters, each = 3),
43-
#' between_num = rep(rnorm(26), each = 3),
44-
#' within1_num = rep(rnorm(3), times = 26),
45-
#' within2_num = rep(rnorm(3), times = 26),
46-
#' both_num = rnorm(3 * 26)
47-
#' )
48-
#' check_group_variation(
49-
#' dat,
50-
#' select = c("between_num", "within1_num", "within2_num", "both_num"),
51-
#' by = "id"
52-
#' )
5354
#' @export
5455
check_group_variation <- function(x,
5556
select = NULL,
5657
by = NULL,
57-
tolerance = 1e-4,
58-
only_balanced = TRUE) {
58+
include_by = FALSE,
59+
numeric_as_factor = FALSE,
60+
num_tolerance = 1e-4,
61+
fct_tolerance = "crossed") {
5962
insight::check_if_installed("datawizard", minimum_version = "0.12.0")
6063

6164
if (inherits(select, "formula")) {
@@ -80,6 +83,14 @@ check_group_variation <- function(x,
8083
select <- setdiff(colnames(x), by)
8184
}
8285

86+
if (include_by && (length(by) > 1L)) {
87+
select <- c(by, select)
88+
}
89+
90+
if (numeric_as_factor) {
91+
x[select] <- lapply(x[select], as.factor)
92+
}
93+
8394
# create all combinations that should be checked
8495
combinations <- expand.grid(
8596
variable = select,
@@ -95,8 +106,8 @@ check_group_variation <- function(x,
95106
x,
96107
combinations[i, "group"],
97108
combinations[i, "variable"],
98-
tolerance = tolerance,
99-
only_balanced = only_balanced
109+
num_tolerance = num_tolerance,
110+
fct_tolerance = fct_tolerance
100111
)
101112
}
102113

@@ -112,33 +123,35 @@ check_group_variation <- function(x,
112123

113124
#' @export
114125
print.check_group_variation <- function(x, ...) {
115-
if (insight::n_unique(x$group) > 1L) {
116-
cat(insight::export_table(
117-
x,
118-
caption = c("Check group variation", "blue"),
119-
by = "group"
120-
))
121-
} else {
122-
x_new <- x
123-
x_new$group <- NULL
124-
cat(insight::export_table(
125-
x_new,
126-
caption = c(sprintf("Check %s variation", x$group[1]), "blue")
127-
))
128-
}
129-
130-
return(invisible(x))
126+
x_orig <- x
127+
128+
cap <- "Check group variation"
129+
by <- "group"
130+
131+
if (insight::n_unique(x$group) == 1L) {
132+
x$group <- NULL
133+
cap <- sprintf("Check %s variation", x_orig$group[1])
134+
by <- NULL
135+
}
136+
137+
cat(insight::export_table(x, caption = c(cap, "blue"), by = by, ...))
138+
139+
return(invisible(x_orig))
131140
}
132141

133142

134143
# utils -------------------------------------------------------------
135144

136145
.check_nested <- function(data, by, predictor, ...) {
146+
if (insight::n_unique(data[[predictor]]) == 1L) {
147+
return(NA_character_)
148+
}
149+
137150
UseMethod(".check_nested", data[[predictor]])
138151
}
139152

140153

141-
.check_nested.numeric <- function(data, by, predictor, tolerance = 1e-05, ...) {
154+
.check_nested.numeric <- function(data, by, predictor, num_tolerance = 1e-05, ...) {
142155
# demean predictor
143156
d <- datawizard::demean(
144157
data,
@@ -152,41 +165,83 @@ print.check_group_variation <- function(x, ...) {
152165
within_name <- paste0(predictor, "_within")
153166
between_name <- paste0(predictor, "_between")
154167

155-
is_between <- var(d[[between_name]], na.rm = TRUE) > tolerance
156-
is_within <- var(d[[within_name]], na.rm = TRUE) > tolerance
168+
is_between <- var(d[[between_name]], na.rm = TRUE) > num_tolerance
169+
is_within <- var(d[[within_name]], na.rm = TRUE) > num_tolerance
157170
is_both <- is_between && is_within
158171

159-
if (is_both) return("both")
160-
if (is_between) return("between")
161-
if (is_within) return("within")
162-
NULL
172+
if (is_both) {
173+
return("both")
174+
}
175+
if (is_between) {
176+
return("between")
177+
}
178+
if (is_within) {
179+
return("within")
180+
}
181+
182+
NA_character_
163183
}
164184

165185

166-
.check_nested.default <- function(data, by, predictor, only_balanced = TRUE, ...) {
186+
.check_nested.default <- function(data, by, predictor, fct_tolerance = c("crossed", "balanced"), ...) {
187+
fct_tolerance <- match.arg(fct_tolerance)
188+
167189
group <- data[[by]]
168190
variable <- data[[predictor]]
169191

170192
complete <- stats::complete.cases(group, variable)
171193
group <- as.factor(group[complete])
172194
variable <- variable[complete]
173195

196+
# Is the variable fixed for each group?
174197
n_uniques <- tapply(variable, group, insight::n_unique)
175198
is_between <- all(n_uniques == 1L)
176-
if (is_between) return("between")
199+
if (is_between) {
200+
return("between")
201+
}
202+
203+
# Is the variable nested within each group?
204+
if (insight::check_if_installed("Matrix", reason = "for checking nested designs")) {
205+
# code from lme4::isNested
206+
f1 <- as.factor(variable)
207+
f2 <- as.factor(group)
208+
k <- length(levels(f1))
209+
sm <- as(
210+
new("ngTMatrix",
211+
i = as.integer(f2) - 1L,
212+
j = as.integer(f1) - 1L,
213+
Dim = c(length(levels(f2)), k)
214+
),
215+
"CsparseMatrix"
216+
)
217+
if (all(sm@p[2:(k + 1L)] - sm@p[1:k] <= 1L)) {
218+
return("nested")
219+
}
220+
}
177221

178-
if (!insight::has_single_value(is_between)) return("both")
222+
# If each group has a different number of unique values,
223+
# then it is partially nested/crossed.
224+
if (!insight::has_single_value(n_uniques)) {
225+
return("both")
226+
}
179227

228+
# Is the variable crossed?
180229
variable_levels <- unique(variable)
181230
has_all <- tapply(variable, group, function(v) all(variable_levels %in% v))
182-
if (!all(has_all)) return("both")
231+
if (!all(has_all)) {
232+
return("both")
233+
}
183234

184-
if (only_balanced) {
185-
tab <- table(variable, group)
186-
is_balanced <- all(apply(tab, 2, insight::has_single_value))
187-
if (!is_balanced) {
188-
return("both")
189-
}
235+
if (fct_tolerance == "crossed") {
236+
return("within")
237+
}
238+
239+
# Is the variable crossed and balanced?
240+
tab <- table(variable, group)
241+
is_balanced <- all(apply(tab, 2, insight::has_single_value))
242+
if (!is_balanced) {
243+
return("both")
190244
}
245+
191246
return("within")
192247
}

man/check_group_variation.Rd

Lines changed: 41 additions & 38 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)