99# ' `by`).
1010# ' @param by Character vector (or formula) with the name of the variable that
1111# ' indicates the group- or cluster-ID. For cross-classified or nested designs,
12- # ' `by` can also identify two or more variables as group- or cluster-IDs. If
13- # ' the data is nested and should be treated as such, set `nested = TRUE`. Else,
14- # ' if `by` defines two or more variables and `nested = FALSE`, a cross-classified
15- # ' design is assumed.
16- # '
17- # ' For nested designs, `by` can be:
18- # ' - a character vector with the name of the variable that indicates the
19- # ' levels, ordered from *highest* level to *lowest* (e.g.
20- # ' `by = c("L4", "L3", "L2")`.
21- # ' - a character vector with variable names in the format `by = "L4/L3/L2"`,
22- # ' where the levels are separated by `/`.
23- # ' @param only_balanced ...
24- # ' @param tolerance The amount of variation (calculated by `var()`, i.e. the
12+ # ' `by` can also identify two or more variables as group- or cluster-IDs.
13+ # ' @param include_by When there is more than one grouping variable, should they
14+ # ' be check against each other?
15+ # ' @param numeric_as_factor Should numeric variables be tested as factors?
16+ # ' @param num_tolerance The amount of variation (calculated by `var()`, i.e. the
2517# ' variance of a variable) that is tolerated to indicate no within- or
2618# ' between-effect.
19+ # ' @param fct_tolerance How should a non-numeric variable be identified as
20+ # ' varying only "within" a grouping variable? Options are:
21+ # ' - `"crossed"` - if all groups have all unique values of X.
22+ # ' - `"balanced"` - if all groups have all unique values of X, _with equal frequency_.
23+
2724# '
2825# ' @details
29- # ' This function calls [`datawizard::demean()`] to calculate the within- and
30- # ' between-effects of variables specified in `select`, based on the groups
31- # ' indicated in `by`. Then, the variance for each variable's within- and
32- # ' between-effect is calculated. If the variance is larger than `tolerance`,
33- # ' a within- or between-effect is detected.
26+ # ' This function attempt to identify the hierarchical design of a dataset with
27+ # ' respect to grouping variables (`by`).
28+ # '
29+ # ' ## Numeric variables
30+ # ' Numeric variables are portioned via [`datawizard::demean()`] to their within-
31+ # ' and between-group components. Then, the variance for each variable's within-
32+ # ' and between-group component is calculated. Variable with within-group
33+ # ' variance larger than `num_tolerance` are labeled as _within_, variable with
34+ # ' between-group variance larger than `num_tolerance` are labeled as _between_,
35+ # ' and variables with both variances larger than `num_tolerance` are labeled as
36+ # ' _both_.
37+ # ' \cr\cr
38+ # ' Setting `numeric_as_factor = TRUE` causes numeric variables to be tested
39+ # ' using the following criteria.
40+ # '
41+ # ' ## Non-numeric variables
42+ # ' These variables can have one of the following 4 labels:
43+ # ' - _between_ - the variable is fixed (has exactly one unique, constant value) for each group.
44+ # ' - _nested_ - the variable varies within each group, with each group having their own set of
45+ # ' unique levels of the variable.
46+ # ' - _within_ - the variable is _crossed_ with the grouping variable - each value appear
47+ # ' within each group. The `fct_tolerance` argument controls if full balance is also required.
48+ # ' - _both_ - the variable is partially nested within the grouping variable (or, when
49+ # ' `fct_tolerance = "balanced"` the variable is fully crossed, but not perfectly balanced).
3450# '
35- # ' @return A list with at most three elements, `within`, `between`, and `both`,
36- # ' where each element contains the name of variables that have one of these
37- # ' effects.
51+ # ' @return A data frame with group, variable, and type columns.
3852# '
3953# ' @examples
40- # ' set.seed(1234)
41- # ' dat <- data.frame(
42- # ' id = rep(letters, each = 3),
43- # ' between_num = rep(rnorm(26), each = 3),
44- # ' within1_num = rep(rnorm(3), times = 26),
45- # ' within2_num = rep(rnorm(3), times = 26),
46- # ' both_num = rnorm(3 * 26)
47- # ' )
48- # ' check_group_variation(
49- # ' dat,
50- # ' select = c("between_num", "within1_num", "within2_num", "both_num"),
51- # ' by = "id"
52- # ' )
5354# ' @export
5455check_group_variation <- function (x ,
5556 select = NULL ,
5657 by = NULL ,
57- tolerance = 1e-4 ,
58- only_balanced = TRUE ) {
58+ include_by = FALSE ,
59+ numeric_as_factor = FALSE ,
60+ num_tolerance = 1e-4 ,
61+ fct_tolerance = " crossed" ) {
5962 insight :: check_if_installed(" datawizard" , minimum_version = " 0.12.0" )
6063
6164 if (inherits(select , " formula" )) {
@@ -80,6 +83,14 @@ check_group_variation <- function(x,
8083 select <- setdiff(colnames(x ), by )
8184 }
8285
86+ if (include_by && (length(by ) > 1L )) {
87+ select <- c(by , select )
88+ }
89+
90+ if (numeric_as_factor ) {
91+ x [select ] <- lapply(x [select ], as.factor )
92+ }
93+
8394 # create all combinations that should be checked
8495 combinations <- expand.grid(
8596 variable = select ,
@@ -95,8 +106,8 @@ check_group_variation <- function(x,
95106 x ,
96107 combinations [i , " group" ],
97108 combinations [i , " variable" ],
98- tolerance = tolerance ,
99- only_balanced = only_balanced
109+ num_tolerance = num_tolerance ,
110+ fct_tolerance = fct_tolerance
100111 )
101112 }
102113
@@ -112,33 +123,35 @@ check_group_variation <- function(x,
112123
113124# ' @export
114125print.check_group_variation <- function (x , ... ) {
115- if (insight :: n_unique(x $ group ) > 1L ) {
116- cat(insight :: export_table(
117- x ,
118- caption = c(" Check group variation" , " blue" ),
119- by = " group"
120- ))
121- } else {
122- x_new <- x
123- x_new $ group <- NULL
124- cat(insight :: export_table(
125- x_new ,
126- caption = c(sprintf(" Check %s variation" , x $ group [1 ]), " blue" )
127- ))
128- }
129-
130- return (invisible (x ))
126+ x_orig <- x
127+
128+ cap <- " Check group variation"
129+ by <- " group"
130+
131+ if (insight :: n_unique(x $ group ) == 1L ) {
132+ x $ group <- NULL
133+ cap <- sprintf(" Check %s variation" , x_orig $ group [1 ])
134+ by <- NULL
135+ }
136+
137+ cat(insight :: export_table(x , caption = c(cap , " blue" ), by = by , ... ))
138+
139+ return (invisible (x_orig ))
131140}
132141
133142
134143# utils -------------------------------------------------------------
135144
136145.check_nested <- function (data , by , predictor , ... ) {
146+ if (insight :: n_unique(data [[predictor ]]) == 1L ) {
147+ return (NA_character_ )
148+ }
149+
137150 UseMethod(" .check_nested" , data [[predictor ]])
138151}
139152
140153
141- .check_nested.numeric <- function (data , by , predictor , tolerance = 1e-05 , ... ) {
154+ .check_nested.numeric <- function (data , by , predictor , num_tolerance = 1e-05 , ... ) {
142155 # demean predictor
143156 d <- datawizard :: demean(
144157 data ,
@@ -152,41 +165,83 @@ print.check_group_variation <- function(x, ...) {
152165 within_name <- paste0(predictor , " _within" )
153166 between_name <- paste0(predictor , " _between" )
154167
155- is_between <- var(d [[between_name ]], na.rm = TRUE ) > tolerance
156- is_within <- var(d [[within_name ]], na.rm = TRUE ) > tolerance
168+ is_between <- var(d [[between_name ]], na.rm = TRUE ) > num_tolerance
169+ is_within <- var(d [[within_name ]], na.rm = TRUE ) > num_tolerance
157170 is_both <- is_between && is_within
158171
159- if (is_both ) return (" both" )
160- if (is_between ) return (" between" )
161- if (is_within ) return (" within" )
162- NULL
172+ if (is_both ) {
173+ return (" both" )
174+ }
175+ if (is_between ) {
176+ return (" between" )
177+ }
178+ if (is_within ) {
179+ return (" within" )
180+ }
181+
182+ NA_character_
163183}
164184
165185
166- .check_nested.default <- function (data , by , predictor , only_balanced = TRUE , ... ) {
186+ .check_nested.default <- function (data , by , predictor , fct_tolerance = c(" crossed" , " balanced" ), ... ) {
187+ fct_tolerance <- match.arg(fct_tolerance )
188+
167189 group <- data [[by ]]
168190 variable <- data [[predictor ]]
169191
170192 complete <- stats :: complete.cases(group , variable )
171193 group <- as.factor(group [complete ])
172194 variable <- variable [complete ]
173195
196+ # Is the variable fixed for each group?
174197 n_uniques <- tapply(variable , group , insight :: n_unique )
175198 is_between <- all(n_uniques == 1L )
176- if (is_between ) return (" between" )
199+ if (is_between ) {
200+ return (" between" )
201+ }
202+
203+ # Is the variable nested within each group?
204+ if (insight :: check_if_installed(" Matrix" , reason = " for checking nested designs" )) {
205+ # code from lme4::isNested
206+ f1 <- as.factor(variable )
207+ f2 <- as.factor(group )
208+ k <- length(levels(f1 ))
209+ sm <- as(
210+ new(" ngTMatrix" ,
211+ i = as.integer(f2 ) - 1L ,
212+ j = as.integer(f1 ) - 1L ,
213+ Dim = c(length(levels(f2 )), k )
214+ ),
215+ " CsparseMatrix"
216+ )
217+ if (all(sm @ p [2 : (k + 1L )] - sm @ p [1 : k ] < = 1L )) {
218+ return (" nested" )
219+ }
220+ }
177221
178- if (! insight :: has_single_value(is_between )) return (" both" )
222+ # If each group has a different number of unique values,
223+ # then it is partially nested/crossed.
224+ if (! insight :: has_single_value(n_uniques )) {
225+ return (" both" )
226+ }
179227
228+ # Is the variable crossed?
180229 variable_levels <- unique(variable )
181230 has_all <- tapply(variable , group , function (v ) all(variable_levels %in% v ))
182- if (! all(has_all )) return (" both" )
231+ if (! all(has_all )) {
232+ return (" both" )
233+ }
183234
184- if (only_balanced ) {
185- tab <- table(variable , group )
186- is_balanced <- all(apply(tab , 2 , insight :: has_single_value ))
187- if (! is_balanced ) {
188- return (" both" )
189- }
235+ if (fct_tolerance == " crossed" ) {
236+ return (" within" )
237+ }
238+
239+ # Is the variable crossed and balanced?
240+ tab <- table(variable , group )
241+ is_balanced <- all(apply(tab , 2 , insight :: has_single_value ))
242+ if (! is_balanced ) {
243+ return (" both" )
190244 }
245+
191246 return (" within" )
192247}
0 commit comments