diff --git a/DESCRIPTION b/DESCRIPTION index e01f37595..92571a5bf 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 1.2.0.8 +Version: 1.2.0.9 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NAMESPACE b/NAMESPACE index dfe242add..f10b5720d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,6 +8,8 @@ S3method(as.double,parameters_smoothness) S3method(as.numeric,parameters_kurtosis) S3method(as.numeric,parameters_skewness) S3method(as.numeric,parameters_smoothness) +S3method(as.prop.table,datawizard_crosstab) +S3method(as.prop.table,datawizard_crosstabs) S3method(as.table,datawizard_crosstab) S3method(as.table,datawizard_crosstabs) S3method(as.table,datawizard_table) @@ -230,6 +232,7 @@ S3method(winsorize,factor) S3method(winsorize,logical) S3method(winsorize,numeric) export(adjust) +export(as.prop.table) export(assign_labels) export(categorize) export(center) diff --git a/NEWS.md b/NEWS.md index 0267ea254..e8417386f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -10,6 +10,9 @@ BREAKING CHANGES CHANGES +* `data_tabulate()` now saves the table of proportions for crosstables as + attribute, accessible via the new `as.prop.table()` method (#656). + * Due to changes in the package `insight`, `data_tabulate()` no longer prints decimals when all values in a column are integers (#641). diff --git a/R/data_tabulate.R b/R/data_tabulate.R index cb9e865da..0a5c72972 100644 --- a/R/data_tabulate.R +++ b/R/data_tabulate.R @@ -32,11 +32,6 @@ #' used for large numbers. If `NULL` (default), a big mark is added automatically for #' large numbers (i.e. numbers with more than 5 digits). If you want to remove #' the big mark, set `big_mark = ""`. -#' @param simplify Logical, if `TRUE`, the returned table is simplified to a -#' single table object if there is only one frequency or contingency table -#' input. Else, always for multiple table inputs or when `simplify = FALSE`, a -#' list of tables is returned. This is only relevant for the `as.table()` -#' methods. To ensure consistent output, the default is `FALSE`. #' @param object An object returned by `data_tabulate()`. #' @param format String, indicating the output format. Can be `"markdown"` #' `"html"`, or `"tt"`. `format = "html"` create an HTML table using the *gt* @@ -52,11 +47,12 @@ #' data frame. The structure of the returned object is a nested data frame, #' where the first column contains name of the variable for which frequencies #' were calculated, and the second column is a list column that contains the -#' frequency tables as data frame. See 'Examples'. +#' frequency tables as data frame. See [as.table.datawizard_table]. #' #' There is also an `as.table()` method, which returns a table object with the #' frequencies of the variable. This is useful for further statistical analysis, -#' e.g. for using `chisq.test()` on the frequency table. See 'Examples'. +#' e.g. for using `chisq.test()` on the frequency table. See +#' [as.table.datawizard_table]. #' #' @section Crosstables: #' If `by` is supplied, a crosstable is created. The crosstable includes `` @@ -77,6 +73,8 @@ #' @return A data frame, or a list of data frames, with one frequency table #' as data frame per variable. #' +#' @seealso [as.prop.table] +#' #' @examplesIf requireNamespace("poorman") #' # frequency tables ------- #' # ------------------------ @@ -139,28 +137,6 @@ #' # round percentages #' out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") #' print(out, digits = 0) -#' -#' # coerce to data frames -#' result <- data_tabulate(efc, "c172code", by = "e16sex") -#' as.data.frame(result) -#' as.data.frame(result)$table -#' as.data.frame(result, add_total = TRUE)$table -#' -#' # post-processing ------ -#' # ---------------------- -#' -#' out <- data_tabulate(efc, "c172code", by = "e16sex") -#' # we need to simplify the output, else we get a list of tables -#' suppressWarnings(chisq.test(as.table(out, simplify = TRUE))) -#' -#' # apply chisq.test to each table -#' out <- data_tabulate(efc, c("c172code", "e16sex")) -#' suppressWarnings(lapply(as.table(out), chisq.test)) -#' -#' # can also handle grouped data frames -#' d <- data_group(mtcars, "am") -#' x <- data_tabulate(d, "cyl", by = "gear") -#' as.table(x) #' @export data_tabulate <- function(x, ...) { UseMethod("data_tabulate") @@ -464,10 +440,147 @@ insight::print_md #' @export insight::display +#' Convert a crosstable to a frequency or a propensity table +#' +#' @description +#' `as.prop.table()` is an S3 generic. It can be used on objects of class +#' `datawizard_crosstab` created by `data_tabulate()` when it was run with the +#' arguments `by` and `proportions`. +#' +#' @param x An object created by `data_tabulate()`. It must be of class +#' `datawizard_crosstab` for `as.prop.table()`. +#' @param simplify Logical, if `TRUE`, the returned table is simplified to a +#' single table object if there is only one frequency or contingency table +#' input. Else, always for multiple table inputs or when `simplify = FALSE`, a +#' list of tables is returned. This is only relevant for the `as.table()` +#' methods. To ensure consistent output, the default is `FALSE`. +#' @inheritParams data_tabulate +#' +#' @export +#' @seealso [data_tabulate] +#' +#' @examples +#' data(efc) +#' +#' # Some cross tabulation +#' cross <- data_tabulate(efc, select = "e42dep", by = "c172code", proportions = "row") +#' cross +#' +#' # Convert to a propensity table +#' as.prop.table(cross) +#' +#' # Convert to data.frame +#' result <- data_tabulate(efc, "c172code", by = "e16sex") +#' as.data.frame(result) +#' as.data.frame(result)$table +#' as.data.frame(result, add_total = TRUE)$table +#' +#' # Convert to a table that can be passed to chisq.test() +#' +#' out <- data_tabulate(efc, "c172code", by = "e16sex") +#' # we need to simplify the output, else we get a list of tables +#' tbl <- as.table(out, simplify = TRUE) +#' tbl +#' suppressWarnings(chisq.test(tbl)) +#' +#' # apply chisq.test to each table +#' out <- data_tabulate(efc, c("c172code", "e16sex")) +#' suppressWarnings(lapply(as.table(out), chisq.test)) +#' +#' # can also handle grouped data frames +#' d <- data_group(mtcars, "am") +#' x <- data_tabulate(d, "cyl", by = "gear") +#' as.table(x) +as.prop.table <- function(x, ...) { + UseMethod("as.prop.table") +} + +#' @rdname as.prop.table +#' @export +as.prop.table.datawizard_crosstab <- function( + x, + remove_na = TRUE, + simplify = FALSE, + verbose = TRUE, + ... +) { + # sanity check - the `.data.frame` method returns a list, but not the + # default method + if (!is.data.frame(x)) { + x <- x[[1]] + } + prop_table <- attributes(x)$prop_table + + if (is.null(prop_table)) { + insight::format_warning("No proportions available.") + return(NULL) + } + + if (remove_na) { + if ( + verbose && + ("NA" %in% colnames(prop_table) || "NA" %in% rownames(prop_table)) + ) { + insight::format_alert("Removing NA values from frequency table.") + } + if (!is.null(prop_table[["NA"]])) { + prop_table[["NA"]] <- NULL + } + if ("NA" %in% rownames(prop_table)) { + prop_table <- prop_table[rownames(prop_table) != "NA", ] + } + } + # coerce to table + result <- as.table(as.matrix(prop_table)) + # if we don't want to simplify the table, we wrap it into a list + if (!simplify) { + result <- list(result) + } + + result +} + +#' @export +as.prop.table.datawizard_crosstabs <- function( + x, + remove_na = TRUE, + simplify = FALSE, + verbose = TRUE, + ... +) { + # only show message once we set `verbose = FALSE` in the lapply() + if (remove_na && verbose) { + prop_table <- attributes(x[[1]])$prop_table + if ("NA" %in% colnames(prop_table) || "NA" %in% rownames(prop_table)) { + insight::format_alert("Removing NA values from frequency table.") + } + } + + out <- insight::compact_list(lapply( + x, + as.prop.table.datawizard_crosstab, + remove_na = remove_na, + simplify = TRUE, + verbose = FALSE, + ... + )) + + # if no proportions found, return NULL + if (!length(out)) { + return(NULL) + } + + # if only one table is returned, "unlist" + if (length(out) == 1 && simplify) { + out <- out[[1]] + } + out +} + # as.data.frame -------------------- -#' @rdname data_tabulate +#' @rdname as.prop.table #' @param add_total For crosstables (i.e. when `by` is not `NULL`), a row and #' column with the total N values are added to the data frame. `add_total` has #' no effect in `as.data.frame()` for simple frequency tables. @@ -527,7 +640,7 @@ as.data.frame.datawizard_crosstabs <- as.data.frame.datawizard_tables # as.table -------------------- -#' @rdname data_tabulate +#' @rdname as.prop.table #' @export as.table.datawizard_table <- function( x, diff --git a/R/data_xtabulate.R b/R/data_xtabulate.R index a1b4ac1ec..f29387b36 100644 --- a/R/data_xtabulate.R +++ b/R/data_xtabulate.R @@ -90,12 +90,100 @@ attr(out, "proportions") <- proportions attr(out, "varname") <- obj_name attr(out, "grouped_df") <- !is.null(group_variable) + attr(out, "prop_table") <- .prop_table(out) class(out) <- c("datawizard_crosstab", "data.frame") out } +# Helper function to calculate a table of proportions from a frequency table +.prop_table <- function(x) { + # Extract the "proportions" attribute, which determines the type of calculation (row, column, or full) + props <- attributes(x)$proportions + out <- NULL + + # Proceed only if the "proportions" attribute is set + if (!is.null(props)) { + # Identify numeric columns, as proportions are only calculated for these + numeric_columns <- vapply(x, is.numeric, logical(1)) + # Get the total count from the attributes, used for "full" proportions + total_n <- attributes(x)$total_n + + # Use a switch to perform the calculation based on the "props" value + out <- switch( + props, + # Calculate row-wise proportions + row = lapply(seq_len(nrow(x)), function(i) { + # Sum of the current row's numeric values + row_sum <- sum(x[i, numeric_columns], na.rm = TRUE) + # Avoid division by zero; if row sum is 0, return a row of zeros + if (row_sum == 0) { + tmp <- as.data.frame(as.list(rep(0, sum(numeric_columns)))) + # for later rbind, we need identical column names + colnames(tmp) <- colnames(x)[numeric_columns] + tmp + } else { + x[i, numeric_columns] / row_sum + } + }), + # Calculate column-wise proportions + column = lapply(seq_len(ncol(x))[numeric_columns], function(i) { + # Sum of the current column's values + col_sum <- sum(x[, i], na.rm = TRUE) + # Avoid division by zero; if column sum is 0, return a vector of zeros + if (col_sum == 0) { + rep(0, nrow(x)) + } else { + x[, i] / col_sum + } + }), + # Calculate proportions relative to the total count of the entire table + full = lapply(seq_len(ncol(x))[numeric_columns], function(i) { + # Avoid division by zero; if total is 0, return a vector of zeros + if (total_n == 0) { + rep(0, nrow(x)) + } else { + x[, i] / total_n + } + }) + ) + } + + # If a proportion table was calculated, format it into a data frame + if (!is.null(out)) { + # The output of the switch is a list. We need to bind it into a data frame. + # For row proportions, we bind rows. For column/full, we bind columns. + out <- switch( + props, + row = as.data.frame(do.call(rbind, out)), + as.data.frame(do.call(cbind, out)) + ) + # Set the column names of the new proportion table + colnames(out) <- colnames(x)[numeric_columns] + + # Check if the dimensions are consistent before setting row names + if (nrow(out) == nrow(x)) { + # If the first column of the original data is not numeric, it's likely a + # label column. Use these labels as row names in the output for better + # readability. This is useful for identifying rows, especially when NAs + # are present. + if (isFALSE(numeric_columns[1])) { + r_names <- x[[1]] + r_names <- as.character(r_names) + # Replace NA in labels with the string "NA", else we cannot set rownames + r_names[is.na(r_names)] <- "NA" + rownames(out) <- r_names + } else { + # Otherwise, just use the original row names + rownames(out) <- rownames(x) + } + } + } + + out +} + # methods --------------------- @@ -113,7 +201,7 @@ format.datawizard_crosstab <- function( x <- as.data.frame(x) # find numeric columns, only for these we need row/column sums - numeric_columns <- vapply(x, is.numeric, logical(1)) + numeric_columns <- which(vapply(x, is.numeric, logical(1))) # compute total N for rows and columns total_n <- attributes(x)$total_n @@ -122,55 +210,18 @@ format.datawizard_crosstab <- function( # proportions? props <- attributes(x)$proportions - - if (!is.null(props)) { - # we copy x to tmp, because when we create strings with "sprintf()", the - # variable is coerced to character, and in subsequent iterations of the loop, - # mathemathical operations are not possible anymore - tmp <- x - if (identical(props, "row")) { - for (i in seq_len(nrow(x))) { - row_sum <- sum(x[i, numeric_columns], na.rm = TRUE) - if (row_sum == 0) { - row_sum_string <- "(0%)" - } else { - row_sum_string <- sprintf( - "(%.*f%%)", - digits, - 100 * x[i, numeric_columns] / row_sum - ) - } - tmp[i, numeric_columns] <- paste( - format(x[i, numeric_columns]), - format(row_sum_string, justify = "right") + prop_table <- attributes(x)$prop_table + + if (!is.null(props) && !is.null(prop_table)) { + for (i in seq_len(ncol(prop_table))) { + x[, numeric_columns[i]] <- paste( + format(x[, numeric_columns[i]]), + format( + sprintf("(%.*f%%)", digits, 100 * prop_table[, i]), + justify = "right" ) - } - } else if (identical(props, "column")) { - for (i in seq_len(ncol(x))[numeric_columns]) { - col_sum <- sum(x[, i], na.rm = TRUE) - if (col_sum == 0) { - col_sum_string <- "(0%)" - } else { - col_sum_string <- sprintf("(%.*f%%)", digits, 100 * x[, i] / col_sum) - } - tmp[, i] <- paste( - format(x[, i]), - format(col_sum_string, justify = "right") - ) - } - } else if (identical(props, "full")) { - for (i in seq_len(ncol(x))[numeric_columns]) { - tmp[, i] <- paste( - format(x[, i]), - format( - sprintf("(%.*f%%)", digits, 100 * x[, i] / total_n), - justify = "right" - ) - ) - } + ) } - # copy back final result - x <- tmp } x[] <- lapply(x, as.character) diff --git a/R/utils.R b/R/utils.R index 6083ee562..a42b1a630 100644 --- a/R/utils.R +++ b/R/utils.R @@ -163,7 +163,7 @@ for (nm in setdiff(names(custom_attr), names(attributes(data.frame())))) { attr(data, which = nm) <- custom_attr[[nm]] } - return(data) + data } @@ -241,7 +241,7 @@ } -#' Taken from https://github.com/coolbutuseless/gluestick [licence: MIT] +#' Taken from https://github.com/coolbutuseless/gluestick (licence: MIT) #' Same functionality as `{glue}` #' #' @noRd @@ -292,15 +292,15 @@ # Evaluate if (eval) { - args <- lapply(exprs, function(expr) { + fun_args <- lapply(exprs, function(expr) { eval(parse(text = expr), envir = src) }) } else { - args <- unname(mget(exprs, envir = as.environment(src))) + fun_args <- unname(mget(exprs, envir = as.environment(src))) } # Create the string(s) - do.call(sprintf, c(list(fmt_sprintf), args)) + do.call(sprintf, c(list(fmt_sprintf), fun_args)) } diff --git a/man/as.prop.table.Rd b/man/as.prop.table.Rd new file mode 100644 index 000000000..1cf84f1ca --- /dev/null +++ b/man/as.prop.table.Rd @@ -0,0 +1,103 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/data_tabulate.R +\name{as.prop.table} +\alias{as.prop.table} +\alias{as.prop.table.datawizard_crosstab} +\alias{as.data.frame.datawizard_tables} +\alias{as.table.datawizard_table} +\title{Convert a crosstable to a frequency or a propensity table} +\usage{ +as.prop.table(x, ...) + +\method{as.prop.table}{datawizard_crosstab}(x, remove_na = TRUE, simplify = FALSE, verbose = TRUE, ...) + +\method{as.data.frame}{datawizard_tables}( + x, + row.names = NULL, + optional = FALSE, + ..., + stringsAsFactors = FALSE, + add_total = FALSE +) + +\method{as.table}{datawizard_table}(x, remove_na = TRUE, simplify = FALSE, verbose = TRUE, ...) +} +\arguments{ +\item{x}{An object created by \code{data_tabulate()}. It must be of class +\code{datawizard_crosstab} for \code{as.prop.table()}.} + +\item{...}{not used.} + +\item{remove_na}{Logical, if \code{FALSE}, missing values are included in the +frequency or crosstable, else missing values are omitted. Note that the +default for the \code{as.table()} method is \code{remove_na = TRUE}, so that missing +values are not included in the returned table, which makes more sense for +post-processing of the table, e.g. using \code{chisq.test()}.} + +\item{simplify}{Logical, if \code{TRUE}, the returned table is simplified to a +single table object if there is only one frequency or contingency table +input. Else, always for multiple table inputs or when \code{simplify = FALSE}, a +list of tables is returned. This is only relevant for the \code{as.table()} +methods. To ensure consistent output, the default is \code{FALSE}.} + +\item{verbose}{Toggle warnings and messages.} + +\item{row.names}{\code{NULL} or a character vector giving the row + names for the data frame. Missing values are not allowed.} + +\item{optional}{logical. If \code{TRUE}, setting row names and + converting column names (to syntactic names: see + \code{\link[base]{make.names}}) is optional. Note that all of \R's + \pkg{base} package \code{as.data.frame()} methods use + \code{optional} only for column names treatment, basically with the + meaning of \code{\link[base]{data.frame}(*, check.names = !optional)}. + See also the \code{make.names} argument of the \code{matrix} method.} + +\item{stringsAsFactors}{logical: should the character vector be converted + to a factor?} + +\item{add_total}{For crosstables (i.e. when \code{by} is not \code{NULL}), a row and +column with the total N values are added to the data frame. \code{add_total} has +no effect in \code{as.data.frame()} for simple frequency tables.} +} +\description{ +\code{as.prop.table()} is an S3 generic. It can be used on objects of class +\code{datawizard_crosstab} created by \code{data_tabulate()} when it was run with the +arguments \code{by} and \code{proportions}. +} +\examples{ +data(efc) + +# Some cross tabulation +cross <- data_tabulate(efc, select = "e42dep", by = "c172code", proportions = "row") +cross + +# Convert to a propensity table +as.prop.table(cross) + +# Convert to data.frame +result <- data_tabulate(efc, "c172code", by = "e16sex") +as.data.frame(result) +as.data.frame(result)$table +as.data.frame(result, add_total = TRUE)$table + +# Convert to a table that can be passed to chisq.test() + +out <- data_tabulate(efc, "c172code", by = "e16sex") +# we need to simplify the output, else we get a list of tables +tbl <- as.table(out, simplify = TRUE) +tbl +suppressWarnings(chisq.test(tbl)) + +# apply chisq.test to each table +out <- data_tabulate(efc, c("c172code", "e16sex")) +suppressWarnings(lapply(as.table(out), chisq.test)) + +# can also handle grouped data frames +d <- data_group(mtcars, "am") +x <- data_tabulate(d, "cyl", by = "gear") +as.table(x) +} +\seealso{ +\link{data_tabulate} +} diff --git a/man/data_tabulate.Rd b/man/data_tabulate.Rd index 75b3fd1e1..87656e17f 100644 --- a/man/data_tabulate.Rd +++ b/man/data_tabulate.Rd @@ -4,8 +4,6 @@ \alias{data_tabulate} \alias{data_tabulate.default} \alias{data_tabulate.data.frame} -\alias{as.data.frame.datawizard_tables} -\alias{as.table.datawizard_table} \alias{print.datawizard_table} \alias{display.datawizard_table} \title{Create frequency and crosstables of variables} @@ -40,17 +38,6 @@ data_tabulate(x, ...) ... ) -\method{as.data.frame}{datawizard_tables}( - x, - row.names = NULL, - optional = FALSE, - ..., - stringsAsFactors = FALSE, - add_total = FALSE -) - -\method{as.table}{datawizard_table}(x, remove_na = TRUE, simplify = FALSE, verbose = TRUE, ...) - \method{print}{datawizard_table}(x, big_mark = NULL, ...) \method{display}{datawizard_table}(object, big_mark = NULL, format = "markdown", ...) @@ -146,30 +133,6 @@ functions (see 'Details'), this argument may be used as workaround.} \item{collapse}{Logical, if \code{TRUE} collapses multiple tables into one larger table for printing. This affects only printing, not the returned object.} -\item{row.names}{\code{NULL} or a character vector giving the row - names for the data frame. Missing values are not allowed.} - -\item{optional}{logical. If \code{TRUE}, setting row names and - converting column names (to syntactic names: see - \code{\link[base]{make.names}}) is optional. Note that all of \R's - \pkg{base} package \code{as.data.frame()} methods use - \code{optional} only for column names treatment, basically with the - meaning of \code{\link[base]{data.frame}(*, check.names = !optional)}. - See also the \code{make.names} argument of the \code{matrix} method.} - -\item{stringsAsFactors}{logical: should the character vector be converted - to a factor?} - -\item{add_total}{For crosstables (i.e. when \code{by} is not \code{NULL}), a row and -column with the total N values are added to the data frame. \code{add_total} has -no effect in \code{as.data.frame()} for simple frequency tables.} - -\item{simplify}{Logical, if \code{TRUE}, the returned table is simplified to a -single table object if there is only one frequency or contingency table -input. Else, always for multiple table inputs or when \code{simplify = FALSE}, a -list of tables is returned. This is only relevant for the \code{as.table()} -methods. To ensure consistent output, the default is \code{FALSE}.} - \item{big_mark}{Optional character string, indicating the big mark that is used for large numbers. If \code{NULL} (default), a big mark is added automatically for large numbers (i.e. numbers with more than 5 digits). If you want to remove @@ -198,11 +161,12 @@ There is an \code{as.data.frame()} method, to return the frequency tables as a data frame. The structure of the returned object is a nested data frame, where the first column contains name of the variable for which frequencies were calculated, and the second column is a list column that contains the -frequency tables as data frame. See 'Examples'. +frequency tables as data frame. See \link{as.table.datawizard_table}. There is also an \code{as.table()} method, which returns a table object with the frequencies of the variable. This is useful for further statistical analysis, -e.g. for using \code{chisq.test()} on the frequency table. See 'Examples'. +e.g. for using \code{chisq.test()} on the frequency table. See +\link{as.table.datawizard_table}. } \note{ There are \code{print_html()} and \code{print_md()} methods available for printing @@ -285,27 +249,8 @@ data_tabulate( # round percentages out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") print(out, digits = 0) - -# coerce to data frames -result <- data_tabulate(efc, "c172code", by = "e16sex") -as.data.frame(result) -as.data.frame(result)$table -as.data.frame(result, add_total = TRUE)$table - -# post-processing ------ -# ---------------------- - -out <- data_tabulate(efc, "c172code", by = "e16sex") -# we need to simplify the output, else we get a list of tables -suppressWarnings(chisq.test(as.table(out, simplify = TRUE))) - -# apply chisq.test to each table -out <- data_tabulate(efc, c("c172code", "e16sex")) -suppressWarnings(lapply(as.table(out), chisq.test)) - -# can also handle grouped data frames -d <- data_group(mtcars, "am") -x <- data_tabulate(d, "cyl", by = "gear") -as.table(x) \dontshow{\}) # examplesIf} } +\seealso{ +\link{as.prop.table} +} diff --git a/pkgdown/_pkgdown.yaml b/pkgdown/_pkgdown.yaml index 01b701f03..9d6308271 100644 --- a/pkgdown/_pkgdown.yaml +++ b/pkgdown/_pkgdown.yaml @@ -61,6 +61,7 @@ reference: desc: | Functions to compute statistical summaries of data properties and distributions contents: + - as.prop.table - data_codebook - data_summary - data_tabulate diff --git a/tests/testthat/_snaps/data_tabulate.md b/tests/testthat/_snaps/data_tabulate.md index c83b293a1..872b3764c 100644 --- a/tests/testthat/_snaps/data_tabulate.md +++ b/tests/testthat/_snaps/data_tabulate.md @@ -516,17 +516,17 @@ Code print(data_tabulate(efc, c("c172code", "e42dep"), by = "e16sex", proportions = "row")) Output - Variable | Value | male | female | | Total - ---------+-------+------------+------------+------------+------ - c172code | 1 | 5 (62.5%) | 2 (25.0%) | 1 (12.5%) | 8 - c172code | 2 | 31 (47.0%) | 33 (50.0%) | 2 (3.0%) | 66 - c172code | 3 | 4 (25.0%) | 11 (68.8%) | 1 (6.2%) | 16 - c172code | | 5 (50.0%) | 4 (40.0%) | 1 (10.0%) | 10 - e42dep | 1 | 2 (100.0%) | 0 (0.0%) | 0 (0.0%) | 2 - e42dep | 2 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) | 4 - e42dep | 3 | 8 (28.6%) | 18 (64.3%) | 2 (7.1%) | 28 - e42dep | 4 | 32 (50.8%) | 28 (44.4%) | 3 (4.8%) | 63 - e42dep | | 1 (33.3%) | 2 (66.7%) | 0 (0.0%) | 3 + Variable | Value | male | female | | Total + ---------+-------+-------------+------------+-----------+------ + c172code | 1 | 5 (62.5%) | 2 (25.0%) | 1 (12.5%) | 8 + c172code | 2 | 31 (47.0%) | 33 (50.0%) | 2 (3.0%) | 66 + c172code | 3 | 4 (25.0%) | 11 (68.8%) | 1 (6.2%) | 16 + c172code | | 5 (50.0%) | 4 (40.0%) | 1 (10.0%) | 10 + e42dep | 1 | 2 (100.0%) | 0 (0.0%) | 0 (0.0%) | 2 + e42dep | 2 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) | 4 + e42dep | 3 | 8 (28.6%) | 18 (64.3%) | 2 (7.1%) | 28 + e42dep | 4 | 32 (50.8%) | 28 (44.4%) | 3 (4.8%) | 63 + e42dep | | 1 (33.3%) | 2 (66.7%) | 0 (0.0%) | 3 # data_tabulate, cross tables, tinytable @@ -655,17 +655,17 @@ Output Grouped by e42dep (1) - Variable | Value | male | female | | Total - ---------+-------+------------+--------+------------+------ - c172code | 2 | 2 (100.0%) | | 0 (0.0%) | 2 - | | 0 (0%) | | 0 (0%) | 0 + Variable | Value | male | female | | Total + ---------+-------+------------+--------+----------+------ + c172code | 2 | 2 (100.0%) | | 0 (0.0%) | 2 + | | 0 (0.0%) | | 0 (0.0%) | 0 Grouped by e42dep (2) - Variable | Value | male | female | | Total - ---------+-------+-----------+-----------+-----------+------ - c172code | 2 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) | 4 - | | 0 (0%) | 0 (0%) | 0 (0%) | 0 + Variable | Value | male | female | | Total + ---------+-------+-----------+-----------+----------+------ + c172code | 2 | 2 (50.0%) | 2 (50.0%) | 0 (0.0%) | 4 + | | 0 (0.0%) | 0 (0.0%) | 0 (0.0%) | 0 Grouped by e42dep (3) @@ -687,10 +687,10 @@ Grouped by e42dep (NA) - Variable | Value | male | female | | Total - ---------+-------+------------+------------+------------+------ - c172code | 2 | 0 (0.0%) | 2 (100.0%) | 0 (0.0%) | 2 - | | 1 (100.0%) | 0 (0.0%) | 0 (0.0%) | 1 + Variable | Value | male | female | | Total + ---------+-------+------------+------------+----------+------ + c172code | 2 | 0 (0.0%) | 2 (100.0%) | 0 (0.0%) | 2 + | | 1 (100.0%) | 0 (0.0%) | 0 (0.0%) | 1 # data_tabulate, cross tables, print/format works @@ -796,17 +796,17 @@ Output - |Variable | Value| male| female| (NA)| Total| - |:--------|-----:|----------:|----------:|----------:|-----:| - |c172code | 1| 5 (62.5%)| 2 (25.0%)| 1 (12.5%)| 8| - |c172code | 2| 31 (47.0%)| 33 (50.0%)| 2 (3.0%)| 66| - |c172code | 3| 4 (25.0%)| 11 (68.8%)| 1 (6.2%)| 16| - |c172code | (NA)| 5 (50.0%)| 4 (40.0%)| 1 (10.0%)| 10| - |e42dep | 1| 2 (100.0%)| 0 (0.0%)| 0 (0.0%)| 2| - |e42dep | 2| 2 (50.0%)| 2 (50.0%)| 0 (0.0%)| 4| - |e42dep | 3| 8 (28.6%)| 18 (64.3%)| 2 (7.1%)| 28| - |e42dep | 4| 32 (50.8%)| 28 (44.4%)| 3 (4.8%)| 63| - |e42dep | (NA)| 1 (33.3%)| 2 (66.7%)| 0 (0.0%)| 3| + |Variable | Value| male| female| (NA) | Total| + |:--------|-----:|-----------:|----------:|:---------|-----:| + |c172code | 1| 5 (62.5%)| 2 (25.0%)|1 (12.5%) | 8| + |c172code | 2| 31 (47.0%)| 33 (50.0%)|2 (3.0%) | 66| + |c172code | 3| 4 (25.0%)| 11 (68.8%)|1 (6.2%) | 16| + |c172code | (NA)| 5 (50.0%)| 4 (40.0%)|1 (10.0%) | 10| + |e42dep | 1| 2 (100.0%)| 0 (0.0%)| 0 (0.0%) | 2| + |e42dep | 2| 2 (50.0%)| 2 (50.0%)| 0 (0.0%) | 4| + |e42dep | 3| 8 (28.6%)| 18 (64.3%)| 2 (7.1%) | 28| + |e42dep | 4| 32 (50.8%)| 28 (44.4%)| 3 (4.8%) | 63| + |e42dep | (NA)| 1 (33.3%)| 2 (66.7%)| 0 (0.0%) | 3| --- @@ -831,31 +831,31 @@ Output - |Variable | Value| male| female| (NA)| Total| - |:--------|-----:|----------:|----------:|----------:|-----:| - |c172code | 1| 5 (62.5%)| 2 (25.0%)| 1 (12.5%)| 8| - |c172code | 2| 31 (47.0%)| 33 (50.0%)| 2 (3.0%)| 66| - |c172code | 3| 4 (25.0%)| 11 (68.8%)| 1 (6.2%)| 16| - |c172code | (NA)| 5 (50.0%)| 4 (40.0%)| 1 (10.0%)| 10| - |e42dep | 1| 2 (100.0%)| 0 (0.0%)| 0 (0.0%)| 2| - |e42dep | 2| 2 (50.0%)| 2 (50.0%)| 0 (0.0%)| 4| - |e42dep | 3| 8 (28.6%)| 18 (64.3%)| 2 (7.1%)| 28| - |e42dep | 4| 32 (50.8%)| 28 (44.4%)| 3 (4.8%)| 63| - |e42dep | (NA)| 1 (33.3%)| 2 (66.7%)| 0 (0.0%)| 3| + |Variable | Value| male| female| (NA) | Total| + |:--------|-----:|-----------:|----------:|:---------|-----:| + |c172code | 1| 5 (62.5%)| 2 (25.0%)|1 (12.5%) | 8| + |c172code | 2| 31 (47.0%)| 33 (50.0%)|2 (3.0%) | 66| + |c172code | 3| 4 (25.0%)| 11 (68.8%)|1 (6.2%) | 16| + |c172code | (NA)| 5 (50.0%)| 4 (40.0%)|1 (10.0%) | 10| + |e42dep | 1| 2 (100.0%)| 0 (0.0%)| 0 (0.0%) | 2| + |e42dep | 2| 2 (50.0%)| 2 (50.0%)| 0 (0.0%) | 4| + |e42dep | 3| 8 (28.6%)| 18 (64.3%)| 2 (7.1%) | 28| + |e42dep | 4| 32 (50.8%)| 28 (44.4%)| 3 (4.8%) | 63| + |e42dep | (NA)| 1 (33.3%)| 2 (66.7%)| 0 (0.0%) | 3| # data_tabulate, correct 0% for proportions Code print(out[[1]]) Output - c172code | male | female | | Total - ---------+------------+------------+--------+------ - 1 | 5 (10.9%) | 3 (5.6%) | 0 (0%) | 8 - 2 | 32 (69.6%) | 34 (63.0%) | 0 (0%) | 66 - 3 | 4 (8.7%) | 12 (22.2%) | 0 (0%) | 16 - | 5 (10.9%) | 5 (9.3%) | 0 (0%) | 10 - ---------+------------+------------+--------+------ - Total | 46 | 54 | 0 | 100 + c172code | male | female | | Total + ---------+------------+------------+----------+------ + 1 | 5 (10.9%) | 3 (5.6%) | 0 (0.0%) | 8 + 2 | 32 (69.6%) | 34 (63.0%) | 0 (0.0%) | 66 + 3 | 4 (8.7%) | 12 (22.2%) | 0 (0.0%) | 16 + | 5 (10.9%) | 5 (9.3%) | 0 (0.0%) | 10 + ---------+------------+------------+----------+------ + Total | 46 | 54 | 0 | 100 # data_tabulate, table methods diff --git a/tests/testthat/test-data_tabulate.R b/tests/testthat/test-data_tabulate.R index 20193d68d..5ac4bad63 100644 --- a/tests/testthat/test-data_tabulate.R +++ b/tests/testthat/test-data_tabulate.R @@ -818,7 +818,7 @@ test_that("data_tabulate, correct 0% for proportions", { out <- data_tabulate(efc, "c172code", by = "e16sex", proportions = "column") expect_identical( format(out[[1]])[[4]], - c("0 (0%)", "0 (0%)", "0 (0%)", "0 (0%)", "", "0") + c("0 (0.0%)", "0 (0.0%)", "0 (0.0%)", "0 (0.0%)", "", "0") ) expect_snapshot(print(out[[1]])) }) @@ -1052,3 +1052,54 @@ test_that("data_tabulate, table methods, only warn if necessary", { d <- data_group(mtcars, "am") expect_silent(as.table(data_tabulate(d, "cyl", by = "gear"))) }) + + +test_that("data_tabulate, cross tables, extract proportions", { + data(efc, package = "datawizard") + out <- data_tabulate( + efc, + "c172code", + by = "e16sex", + proportions = "row", + remove_na = TRUE + ) + tab <- table(efc$c172code, efc$e16sex) / + rowSums(table(efc$c172code, efc$e16sex)) + dimnames(tab) <- list(c("1", "2", "3"), c("male", "female")) + expect_equal( + as.prop.table(out, verbose = FALSE), + list(tab), + ignore_attr = TRUE, + tolerance = 1e-4 + ) + expect_equal( + as.prop.table(out, verbose = FALSE, simplify = TRUE), + tab, + ignore_attr = TRUE, + tolerance = 1e-4 + ) + out <- data_tabulate( + efc, + "c172code", + by = "e16sex", + proportions = "col", + remove_na = TRUE + ) + tab <- as.table(t( + t(table(efc$c172code, efc$e16sex)) / + colSums(table(efc$c172code, efc$e16sex)) + )) + dimnames(tab) <- list(c("1", "2", "3"), c("male", "female")) + expect_equal( + as.prop.table(out, verbose = FALSE), + list(tab), + ignore_attr = TRUE, + tolerance = 1e-4 + ) + expect_equal( + as.prop.table(out, verbose = FALSE, simplify = TRUE), + tab, + ignore_attr = TRUE, + tolerance = 1e-4 + ) +})