diff --git a/NEWS.md b/NEWS.md index 001ec4e5e1..b07057f90c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,47 @@ # dplyr (development version) +* `.by` is a new experimental inline alternative to `group_by()` that supports + _temporary_ grouping in the following key dplyr verbs: `mutate()`, + `summarise()`, `filter()`, and the `slice()` family (#6528). + + Rather than: + + ``` + starwars %>% + group_by(species, homeworld) %>% + summarise(mean_height = mean(height)) + ``` + + You can now write: + + ``` + starwars %>% + summarise( + mean_height = mean(height), + .by = c(species, homeworld) + ) + ``` + + The most useful reason to do this is because grouping with `.by` is + _temporary_ and only affects the verb it is being applied to. An ungrouped + data frame went into the `summarise()` call, so an ungrouped data frame will + come out; with `.by`, you never need to remember to `ungroup()` afterwards. + + Additionally, using `summarise()` or `slice()` with `.by` will never sort the + results by the group key, unlike with `group_by()`. Instead, the results are + returned using the existing ordering of the groups from the original data. We + feel this is more predictable, better maintains any ordering you might have + already applied with a previous call to `arrange()`, and provides a way to + maintain the current ordering without having to resort to factors. + + This exciting feature was inspired by + [data.table](https://CRAN.R-project.org/package=data.table), where the + equivalent syntax looks like: + + ``` + starwars[, .(mean_height = mean(height)), by = .(species, homeworld)] + ``` + * `summarise()` now correctly recycles named 0-column data frames (#6509). * `.cols` and `.fns` are now required arguments in `across()`, `c_across()`, diff --git a/R/by.R b/R/by.R new file mode 100644 index 0000000000..7d291e07e2 --- /dev/null +++ b/R/by.R @@ -0,0 +1,126 @@ +#' Helper for consistent documentation of `.by` +#' +#' Use `@inheritParams args_by` to consistently document `.by`. +#' +#' @param .by `r lifecycle::badge("experimental")` +#' +#' <[`tidy-select`][dplyr_tidy_select]> Optionally, a selection of columns to +#' temporarily group by using an inline alternative to [group_by()]. For +#' details and examples, see [?dplyr_by][dplyr_by]. +#' +#' @name args_by +#' @keywords internal +NULL + +#' Temporary grouping with `.by` +#' +#' ```{r, echo = FALSE, results = "asis"} +#' result <- rlang::with_options( +#' knitr::knit_child("man/rmd/by.Rmd"), +#' dplyr.summarise.inform = TRUE +#' ) +#' cat(result, sep = "\n") +#' ``` +#' +#' @name dplyr_by +NULL + +compute_by <- function(by, + data, + ..., + by_arg = "by", + data_arg = "data", + error_call = caller_env()) { + check_dots_empty0(...) + + error_call <- dplyr_error_call(error_call) + + by <- enquo(by) + check_by(by, data, by_arg = by_arg, data_arg = data_arg, error_call = error_call) + + if (is_grouped_df(data)) { + type <- "grouped" + names <- group_vars(data) + data <- group_data(data) + } else if (is_rowwise_df(data)) { + type <- "rowwise" + names <- group_vars(data) + data <- group_data(data) + } else { + by <- eval_select_by(by, data, error_call = error_call) + + if (length(by) == 0L) { + # `by = NULL` or empty selection + type <- "ungrouped" + names <- by + data <- group_data(data) + data <- as_tibble(data) + } else { + type <- "grouped" + names <- by + data <- compute_by_groups(data, by, error_call = error_call) + } + } + + new_by(type = type, names = names, data = data) +} + +compute_by_groups <- function(data, names, error_call = caller_env()) { + data <- dplyr_col_select(data, names, error_call = error_call) + info <- vec_group_loc(data) + + size <- vec_size(info) + + out <- dplyr_new_list(info$key) + out[[".rows"]] <- new_list_of(info$loc, ptype = integer()) + out <- new_tibble(out, nrow = size) + + out +} + +check_by <- function(by, + data, + ..., + by_arg = "by", + data_arg = "data", + error_call = caller_env()) { + check_dots_empty0(...) + + if (quo_is_null(by)) { + return(invisible(NULL)) + } + + if (is_grouped_df(data)) { + message <- paste0( + "Can't supply {.arg {by_arg}} when ", + "{.arg {data_arg}} is a grouped data frame." + ) + cli::cli_abort(message, call = error_call) + } + + if (is_rowwise_df(data)) { + message <- paste0( + "Can't supply {.arg {by_arg}} when ", + "{.arg {data_arg}} is a rowwise data frame." + ) + cli::cli_abort(message, call = error_call) + } + + invisible(NULL) +} + +eval_select_by <- function(by, + data, + error_call = caller_env()) { + out <- tidyselect::eval_select( + expr = by, + data = data, + allow_rename = FALSE, + error_call = error_call + ) + names(out) +} + +new_by <- function(type, names, data) { + structure(list(type = type, names = names, data = data), class = "dplyr_by") +} diff --git a/R/conditions.R b/R/conditions.R index 341e944084..d892362c7f 100644 --- a/R/conditions.R +++ b/R/conditions.R @@ -47,7 +47,7 @@ cnd_bullet_cur_group_label <- function(what = "error") { } cnd_bullet_rowwise_unlist <- function() { - if (peek_mask()$is_rowwise_df()) { + if (peek_mask()$is_rowwise()) { glue_data(peek_error_context(), "Did you mean: `{error_name} = list({error_expression})` ?") } } @@ -131,9 +131,9 @@ dot_as_label <- function(expr) { mask_type <- function(mask = peek_mask()) { if (mask$get_size() > 0) { - if (mask$is_grouped_df()) { + if (mask$is_grouped()) { return("grouped") - } else if (mask$is_rowwise_df()) { + } else if (mask$is_rowwise()) { return("rowwise") } } diff --git a/R/data-mask.R b/R/data-mask.R index 95f6d7bbed..2d5f63a446 100644 --- a/R/data-mask.R +++ b/R/data-mask.R @@ -1,7 +1,7 @@ DataMask <- R6Class("DataMask", public = list( - initialize = function(data, verb, error_call) { - rows <- group_rows(data) + initialize = function(data, by, verb, error_call) { + rows <- by$data$.rows if (length(rows) == 0) { # Specially handle case of zero groups rows <- new_list_of(list(integer()), ptype = integer()) @@ -16,22 +16,23 @@ DataMask <- R6Class("DataMask", abort("Can't transform a data frame with duplicate names.", call = error_call) } names(data) <- names_bindings + private$size <- nrow(data) private$current_data <- dplyr_new_list(data) - private$chops <- .Call(dplyr_lazy_vec_chop_impl, data, rows) - private$mask <- .Call(dplyr_data_masks_setup, private$chops, data, rows) + private$grouped <- by$type == "grouped" + private$rowwise <- by$type == "rowwise" - private$grouped_df <- is_grouped_df(data) - private$rowwise_df <- is_rowwise_df(data) + private$chops <- .Call(dplyr_lazy_vec_chop_impl, data, rows, private$grouped, private$rowwise) + private$mask <- .Call(dplyr_data_masks_setup, private$chops, data, rows) - private$keys <- group_keys(data) - private$group_vars <- group_vars(data) + private$keys <- group_keys0(by$data) + private$by_names <- by$names private$verb <- verb }, add_one = function(name, chunks, result) { - if (self$is_rowwise_df()){ + if (self$is_rowwise()){ is_scalar_list <- function(.x) { vec_is_list(.x) && length(.x) == 1L } @@ -78,7 +79,7 @@ DataMask <- R6Class("DataMask", # `across(.fns = NULL)`. We should remove this when we defunct those. cols <- self$current_cols(vars) - if (self$is_rowwise_df()) { + if (self$is_rowwise()) { cols <- map2(cols, names(cols), function(col, name) { if (vec_is_list(private$current_data[[name]])) { col <- list(col) @@ -117,7 +118,7 @@ DataMask <- R6Class("DataMask", }, current_non_group_vars = function() { - setdiff(self$current_vars(), private$group_vars) + setdiff(self$current_vars(), private$by_names) }, get_current_group = function() { @@ -172,12 +173,12 @@ DataMask <- R6Class("DataMask", }) }, - is_grouped_df = function() { - private$grouped_df + is_grouped = function() { + private$grouped }, - is_rowwise_df = function() { - private$rowwise_df + is_rowwise = function() { + private$rowwise }, get_keys = function() { @@ -215,8 +216,8 @@ DataMask <- R6Class("DataMask", # ptypes of all the variables current_data = list(), - # names of the grouping variables - group_vars = character(), + # names of the `by` variables + by_names = character(), # list of indices, one integer vector per group rows = NULL, @@ -228,8 +229,8 @@ DataMask <- R6Class("DataMask", size = NULL, # Type of data frame - grouped_df = NULL, - rowwise_df = NULL, + grouped = NULL, + rowwise = NULL, verb = character() ) diff --git a/R/filter.R b/R/filter.R index 92350e1f29..f7d3518141 100644 --- a/R/filter.R +++ b/R/filter.R @@ -48,6 +48,7 @@ #' #' @family single table verbs #' @inheritParams arrange +#' @inheritParams args_by #' @param ... <[`data-masking`][dplyr_data_masking]> Expressions that return a #' logical value, and are defined in terms of the variables in `.data`. #' If multiple expressions are included, they are combined with the `&` operator. @@ -105,23 +106,37 @@ #' .data[[vars[[2]]]] > cond[[2]] #' ) #' # Learn more in ?dplyr_data_masking -filter <- function(.data, ..., .preserve = FALSE) { +filter <- function(.data, ..., .by = NULL, .preserve = FALSE) { + by <- enquo(.by) + + if (!quo_is_null(by) && !is_false(.preserve)) { + abort("Can't supply both `.by` and `.preserve`.") + } + UseMethod("filter") } #' @export -filter.data.frame <- function(.data, ..., .preserve = FALSE) { - loc <- filter_rows(.data, ...) +filter.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) { + loc <- filter_rows(.data, ..., .by = {{ .by }}) dplyr_row_slice(.data, loc, preserve = .preserve) } -filter_rows <- function(.data, ..., error_call = caller_env()) { +filter_rows <- function(.data, ..., .by = NULL, error_call = caller_env()) { error_call <- dplyr_error_call(error_call) dots <- dplyr_quosures(...) check_filter(dots, error_call = error_call) - mask <- DataMask$new(.data, "filter", error_call = error_call) + by <- compute_by( + by = {{ .by }}, + data = .data, + by_arg = ".by", + data_arg = ".data", + error_call = error_call + ) + + mask <- DataMask$new(.data, by, "filter", error_call = error_call) on.exit(mask$forget(), add = TRUE) dots <- filter_expand(dots, mask = mask, error_call = error_call) diff --git a/R/group-by.R b/R/group-by.R index e10c4258d9..8e501cbc72 100644 --- a/R/group-by.R +++ b/R/group-by.R @@ -244,9 +244,13 @@ add_computed_columns <- function(.data, if (any(needs_mutate)) { # TODO: use less of a hack if (inherits(.data, "data.frame")) { + bare_data <- ungroup(.data) + by <- compute_by(by = NULL, data = bare_data) + cols <- mutate_cols( - ungroup(.data), + bare_data, dplyr_quosures(!!!vars), + by = by, error_call = error_call ) diff --git a/R/group-data.R b/R/group-data.R index 5b64e4bc8b..1184bea642 100644 --- a/R/group-data.R +++ b/R/group-data.R @@ -97,8 +97,13 @@ group_keys.data.frame <- function(.tbl, ...) { .tbl <- group_by(.tbl, ...) } out <- group_data(.tbl) - .Call(`dplyr_group_keys`, out) + group_keys0(out) } +group_keys0 <- function(x) { + # Compute keys directly from `group_data()` results + .Call(`dplyr_group_keys`, x) +} + #' @rdname group_data #' @export group_rows <- function(.data) { diff --git a/R/mutate.R b/R/mutate.R index 50c11a4a3d..a3d2c24a93 100644 --- a/R/mutate.R +++ b/R/mutate.R @@ -147,6 +147,9 @@ mutate <- function(.data, ...) { } #' @rdname mutate +#' +#' @inheritParams args_by +#' #' @param .keep #' Control which columns from `.data` are retained in the output. Grouping #' columns and columns created by `...` are always kept. @@ -167,12 +170,15 @@ mutate <- function(.data, ...) { #' @export mutate.data.frame <- function(.data, ..., + .by = NULL, .keep = c("all", "used", "unused", "none"), .before = NULL, .after = NULL) { keep <- arg_match(.keep) - cols <- mutate_cols(.data, dplyr_quosures(...)) + by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") + + cols <- mutate_cols(.data, dplyr_quosures(...), by) used <- attr(cols, "used") out <- dplyr_col_modify(.data, cols) @@ -184,7 +190,7 @@ mutate.data.frame <- function(.data, cols <- compact_null(cols) cols_data <- names(.data) - cols_group <- group_vars(.data) + cols_group <- by$names cols_expr <- names(cols) cols_expr_modified <- intersect(cols_expr, cols_data) @@ -218,13 +224,13 @@ mutate.data.frame <- function(.data, # Helpers ----------------------------------------------------------------- -mutate_cols <- function(.data, dots, error_call = caller_env()) { +mutate_cols <- function(data, dots, by, error_call = caller_env()) { # Collect dots before setting up error handlers (#6178) force(dots) error_call <- dplyr_error_call(error_call) - mask <- DataMask$new(.data, "mutate", error_call = error_call) + mask <- DataMask$new(data, by, "mutate", error_call = error_call) old_current_column <- context_peek_bare("column") on.exit(context_poke("column", old_current_column), add = TRUE) @@ -240,7 +246,7 @@ mutate_cols <- function(.data, dots, error_call = caller_env()) { poke_error_context(dots, i, mask = mask) context_poke("column", old_current_column) - new_columns <- mutate_col(dots[[i]], .data, mask, new_columns) + new_columns <- mutate_col(dots[[i]], data, mask, new_columns) }, error = dplyr_error_handler( dots = dots, @@ -307,7 +313,7 @@ mutate_col <- function(dot, data, mask, new_columns) { chunks <- mask$resolve(name) } - if (inherits(data, "rowwise_df") && vec_is_list(result)) { + if (mask$is_rowwise() && vec_is_list(result)) { sizes <- list_sizes(result) wrong <- which(sizes != 1) if (length(wrong)) { diff --git a/R/slice.R b/R/slice.R index d7b023352b..de6a271a83 100644 --- a/R/slice.R +++ b/R/slice.R @@ -20,6 +20,7 @@ #' operation, use [filter()] and [row_number()]. #' #' @family single table verbs +#' @inheritParams args_by #' @inheritParams arrange #' @inheritParams filter #' @param ... For `slice()`: <[`data-masking`][dplyr_data_masking]> Integer row @@ -118,52 +119,62 @@ #' filter(mtcars, row_number() == 1L) #' filter(mtcars, row_number() == n()) #' filter(mtcars, between(row_number(), 5, n())) -slice <- function(.data, ..., .preserve = FALSE) { +slice <- function(.data, ..., .by = NULL, .preserve = FALSE) { + by <- enquo(.by) + + if (!quo_is_null(by) && !is_false(.preserve)) { + abort("Can't supply both `.by` and `.preserve`.") + } + UseMethod("slice") } #' @export -slice.data.frame <- function(.data, ..., .preserve = FALSE) { - loc <- slice_rows(.data, ..., error_call = current_env()) +slice.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) { + loc <- slice_rows(.data, ..., .by = {{ .by }}) dplyr_row_slice(.data, loc, preserve = .preserve) } #' @export #' @rdname slice -slice_head <- function(.data, ..., n, prop) { +slice_head <- function(.data, ..., n, prop, by = NULL) { check_slice_dots(..., n = n, prop = prop) UseMethod("slice_head") } #' @export -slice_head.data.frame <- function(.data, ..., n, prop) { +slice_head.data.frame <- function(.data, ..., n, prop, by = NULL) { size <- get_slice_size(n = n, prop = prop) idx <- function(n) { seq2(1, size(n)) } dplyr_local_error_call() - slice(.data, idx(dplyr::n())) + dplyr_local_slice_by_arg("by") + + slice(.data, idx(dplyr::n()), .by = {{ by }}) } #' @export #' @rdname slice -slice_tail <- function(.data, ..., n, prop) { +slice_tail <- function(.data, ..., n, prop, by = NULL) { check_slice_dots(..., n = n, prop = prop) UseMethod("slice_tail") } #' @export -slice_tail.data.frame <- function(.data, ..., n, prop) { +slice_tail.data.frame <- function(.data, ..., n, prop, by = NULL) { size <- get_slice_size(n = n, prop = prop) idx <- function(n) { seq2(n - size(n) + 1, n) } dplyr_local_error_call() - slice(.data, idx(dplyr::n())) + dplyr_local_slice_by_arg("by") + + slice(.data, idx(dplyr::n()), .by = {{ by }}) } #' @export @@ -178,7 +189,7 @@ slice_tail.data.frame <- function(.data, ..., n, prop) { #' If `FALSE`, `NA` values are sorted to the end (like in [arrange()]), so #' they will only be included if there are insufficient non-missing values to #' reach `n`/`prop`. -slice_min <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) { +slice_min <- function(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE) { check_required(order_by) check_slice_dots(..., n = n, prop = prop) check_bool(with_ties) @@ -188,28 +199,34 @@ slice_min <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = F } #' @export -slice_min.data.frame <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) { +slice_min.data.frame <- function(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE) { size <- get_slice_size(n = n, prop = prop) - dplyr_local_error_call() - slice(.data, local({ - n <- dplyr::n() - order_by <- {{ order_by }} - vec_assert(order_by, size = n) - - slice_rank_idx( - order_by, - size(n), - direction = "asc", - with_ties = with_ties, - na_rm = na_rm - ) - })) + dplyr_local_error_call() + dplyr_local_slice_by_arg("by") + + slice( + .data, + .by = {{ by }}, + local({ + n <- dplyr::n() + order_by <- {{ order_by }} + vec_assert(order_by, size = n) + + slice_rank_idx( + order_by, + size(n), + direction = "asc", + with_ties = with_ties, + na_rm = na_rm + ) + }) + ) } #' @export #' @rdname slice -slice_max <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) { +slice_max <- function(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE) { check_required(order_by) check_slice_dots(..., n = n, prop = prop) check_bool(with_ties) @@ -219,24 +236,30 @@ slice_max <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = F } #' @export -slice_max.data.frame <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) { +slice_max.data.frame <- function(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE) { size <- get_slice_size(n = n, prop = prop) dplyr_local_error_call() - slice(.data, local({ - n <- dplyr::n() - order_by <- {{ order_by }} - - vec_assert(order_by, size = n) - - slice_rank_idx( - order_by, - size(n), - direction = "desc", - with_ties = with_ties, - na_rm = na_rm - ) - })) + dplyr_local_slice_by_arg("by") + + slice( + .data, + .by = {{ by }}, + local({ + n <- dplyr::n() + order_by <- {{ order_by }} + + vec_assert(order_by, size = n) + + slice_rank_idx( + order_by, + size(n), + direction = "desc", + with_ties = with_ties, + na_rm = na_rm + ) + }) + ) } #' @export @@ -246,7 +269,7 @@ slice_max.data.frame <- function(.data, order_by, ..., n, prop, with_ties = TRUE #' @param weight_by <[`data-masking`][dplyr_data_masking]> Sampling weights. #' This must evaluate to a vector of non-negative numbers the same length as #' the input. Weights are automatically standardised to sum to 1. -slice_sample <- function(.data, ..., n, prop, weight_by = NULL, replace = FALSE) { +slice_sample <- function(.data, ..., n, prop, by = NULL, weight_by = NULL, replace = FALSE) { check_slice_dots(..., n = n, prop = prop) check_bool(replace) @@ -254,31 +277,46 @@ slice_sample <- function(.data, ..., n, prop, weight_by = NULL, replace = FALSE) } #' @export -slice_sample.data.frame <- function(.data, ..., n, prop, weight_by = NULL, replace = FALSE) { +slice_sample.data.frame <- function(.data, ..., n, prop, by = NULL, weight_by = NULL, replace = FALSE) { size <- get_slice_size(n = n, prop = prop, allow_outsize = replace) dplyr_local_error_call() - slice(.data, local({ - weight_by <- {{ weight_by }} + dplyr_local_slice_by_arg("by") - n <- dplyr::n() - if (!is.null(weight_by)) { - weight_by <- vec_assert(weight_by, size = n, arg = "weight_by") - } - sample_int(n, size(n), replace = replace, wt = weight_by) - })) + slice( + .data, + .by = {{ by }}, + local({ + weight_by <- {{ weight_by }} + + n <- dplyr::n() + if (!is.null(weight_by)) { + weight_by <- vec_assert(weight_by, size = n, arg = "weight_by") + } + sample_int(n, size(n), replace = replace, wt = weight_by) + }) + ) } # helpers ----------------------------------------------------------------- -slice_rows <- function(.data, ..., error_call = caller_env()) { +slice_rows <- function(.data, ..., .by = NULL, error_call = caller_env()) { error_call <- dplyr_error_call(error_call) + by <- compute_by( + by = {{ .by }}, + data = .data, + by_arg = the$slice_by_arg, + data_arg = ".data", + error_call = error_call + ) + dots <- enquos(...) if (is_empty(dots)) { return(TRUE) } - mask <- DataMask$new(.data, "slice", error_call = error_call) + + mask <- DataMask$new(.data, by, "slice", error_call = error_call) on.exit(mask$forget(), add = TRUE) chunks <- slice_eval(mask, dots, error_call = error_call) @@ -500,6 +538,14 @@ slice_rank_idx <- function( which[order(ranks[which])] } +on_load({ + # Default used by `slice()` + the$slice_by_arg <- ".by" +}) +dplyr_local_slice_by_arg <- function(by_arg, frame = caller_env()) { + local_bindings(slice_by_arg = by_arg, .env = the, .frame = frame) +} + # Backports for R 3.5.0 utils ...length2 <- function(frame = caller_env()) { length(env_get(frame, "...")) diff --git a/R/summarise.R b/R/summarise.R index 5aa2f090a3..bad974ac9a 100644 --- a/R/summarise.R +++ b/R/summarise.R @@ -33,6 +33,7 @@ #' #' @export #' @inheritParams arrange +#' @inheritParams args_by #' @param ... <[`data-masking`][dplyr_data_masking]> Name-value pairs of summary #' functions. The name will be the name of the variable in the result. #' @@ -116,7 +117,13 @@ #' var <- "mass" #' summarise(starwars, avg = mean(.data[[var]], na.rm = TRUE)) #' # Learn more in ?dplyr_data_masking -summarise <- function(.data, ..., .groups = NULL) { +summarise <- function(.data, ..., .by = NULL, .groups = NULL) { + by <- enquo(.by) + + if (!quo_is_null(by) && !is.null(.groups)) { + abort("Can't supply both `.by` and `.groups`.") + } + UseMethod("summarise") } #' @rdname summarise @@ -124,19 +131,32 @@ summarise <- function(.data, ..., .groups = NULL) { summarize <- summarise #' @export -summarise.data.frame <- function(.data, ..., .groups = NULL) { - cols <- summarise_cols(.data, dplyr_quosures(...)) - out <- summarise_build(.data, cols) +summarise.data.frame <- function(.data, ..., .by = NULL, .groups = NULL) { + by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") + + cols <- summarise_cols(.data, dplyr_quosures(...), by) + out <- summarise_build(by, cols) + + if (!is_tibble(.data)) { + # The `by` group data we build from is always a tibble, + # so we have to manually downcast as needed + out <- as.data.frame(out) + } + if (identical(.groups, "rowwise")) { out <- rowwise_df(out, character()) } + out } #' @export -summarise.grouped_df <- function(.data, ..., .groups = NULL) { - cols <- summarise_cols(.data, dplyr_quosures(...)) - out <- summarise_build(.data, cols) +summarise.grouped_df <- function(.data, ..., .by = NULL, .groups = NULL) { + # Will always error if `.by != NULL` b/c you can't use it with grouped/rowwise dfs. + by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") + + cols <- summarise_cols(.data, dplyr_quosures(...), by) + out <- summarise_build(by, cols) verbose <- summarise_verbose(.groups, caller_env()) if (is.null(.groups)) { @@ -147,7 +167,7 @@ summarise.grouped_df <- function(.data, ..., .groups = NULL) { } } - group_vars <- group_vars(.data) + group_vars <- by$names if (identical(.groups, "drop_last")) { n <- length(group_vars) if (n > 1) { @@ -177,12 +197,15 @@ summarise.grouped_df <- function(.data, ..., .groups = NULL) { } #' @export -summarise.rowwise_df <- function(.data, ..., .groups = NULL) { - cols <- summarise_cols(.data, dplyr_quosures(...)) - out <- summarise_build(.data, cols) +summarise.rowwise_df <- function(.data, ..., .by = NULL, .groups = NULL) { + # Will always error if `.by != NULL` b/c you can't use it with grouped/rowwise dfs. + by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") + + cols <- summarise_cols(.data, dplyr_quosures(...), by) + out <- summarise_build(by, cols) verbose <- summarise_verbose(.groups, caller_env()) - group_vars <- group_vars(.data) + group_vars <- by$names if (is.null(.groups) || identical(.groups, "keep")) { if (verbose && length(group_vars)) { new_groups <- glue_collapse(paste0("'", group_vars, "'"), sep = ", ") @@ -202,10 +225,10 @@ summarise.rowwise_df <- function(.data, ..., .groups = NULL) { out } -summarise_cols <- function(.data, dots, error_call = caller_env()) { +summarise_cols <- function(data, dots, by, error_call = caller_env()) { error_call <- dplyr_error_call(error_call) - mask <- DataMask$new(.data, "summarise", error_call = error_call) + mask <- DataMask$new(data, by, "summarise", error_call = error_call) old_current_column <- context_peek_bare("column") warnings_state <- env(warnings = list()) @@ -343,8 +366,8 @@ summarise_eval_one <- function(quo, mask) { list(chunks = chunks_k, types = types_k, results = result_k) } -summarise_build <- function(.data, cols) { - out <- group_keys(.data) +summarise_build <- function(by, cols) { + out <- group_keys0(by$data) if (!cols$all_one) { out <- vec_slice(out, rep(seq_len(nrow(out)), cols$sizes)) } @@ -396,9 +419,19 @@ summarise_bullets <- function(cnd, ...) { # messaging --------------------------------------------------------------- summarise_verbose <- function(.groups, .env) { - is.null(.groups) && - is_reference(topenv(.env), global_env()) && - !identical(getOption("dplyr.summarise.inform"), FALSE) + if (!is.null(.groups)) { + # User supplied `.groups` + return(FALSE) + } + + inform <- getOption("dplyr.summarise.inform") + + if (is_true(inform) || is_false(inform)) { + # User supplied global option + return(inform) + } + + is_reference(topenv(.env), global_env()) } summarise_inform <- function(..., .env = parent.frame()) { diff --git a/R/transmute.R b/R/transmute.R index d6024b841c..106cde9860 100644 --- a/R/transmute.R +++ b/R/transmute.R @@ -39,7 +39,10 @@ transmute.data.frame <- function(.data, ...) { dots <- check_transmute_args(...) dots <- dplyr_quosures(!!!dots) - cols <- mutate_cols(.data, dots) + # We don't expose `.by` because `transmute()` is superseded + by <- compute_by(by = NULL, data = .data) + + cols <- mutate_cols(.data, dots, by) out <- dplyr_col_modify(.data, cols) @@ -53,7 +56,7 @@ transmute.data.frame <- function(.data, ...) { cols_expr <- names(cols) # Retain untouched group variables up front - cols_group <- group_vars(.data) + cols_group <- by$names cols_group <- setdiff(cols_group, cols_expr) cols_retain <- c(cols_group, cols_expr) diff --git a/_pkgdown.yml b/_pkgdown.yml index 69b824e1bc..ce0f44a6c0 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -41,6 +41,7 @@ reference: contents: - count - group_by + - dplyr_by - rowwise - summarise - 'n' diff --git a/man/args_by.Rd b/man/args_by.Rd new file mode 100644 index 0000000000..cdb9aa9e9d --- /dev/null +++ b/man/args_by.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/by.R +\name{args_by} +\alias{args_by} +\title{Helper for consistent documentation of \code{.by}} +\arguments{ +\item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to +temporarily group by using an inline alternative to \code{\link[=group_by]{group_by()}}. For +details and examples, see \link[=dplyr_by]{?dplyr_by}.} +} +\description{ +Use \verb{@inheritParams args_by} to consistently document \code{.by}. +} +\keyword{internal} diff --git a/man/dplyr_by.Rd b/man/dplyr_by.Rd new file mode 100644 index 0000000000..9c9a85de1a --- /dev/null +++ b/man/dplyr_by.Rd @@ -0,0 +1,212 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/by.R +\name{dplyr_by} +\alias{dplyr_by} +\title{Temporary grouping with \code{.by}} +\description{ +There are two ways to group in dplyr: +\itemize{ +\item Persistent grouping with \code{\link[=group_by]{group_by()}} +\item Temporary grouping with \code{.by} +} + +This help page is dedicated to explaining where and why you might want to use the latter. +Grouping radically affects the computation of the dplyr verb you use it with, and one of the goals of \code{.by} is to allow you to place that grouping specification alongside the code that actually uses it. +As an added benefit, with \code{.by} you no longer need to remember to \code{\link[=ungroup]{ungroup()}} after \code{\link[=summarise]{summarise()}}, and \code{summarise()} won't ever message you about how it's handling the groups! + +This great idea comes from \href{https://CRAN.R-project.org/package=data.table}{data.table}, which allows you to specify \code{by} alongside modifications in \code{j}, like: \code{dt[, .(x = mean(x)), by = g]}. +\subsection{Supported verbs}{ +\itemize{ +\item \code{\link[=mutate]{mutate()}} +\item \code{\link[=summarise]{summarise()}} +\item \code{\link[=filter]{filter()}} +\item \code{\link[=slice]{slice()}} and its variants, such as \code{\link[=slice_head]{slice_head()}} +} +} + +\subsection{Differences between \code{.by} and \code{group_by()}}{\tabular{ll}{ + \code{.by} \tab \code{group_by()} \cr + Grouping only affects a single verb \tab Grouping is persistent across multiple verbs \cr + Selects variables with \link[=dplyr_tidy_select]{tidy-select} \tab Computes expressions with \link[=dplyr_data_masking]{data-masking} \cr + Summaries use existing order of group keys \tab Summaries sort group keys in ascending order \cr +} + +} + +\subsection{Using \code{.by}}{ + +Let's take a look at the two grouping approaches using this \code{expenses} data set, which tracks costs accumulated across various \code{id}s and \code{region}s: + +\if{html}{\out{
}}\preformatted{expenses <- tibble( + id = c(1, 2, 1, 3, 1, 2, 3), + region = c("A", "A", "A", "B", "B", "A", "A"), + cost = c(25, 20, 19, 12, 9, 6, 6) +) +expenses +#> # A tibble: 7 x 3 +#> id region cost +#> +#> 1 1 A 25 +#> 2 2 A 20 +#> 3 1 A 19 +#> 4 3 B 12 +#> 5 1 B 9 +#> 6 2 A 6 +#> 7 3 A 6 +}\if{html}{\out{
}} + +Imagine that you wanted to compute the average cost per region. +You'd probably write something like this: + +\if{html}{\out{
}}\preformatted{expenses \%>\% + group_by(region) \%>\% + summarise(cost = mean(cost)) +#> # A tibble: 2 x 2 +#> region cost +#> +#> 1 A 15.2 +#> 2 B 10.5 +}\if{html}{\out{
}} + +Instead, you can now specify the grouping \emph{inline} within the verb: + +\if{html}{\out{
}}\preformatted{expenses \%>\% + summarise(cost = mean(cost), .by = region) +#> # A tibble: 2 x 2 +#> region cost +#> +#> 1 A 15.2 +#> 2 B 10.5 +}\if{html}{\out{
}} + +Grouping with \code{.by} is temporary, meaning that since \code{expenses} was an ungrouped data frame, the result after applying \code{.by} will also always be an ungrouped data frame, regardless of the number of grouping columns. + +\if{html}{\out{
}}\preformatted{expenses \%>\% + summarise(cost = mean(cost), .by = c(id, region)) +#> # A tibble: 5 x 3 +#> id region cost +#> +#> 1 1 A 22 +#> 2 2 A 13 +#> 3 3 B 12 +#> 4 1 B 9 +#> 5 3 A 6 +}\if{html}{\out{
}} + +Compare that with \code{group_by() \%>\% summarise()}, where \code{summarise()} generally peels off 1 layer of grouping by default, typically with a message that it is doing so: + +\if{html}{\out{
}}\preformatted{expenses \%>\% + group_by(id, region) \%>\% + summarise(cost = mean(cost)) +#> `summarise()` has grouped output by 'id'. You can override using the `.groups` +#> argument. +#> # A tibble: 5 x 3 +#> # Groups: id [3] +#> id region cost +#> +#> 1 1 A 22 +#> 2 1 B 9 +#> 3 2 A 13 +#> 4 3 A 6 +#> 5 3 B 12 +}\if{html}{\out{
}} + +Because \code{.by} grouping is temporary, you don't need to worry about ungrouping, and it never needs to emit a message to remind you what it is doing with the groups. + +Note that with \code{.by} we specified multiple columns to group by using the \link[=dplyr_tidy_select]{tidy-select} syntax \code{c(id, region)}. +If you have a character vector of column names you'd like to group by, you can do so with \code{.by = all_of(my_cols)}. +It will group by the columns in the order they were provided. + +To prevent surprising results, you can't use \code{.by} on an existing grouped data frame: + +\if{html}{\out{
}}\preformatted{expenses \%>\% + group_by(id) \%>\% + summarise(cost = mean(cost), .by = c(id, region)) +#> Error in `summarise()`: +#> ! Can't supply `.by` when `.data` is a grouped data frame. +}\if{html}{\out{
}} + +So far we've focused on the usage of \code{.by} with \code{summarise()}, but \code{.by} works with a number of other dplyr verbs. +For example, you could append the mean cost per region onto the original data frame as a new column rather than computing a summary: + +\if{html}{\out{
}}\preformatted{expenses \%>\% + mutate(cost_by_region = mean(cost), .by = region) +#> # A tibble: 7 x 4 +#> id region cost cost_by_region +#> +#> 1 1 A 25 15.2 +#> 2 2 A 20 15.2 +#> 3 1 A 19 15.2 +#> 4 3 B 12 10.5 +#> 5 1 B 9 10.5 +#> 6 2 A 6 15.2 +#> 7 3 A 6 15.2 +}\if{html}{\out{
}} + +Or you could slice out the maximum cost per combination of id and region: + +\if{html}{\out{
}}\preformatted{expenses \%>\% + slice_max(cost, n = 1, by = c(id, region)) +#> # A tibble: 5 x 3 +#> id region cost +#> +#> 1 1 A 25 +#> 2 2 A 20 +#> 3 3 B 12 +#> 4 1 B 9 +#> 5 3 A 6 +}\if{html}{\out{
}} +} + +\subsection{Result ordering}{ + +When used with \code{.by}, \code{summarise()} and \code{slice()} both maintain the ordering of the existing data. +This is different from \code{group_by()}, which has always sorted the group keys in ascending order. + +\if{html}{\out{
}}\preformatted{df <- tibble( + month = c("jan", "jan", "feb", "feb", "mar"), + temp = c(20, 25, 18, 20, 40) +) + +# Uses ordering by "first appearance" in the original data +df \%>\% + summarise(average_temp = mean(temp), .by = month) +#> # A tibble: 3 x 2 +#> month average_temp +#> +#> 1 jan 22.5 +#> 2 feb 19 +#> 3 mar 40 + +# Sorts in ascending order +df \%>\% + group_by(month) \%>\% + summarise(average_temp = mean(temp)) +#> # A tibble: 3 x 2 +#> month average_temp +#> +#> 1 feb 19 +#> 2 jan 22.5 +#> 3 mar 40 +}\if{html}{\out{
}} + +If you need sorted group keys, we recommend that you explicitly use \code{\link[=arrange]{arrange()}} either before or after the call to \code{summarise()} or \code{slice()}. +This also gives you full access to all of \code{arrange()}'s features, such as \code{desc()} and the \code{.locale} argument. +} + +\subsection{Verbs without \code{.by} support}{ + +If a dplyr verb doesn't support \code{.by}, then that typically means that the verb isn't inherently affected by grouping. +For example, \code{\link[=pull]{pull()}} and \code{\link[=rename]{rename()}} don't support \code{.by}, because specifying columns to group by would not affect their implementations. + +That said, there are a few exceptions to this where sometimes a dplyr verb doesn't support \code{.by}, but \emph{does} have special support for grouped data frames created by \code{\link[=group_by]{group_by()}}. +This is typically because the verbs are required to retain the grouping columns, for example: +\itemize{ +\item \code{\link[=select]{select()}} always retains grouping columns, with a message if any aren't specified in the \code{select()} call. +\item \code{\link[=distinct]{distinct()}} and \code{\link[=count]{count()}} place unspecified grouping columns at the front of the data frame before computing their results. +\item \code{\link[=arrange]{arrange()}} has a \code{.by_group} argument to optionally order by grouping columns first. +} + +If \code{group_by()} didn't exist, then these verbs would not have special support for grouped data frames. +} +} diff --git a/man/filter.Rd b/man/filter.Rd index 431013b4e9..6bd7bf7469 100644 --- a/man/filter.Rd +++ b/man/filter.Rd @@ -4,7 +4,7 @@ \alias{filter} \title{Keep rows that match a condition} \usage{ -filter(.data, ..., .preserve = FALSE) +filter(.data, ..., .by = NULL, .preserve = FALSE) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a @@ -16,6 +16,12 @@ logical value, and are defined in terms of the variables in \code{.data}. If multiple expressions are included, they are combined with the \code{&} operator. Only rows for which all conditions evaluate to \code{TRUE} are kept.} +\item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to +temporarily group by using an inline alternative to \code{\link[=group_by]{group_by()}}. For +details and examples, see \link[=dplyr_by]{?dplyr_by}.} + \item{.preserve}{Relevant when the \code{.data} input is grouped. If \code{.preserve = FALSE} (the default), the grouping structure is recalculated based on the resulting data, otherwise the grouping is kept as is.} diff --git a/man/mutate.Rd b/man/mutate.Rd index 018a302be1..666446c3a8 100644 --- a/man/mutate.Rd +++ b/man/mutate.Rd @@ -10,6 +10,7 @@ mutate(.data, ...) \method{mutate}{data.frame}( .data, ..., + .by = NULL, .keep = c("all", "used", "unused", "none"), .before = NULL, .after = NULL @@ -32,6 +33,12 @@ if ungrouped). \item A data frame or tibble, to create multiple columns in the output. }} +\item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to +temporarily group by using an inline alternative to \code{\link[=group_by]{group_by()}}. For +details and examples, see \link[=dplyr_by]{?dplyr_by}.} + \item{.keep}{Control which columns from \code{.data} are retained in the output. Grouping columns and columns created by \code{...} are always kept. \itemize{ diff --git a/man/rmd/by.Rmd b/man/rmd/by.Rmd new file mode 100644 index 0000000000..65756e8685 --- /dev/null +++ b/man/rmd/by.Rmd @@ -0,0 +1,153 @@ +--- +output: html_document +editor_options: + chunk_output_type: console +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +There are two ways to group in dplyr: + +- Persistent grouping with [group_by()] + +- Temporary grouping with `.by` + +This help page is dedicated to explaining where and why you might want to use the latter. +Grouping radically affects the computation of the dplyr verb you use it with, and one of the goals of `.by` is to allow you to place that grouping specification alongside the code that actually uses it. +As an added benefit, with `.by` you no longer need to remember to [ungroup()] after [summarise()], and `summarise()` won't ever message you about how it's handling the groups! + +This great idea comes from [data.table](https://CRAN.R-project.org/package=data.table), which allows you to specify `by` alongside modifications in `j`, like: `dt[, .(x = mean(x)), by = g]`. + +### Supported verbs + +- [mutate()] + +- [summarise()] + +- [filter()] + +- [slice()] and its variants, such as [slice_head()] + +### Differences between `.by` and `group_by()` + +| `.by` | `group_by()` | +|----------------------------------|-------------------------------------| +| Grouping only affects a single verb | Grouping is persistent across multiple verbs | +| Selects variables with [tidy-select][dplyr_tidy_select] | Computes expressions with [data-masking][dplyr_data_masking] | +| Summaries use existing order of group keys | Summaries sort group keys in ascending order | + +### Using `.by` + +Let's take a look at the two grouping approaches using this `expenses` data set, which tracks costs accumulated across various `id`s and `region`s: + +```{r} +expenses <- tibble( + id = c(1, 2, 1, 3, 1, 2, 3), + region = c("A", "A", "A", "B", "B", "A", "A"), + cost = c(25, 20, 19, 12, 9, 6, 6) +) +expenses +``` + +Imagine that you wanted to compute the average cost per region. +You'd probably write something like this: + +```{r} +expenses %>% + group_by(region) %>% + summarise(cost = mean(cost)) +``` + +Instead, you can now specify the grouping *inline* within the verb: + +```{r} +expenses %>% + summarise(cost = mean(cost), .by = region) +``` + +Grouping with `.by` is temporary, meaning that since `expenses` was an ungrouped data frame, the result after applying `.by` will also always be an ungrouped data frame, regardless of the number of grouping columns. + +```{r} +expenses %>% + summarise(cost = mean(cost), .by = c(id, region)) +``` + +Compare that with `group_by() %>% summarise()`, where `summarise()` generally peels off 1 layer of grouping by default, typically with a message that it is doing so: + +```{r} +expenses %>% + group_by(id, region) %>% + summarise(cost = mean(cost)) +``` + +Because `.by` grouping is temporary, you don't need to worry about ungrouping, and it never needs to emit a message to remind you what it is doing with the groups. + +Note that with `.by` we specified multiple columns to group by using the [tidy-select][dplyr_tidy_select] syntax `c(id, region)`. +If you have a character vector of column names you'd like to group by, you can do so with `.by = all_of(my_cols)`. +It will group by the columns in the order they were provided. + +To prevent surprising results, you can't use `.by` on an existing grouped data frame: + +```{r, error=TRUE} +expenses %>% + group_by(id) %>% + summarise(cost = mean(cost), .by = c(id, region)) +``` + +So far we've focused on the usage of `.by` with `summarise()`, but `.by` works with a number of other dplyr verbs. +For example, you could append the mean cost per region onto the original data frame as a new column rather than computing a summary: + +```{r} +expenses %>% + mutate(cost_by_region = mean(cost), .by = region) +``` + +Or you could slice out the maximum cost per combination of id and region: + +```{r} +expenses %>% + slice_max(cost, n = 1, by = c(id, region)) +``` + +### Result ordering + +When used with `.by`, `summarise()` and `slice()` both maintain the ordering of the existing data. +This is different from `group_by()`, which has always sorted the group keys in ascending order. + +```{r} +df <- tibble( + month = c("jan", "jan", "feb", "feb", "mar"), + temp = c(20, 25, 18, 20, 40) +) + +# Uses ordering by "first appearance" in the original data +df %>% + summarise(average_temp = mean(temp), .by = month) + +# Sorts in ascending order +df %>% + group_by(month) %>% + summarise(average_temp = mean(temp)) +``` + +If you need sorted group keys, we recommend that you explicitly use [arrange()] either before or after the call to `summarise()` or `slice()`. +This also gives you full access to all of `arrange()`'s features, such as `desc()` and the `.locale` argument. + +### Verbs without `.by` support + +If a dplyr verb doesn't support `.by`, then that typically means that the verb isn't inherently affected by grouping. +For example, [pull()] and [rename()] don't support `.by`, because specifying columns to group by would not affect their implementations. + +That said, there are a few exceptions to this where sometimes a dplyr verb doesn't support `.by`, but *does* have special support for grouped data frames created by [group_by()]. +This is typically because the verbs are required to retain the grouping columns, for example: + +- [select()] always retains grouping columns, with a message if any aren't specified in the `select()` call. + +- [distinct()] and [count()] place unspecified grouping columns at the front of the data frame before computing their results. + +- [arrange()] has a `.by_group` argument to optionally order by grouping columns first. + +If `group_by()` didn't exist, then these verbs would not have special support for grouped data frames. + diff --git a/man/slice.Rd b/man/slice.Rd index c75a4fd59b..cfb89c59de 100644 --- a/man/slice.Rd +++ b/man/slice.Rd @@ -9,17 +9,35 @@ \alias{slice_sample} \title{Subset rows using their positions} \usage{ -slice(.data, ..., .preserve = FALSE) +slice(.data, ..., .by = NULL, .preserve = FALSE) -slice_head(.data, ..., n, prop) +slice_head(.data, ..., n, prop, by = NULL) -slice_tail(.data, ..., n, prop) +slice_tail(.data, ..., n, prop, by = NULL) -slice_min(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) +slice_min( + .data, + order_by, + ..., + n, + prop, + by = NULL, + with_ties = TRUE, + na_rm = FALSE +) -slice_max(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) +slice_max( + .data, + order_by, + ..., + n, + prop, + by = NULL, + with_ties = TRUE, + na_rm = FALSE +) -slice_sample(.data, ..., n, prop, weight_by = NULL, replace = FALSE) +slice_sample(.data, ..., n, prop, by = NULL, weight_by = NULL, replace = FALSE) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a @@ -35,6 +53,12 @@ Indices beyond the number of rows in the input are silently ignored. For \verb{slice_*()}, these arguments are passed on to methods.} +\item{.by, by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to +temporarily group by using an inline alternative to \code{\link[=group_by]{group_by()}}. For +details and examples, see \link[=dplyr_by]{?dplyr_by}.} + \item{.preserve}{Relevant when the \code{.data} input is grouped. If \code{.preserve = FALSE} (the default), the grouping structure is recalculated based on the resulting data, otherwise the grouping is kept as is.} diff --git a/man/summarise.Rd b/man/summarise.Rd index 99f09dc56d..7c2055fb20 100644 --- a/man/summarise.Rd +++ b/man/summarise.Rd @@ -5,9 +5,9 @@ \alias{summarize} \title{Summarise each group to fewer rows} \usage{ -summarise(.data, ..., .groups = NULL) +summarise(.data, ..., .by = NULL, .groups = NULL) -summarize(.data, ..., .groups = NULL) +summarize(.data, ..., .by = NULL, .groups = NULL) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a @@ -24,6 +24,12 @@ The value can be: \item A data frame, to add multiple columns from a single expression. }} +\item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to +temporarily group by using an inline alternative to \code{\link[=group_by]{group_by()}}. For +details and examples, see \link[=dplyr_by]{?dplyr_by}.} + \item{.groups}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Grouping structure of the result. \itemize{ \item "drop_last": dropping the last level of grouping. This was the diff --git a/src/chop.cpp b/src/chop.cpp index 44a89645d5..06c73a9825 100644 --- a/src/chop.cpp +++ b/src/chop.cpp @@ -66,7 +66,10 @@ void dplyr_lazy_vec_chop_ungrouped(SEXP chops_env, SEXP data) { UNPROTECT(1); } -SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows) { +SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows, SEXP ffi_grouped, SEXP ffi_rowwise) { + bool grouped = static_cast(LOGICAL_ELT(ffi_grouped, 0)); + bool rowwise = static_cast(LOGICAL_ELT(ffi_rowwise, 0)); + // a first environment to hide `.indices` and `.current_group` // this is for example used by funs:: SEXP indices_env = PROTECT(new_environment(2, R_EmptyEnv)); @@ -75,9 +78,9 @@ SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows) { // then an environment to hold the chops of the columns SEXP chops_env = PROTECT(new_environment(XLENGTH(data), indices_env)); - if (Rf_inherits(data, "grouped_df")) { + if (grouped) { dplyr_lazy_vec_chop_grouped(chops_env, rows, data, false); - } else if (Rf_inherits(data, "rowwise_df")) { + } else if (rowwise) { dplyr_lazy_vec_chop_grouped(chops_env, rows, data, true); } else { dplyr_lazy_vec_chop_ungrouped(chops_env, data); diff --git a/src/dplyr.h b/src/dplyr.h index 2517a539ed..d00e601f12 100644 --- a/src/dplyr.h +++ b/src/dplyr.h @@ -112,7 +112,7 @@ SEXP dplyr_group_keys(SEXP group_data); SEXP dplyr_mask_remove(SEXP env_private, SEXP s_name); SEXP dplyr_mask_add(SEXP env_private, SEXP s_name, SEXP ptype, SEXP chunks); -SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows); +SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows, SEXP ffi_grouped, SEXP ffi_rowwise); SEXP dplyr_data_masks_setup(SEXP chops, SEXP data, SEXP rows); SEXP env_resolved(SEXP env, SEXP names); void add_mask_binding(SEXP name, SEXP env_bindings, SEXP env_chops); diff --git a/src/init.cpp b/src/init.cpp index ba354ad591..7bf0b26f92 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -114,7 +114,7 @@ static const R_CallMethodDef CallEntries[] = { {"dplyr_mask_remove", (DL_FUNC)& dplyr_mask_remove, 2}, {"dplyr_mask_add", (DL_FUNC)& dplyr_mask_add, 4}, - {"dplyr_lazy_vec_chop_impl", (DL_FUNC)& dplyr_lazy_vec_chop, 2}, + {"dplyr_lazy_vec_chop_impl", (DL_FUNC)& dplyr_lazy_vec_chop, 4}, {"dplyr_data_masks_setup", (DL_FUNC)& dplyr_data_masks_setup, 3}, {"env_resolved", (DL_FUNC)& env_resolved, 2}, diff --git a/tests/testthat/_snaps/by.md b/tests/testthat/_snaps/by.md new file mode 100644 index 0000000000..343d99012e --- /dev/null +++ b/tests/testthat/_snaps/by.md @@ -0,0 +1,33 @@ +# throws tidyselect errors + + Code + compute_by(by = y, data = df) + Condition + Error: + ! Can't subset columns that don't exist. + x Column `y` doesn't exist. + +# can't set `.by` with a grouped-df + + Code + compute_by(x, gdf) + Condition + Error: + ! Can't supply `by` when `data` is a grouped data frame. + +# can't set `.by` with a rowwise-df + + Code + compute_by(x, rdf) + Condition + Error: + ! Can't supply `by` when `data` is a rowwise data frame. + +# can tweak the error args + + Code + compute_by(x, gdf, by_arg = "x", data_arg = "dat") + Condition + Error: + ! Can't supply `x` when `dat` is a grouped data frame. + diff --git a/tests/testthat/_snaps/filter.md b/tests/testthat/_snaps/filter.md index 128e727f1e..4c4f7fd619 100644 --- a/tests/testthat/_snaps/filter.md +++ b/tests/testthat/_snaps/filter.md @@ -229,3 +229,27 @@ x y 1 1 1 +# can't use `.by` with `.preserve` + + Code + filter(df, .by = x, .preserve = TRUE) + Condition + Error in `filter()`: + ! Can't supply both `.by` and `.preserve`. + +# catches `.by` with grouped-df + + Code + filter(gdf, .by = x) + Condition + Error in `filter()`: + ! Can't supply `.by` when `.data` is a grouped data frame. + +# catches `.by` with rowwise-df + + Code + filter(rdf, .by = x) + Condition + Error in `filter()`: + ! Can't supply `.by` when `.data` is a rowwise data frame. + diff --git a/tests/testthat/_snaps/mutate.md b/tests/testthat/_snaps/mutate.md index 1e3edd3e00..2211198702 100644 --- a/tests/testthat/_snaps/mutate.md +++ b/tests/testthat/_snaps/mutate.md @@ -59,6 +59,22 @@ ! `y` must be size 1, not 2. i Did you mean: `y = list(x)` ? +# catches `.by` with grouped-df + + Code + mutate(gdf, .by = x) + Condition + Error in `mutate()`: + ! Can't supply `.by` when `.data` is a grouped data frame. + +# catches `.by` with rowwise-df + + Code + mutate(rdf, .by = x) + Condition + Error in `mutate()`: + ! Can't supply `.by` when `.data` is a rowwise data frame. + # mutate() deals with 0 groups (#5534) Code diff --git a/tests/testthat/_snaps/slice.md b/tests/testthat/_snaps/slice.md index d7d90c88ee..750561e94f 100644 --- a/tests/testthat/_snaps/slice.md +++ b/tests/testthat/_snaps/slice.md @@ -47,6 +47,30 @@ Caused by error in `1 + ""`: ! non-numeric argument to binary operator +# can't use `.by` with `.preserve` + + Code + slice(df, .by = x, .preserve = TRUE) + Condition + Error in `slice()`: + ! Can't supply both `.by` and `.preserve`. + +# catches `.by` with grouped-df + + Code + slice(gdf, .by = x) + Condition + Error in `slice()`: + ! Can't supply `.by` when `.data` is a grouped data frame. + +# catches `.by` with rowwise-df + + Code + slice(rdf, .by = x) + Condition + Error in `slice()`: + ! Can't supply `.by` when `.data` is a rowwise data frame. + # slice_helpers() call get_slice_size() Code @@ -196,6 +220,34 @@ * ..2 = 2 i Did you forget to name an argument? +# slice_helper `by` errors use correct error context and correct `by_arg` + + Code + slice_head(gdf, n = 1, by = x) + Condition + Error in `slice_head()`: + ! Can't supply `by` when `.data` is a grouped data frame. + Code + slice_tail(gdf, n = 1, by = x) + Condition + Error in `slice_tail()`: + ! Can't supply `by` when `.data` is a grouped data frame. + Code + slice_min(gdf, order_by = x, by = x) + Condition + Error in `slice_min()`: + ! Can't supply `by` when `.data` is a grouped data frame. + Code + slice_max(gdf, order_by = x, by = x) + Condition + Error in `slice_max()`: + ! Can't supply `by` when `.data` is a grouped data frame. + Code + slice_sample(gdf, n = 1, by = x) + Condition + Error in `slice_sample()`: + ! Can't supply `by` when `.data` is a grouped data frame. + # slice_min/max() check size of `order_by=` (#5922) Code diff --git a/tests/testthat/_snaps/summarise.md b/tests/testthat/_snaps/summarise.md index c9fd3c2a86..21f084e581 100644 --- a/tests/testthat/_snaps/summarise.md +++ b/tests/testthat/_snaps/summarise.md @@ -1,3 +1,27 @@ +# can't use `.by` with `.groups` + + Code + summarise(df, .by = x, .groups = "drop") + Condition + Error in `summarise()`: + ! Can't supply both `.by` and `.groups`. + +# catches `.by` with grouped-df + + Code + summarise(gdf, .by = x) + Condition + Error in `summarise()`: + ! Can't supply `.by` when `.data` is a grouped data frame. + +# catches `.by` with rowwise-df + + Code + summarise(rdf, .by = x) + Condition + Error in `summarise()`: + ! Can't supply `.by` when `.data` is a rowwise data frame. + # summarise() gives meaningful errors Code diff --git a/tests/testthat/test-across.R b/tests/testthat/test-across.R index b387ea284f..b662b7b5fa 100644 --- a/tests/testthat/test-across.R +++ b/tests/testthat/test-across.R @@ -913,7 +913,8 @@ test_that("expand_across() expands lambdas", { index = 1 ) - DataMask$new(mtcars, "mutate", call("caller")) + by <- compute_by(by = NULL, data = mtcars, error_call = call("caller")) + DataMask$new(mtcars, by, "mutate", call("caller")) expect_equal( map(expand_across(quo), quo_get_expr), @@ -934,7 +935,8 @@ test_that("expand_if_across() expands lambdas", { index = 1 ) - DataMask$new(mtcars, "mutate", call("caller")) + by <- compute_by(by = NULL, data = mtcars, error_call = call("caller")) + DataMask$new(mtcars, by, "mutate", call("caller")) expect_equal( map(expand_if_across(quo), quo_squash), diff --git a/tests/testthat/test-by.R b/tests/testthat/test-by.R new file mode 100644 index 0000000000..de4dc1a41d --- /dev/null +++ b/tests/testthat/test-by.R @@ -0,0 +1,100 @@ +test_that("computes group data when `by` is set", { + df <- tibble(x = c(1, 1, 2, 2, 1)) + + out <- compute_by(by = x, data = df) + expect_identical(out$type, "grouped") + expect_identical(out$names, "x") + + expect_identical( + out$data, + tibble(x = c(1, 2), ".rows" := list_of(c(1L, 2L, 5L), c(3L, 4L))) + ) +}) + +test_that("computes `by` group data in order of appearance", { + df <- tibble( + x = c(5, 4, 5, 5), + y = c(2, 3, 1, 2) + ) + + out <- compute_by(by = c(x, y), data = df) + + expect <- tibble( + x = c(5, 4, 5), + y = c(2, 3, 1), + ".rows" := list_of(c(1L, 4L), 2L, 3L) + ) + + expect_identical(out$data, expect) +}) + +test_that("extracts existing data when `by = NULL`", { + df <- data.frame(x = c(1, 1, 2, 2, 1)) + out <- compute_by(by = NULL, data = df) + expect_identical(out$type, "ungrouped") + expect_identical(out$names, character()) + # `compute_by()` is always type stable on `$data` and returns a bare tibble + expect_identical(out$data, as_tibble(group_data(df))) + + df <- tibble(x = c(1, 1, 2, 2, 1)) + out <- compute_by(by = NULL, data = df) + expect_identical(out$type, "ungrouped") + expect_identical(out$names, character()) + expect_identical(out$data, group_data(df)) + + gdf <- group_by(df, x) + out <- compute_by(by = NULL, data = gdf) + expect_identical(out$type, "grouped") + expect_identical(out$names, "x") + expect_identical(out$data, group_data(gdf)) + + rdf <- rowwise(df) + out <- compute_by(by = NULL, data = rdf) + expect_identical(out$type, "rowwise") + expect_identical(out$names, character()) + expect_identical(out$data, group_data(rdf)) +}) + +test_that("empty selection results in ungrouped group data", { + df <- tibble(x = 1) + + out <- compute_by(by = c(), data = df) + expect_identical(out$type, "ungrouped") + expect_identical(out$names, character()) + expect_identical(out$data, group_data(df)) +}) + +test_that("throws tidyselect errors", { + df <- tibble(x = 1) + + expect_snapshot(error = TRUE, { + compute_by(by = y, data = df) + }) +}) + +test_that("can't set `.by` with a grouped-df", { + df <- tibble(x = 1:5) + gdf <- group_by(df, x) + + expect_snapshot(error = TRUE, { + compute_by(x, gdf) + }) +}) + +test_that("can't set `.by` with a rowwise-df", { + df <- tibble(x = 1:5) + rdf <- rowwise(df) + + expect_snapshot(error = TRUE, { + compute_by(x, rdf) + }) +}) + +test_that("can tweak the error args", { + df <- tibble(x = 1:5) + gdf <- group_by(df, x) + + expect_snapshot(error = TRUE, { + compute_by(x, gdf, by_arg = "x", data_arg = "dat") + }) +}) diff --git a/tests/testthat/test-filter.R b/tests/testthat/test-filter.R index f7c33d3cc7..ac78adc544 100644 --- a/tests/testthat/test-filter.R +++ b/tests/testthat/test-filter.R @@ -587,3 +587,62 @@ test_that("if_any() and if_all() work", { filter(df, x1 > 6 | x2 > 6) ) }) + +# .by ------------------------------------------------------------------------- + +test_that("can group transiently using `.by`", { + df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 10, 1, 2, 3)) + + out <- filter(df, x > mean(x), .by = g) + + expect_identical(out$g, c(1, 2)) + expect_identical(out$x, c(10, 3)) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("transient grouping retains bare data.frame class", { + df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 10, 1, 2, 3)) + out <- filter(df, x > mean(x), .by = g) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("transient grouping retains data frame attributes", { + # With data.frames or tibbles + df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 1)) + tbl <- as_tibble(df) + + attr(df, "foo") <- "bar" + attr(tbl, "foo") <- "bar" + + out <- filter(df, x > mean(x), .by = g) + expect_identical(attr(out, "foo"), "bar") + + out <- filter(tbl, x > mean(x), .by = g) + expect_identical(attr(out, "foo"), "bar") +}) + +test_that("can't use `.by` with `.preserve`", { + df <- tibble(x = 1) + + expect_snapshot(error = TRUE, { + filter(df, .by = x, .preserve = TRUE) + }) +}) + +test_that("catches `.by` with grouped-df", { + df <- tibble(x = 1) + gdf <- group_by(df, x) + + expect_snapshot(error = TRUE, { + filter(gdf, .by = x) + }) +}) + +test_that("catches `.by` with rowwise-df", { + df <- tibble(x = 1) + rdf <- rowwise(df) + + expect_snapshot(error = TRUE, { + filter(rdf, .by = x) + }) +}) diff --git a/tests/testthat/test-mutate.R b/tests/testthat/test-mutate.R index b0631a1557..baed61ef78 100644 --- a/tests/testthat/test-mutate.R +++ b/tests/testthat/test-mutate.R @@ -3,16 +3,22 @@ test_that("empty mutate returns input", { gf <- group_by(df, x) expect_equal(mutate(df), df) + expect_equal(mutate(df, .by = x), df) expect_equal(mutate(gf), gf) expect_equal(mutate(df, !!!list()), df) + expect_equal(mutate(df, !!!list(), .by = x), df) expect_equal(mutate(gf, !!!list()), gf) }) test_that("rownames preserved", { df <- data.frame(x = c(1, 2), row.names = c("a", "b")) + df <- mutate(df, y = 2) expect_equal(row.names(df), c("a", "b")) + + df <- mutate(df, y = 2, .by = x) + expect_equal(row.names(df), c("a", "b")) }) test_that("mutations applied progressively", { @@ -74,9 +80,8 @@ test_that("assignments don't overwrite variables (#315)", { expect_equal(out, tibble(x = 1, y = 2, z = 10)) }) -test_that("can mutate a data frame with zero columns and `NULL` column names", { +test_that("can mutate a data frame with zero columns", { df <- new_data_frame(n = 2L) - colnames(df) <- NULL expect_equal(mutate(df, x = 1), data.frame(x = c(1, 1))) }) @@ -382,6 +387,65 @@ test_that("DataMask$add() forces chunks (#4677)", { expect_equal(df$log_e_bf01, log(1 / 0.244)) }) +# .by ------------------------------------------------------------------------- + +test_that("can group transiently using `.by`", { + df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3)) + + out <- mutate(df, x = mean(x), .by = g) + + expect_identical(out$g, df$g) + expect_identical(out$x, c(3, 3, 2, 3, 2)) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("transient grouping retains bare data.frame class", { + df <- data.frame(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3)) + out <- mutate(df, x = mean(x), .by = g) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("transient grouping retains data frame attributes (#6100)", { + # With data.frames or tibbles + df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 1)) + tbl <- as_tibble(df) + + attr(df, "foo") <- "bar" + attr(tbl, "foo") <- "bar" + + out <- mutate(df, x = mean(x), .by = g) + expect_identical(attr(out, "foo"), "bar") + + out <- mutate(tbl, x = mean(x), .by = g) + expect_identical(attr(out, "foo"), "bar") +}) + +test_that("can `NULL` out the `.by` column", { + df <- tibble(x = 1:3) + + expect_identical( + mutate(df, x = NULL, .by = x), + new_tibble(list(), nrow = 3) + ) +}) + +test_that("catches `.by` with grouped-df", { + df <- tibble(x = 1) + gdf <- group_by(df, x) + + expect_snapshot(error = TRUE, { + mutate(gdf, .by = x) + }) +}) + +test_that("catches `.by` with rowwise-df", { + df <- tibble(x = 1) + rdf <- rowwise(df) + + expect_snapshot(error = TRUE, { + mutate(rdf, .by = x) + }) +}) # .before, .after, .keep ------------------------------------------------------ diff --git a/tests/testthat/test-pick.R b/tests/testthat/test-pick.R index 8441cb82cf..48a439042a 100644 --- a/tests/testthat/test-pick.R +++ b/tests/testthat/test-pick.R @@ -503,7 +503,8 @@ test_that("`pick()` can be used inside `group_by()` wrappers", { test_that("`pick()` doesn't expand across anonymous function boundaries", { df <- tibble(x = 1, y = 2) - mask <- DataMask$new(df, verb = "mutate", error_call = current_env()) + by <- compute_by(by = NULL, data = df, error_call = current_env()) + mask <- DataMask$new(df, by, verb = "mutate", error_call = current_env()) # With inline `function() { }` calls (this also handles native R anonymous functions) quo <- dplyr_quosures(z = function() pick(y, x))$z @@ -516,7 +517,8 @@ test_that("`pick()` doesn't expand across anonymous function boundaries", { test_that("`pick()` expands embedded quosures", { df <- tibble(x = 1, y = 2) - mask <- DataMask$new(df, verb = "mutate", error_call = current_env()) + by <- compute_by(by = NULL, data = df, error_call = current_env()) + mask <- DataMask$new(df, by, verb = "mutate", error_call = current_env()) wrapper <- function(x) { dplyr_quosures(z = dense_rank({{x}})) diff --git a/tests/testthat/test-slice.R b/tests/testthat/test-slice.R index 8e78cde13d..95ba3350d7 100644 --- a/tests/testthat/test-slice.R +++ b/tests/testthat/test-slice.R @@ -91,6 +91,72 @@ test_that("user errors are correctly labelled", { }) }) +test_that("can group transiently using `.by`", { + df <- tibble(g = c(1, 1, 2), x = c(1, 2, 3)) + + out <- slice(df, n(), .by = g) + + expect_identical(out$g, c(1, 2)) + expect_identical(out$x, c(2, 3)) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("transient grouping retains bare data.frame class", { + df <- tibble(g = c(1, 1, 2), x = c(1, 2, 3)) + out <- slice(df, n(), .by = g) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("transient grouping retains data frame attributes", { + # With data.frames or tibbles + df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 3)) + tbl <- as_tibble(df) + + attr(df, "foo") <- "bar" + attr(tbl, "foo") <- "bar" + + out <- slice(df, n(), .by = g) + expect_identical(attr(out, "foo"), "bar") + + out <- slice(tbl, n(), .by = g) + expect_identical(attr(out, "foo"), "bar") +}) + +test_that("transient grouping orders by first appearance", { + df <- tibble(g = c(2, 1, 2, 0), x = c(4, 2, 8, 5)) + + out <- slice(df, which(x == max(x)), .by = g) + + expect_identical(out$g, c(2, 1, 0)) + expect_identical(out$x, c(8, 2, 5)) +}) + +test_that("can't use `.by` with `.preserve`", { + df <- tibble(x = 1) + + expect_snapshot(error = TRUE, { + slice(df, .by = x, .preserve = TRUE) + }) +}) + +test_that("catches `.by` with grouped-df", { + df <- tibble(x = 1) + gdf <- group_by(df, x) + + expect_snapshot(error = TRUE, { + slice(gdf, .by = x) + }) +}) + +test_that("catches `.by` with rowwise-df", { + df <- tibble(x = 1) + rdf <- rowwise(df) + + expect_snapshot(error = TRUE, { + slice(rdf, .by = x) + }) +}) + # Slice variants ---------------------------------------------------------- test_that("slice_helpers() call get_slice_size()", { @@ -221,6 +287,19 @@ test_that("slice_helpers do call slice() and benefit from dispatch (#6084)", { expect_warning(sample_frac(nf, .5), "noisy") }) +test_that("slice_helper `by` errors use correct error context and correct `by_arg`", { + df <- tibble(x = 1) + gdf <- group_by(df, x) + + expect_snapshot(error = TRUE, { + slice_head(gdf, n = 1, by = x) + slice_tail(gdf, n = 1, by = x) + slice_min(gdf, order_by = x, by = x) + slice_max(gdf, order_by = x, by = x) + slice_sample(gdf, n = 1, by = x) + }) +}) + # slice_min/slice_max ----------------------------------------------------- test_that("min and max return ties by default", { @@ -287,6 +366,13 @@ test_that("slice_min/max() can order by multiple variables (#6176)", { expect_equal(slice_max(df, tibble(x, y), n = 1)$id, 2) }) +test_that("slice_min/max() work with `by`", { + df <- tibble(g = c(2, 2, 1, 1), x = c(1, 2, 3, 1)) + + expect_identical(slice_min(df, x, by = g), df[c(1, 4),]) + expect_identical(slice_max(df, x, by = g), df[c(2, 3),]) +}) + test_that("slice_min/max() check size of `order_by=` (#5922)", { expect_snapshot(error = TRUE, { slice_min(data.frame(x = 1:10), 1:6) @@ -360,6 +446,11 @@ test_that("slice_sample() handles negative n= and prop= (#6402)", { expect_equal(nrow(slice_sample(df, prop = -2)), 0) }) +test_that("slice_sample() works with `by`", { + df <- tibble(g = c(2, 2, 2, 1), x = c(1, 2, 3, 1)) + expect_identical(slice_sample(df, n = 2, by = g)$g, c(2, 2, 1)) +}) + # slice_head/slice_tail --------------------------------------------------- test_that("slice_head/slice_tail keep positive values", { @@ -404,3 +495,9 @@ test_that("slice_head/slice_tail handle infinite n/prop", { expect_identical(slice_head(df, prop = -Inf), df[0, ]) expect_identical(slice_tail(df, prop = -Inf), df[0, ]) }) + +test_that("slice_head/slice_tail work with `by`", { + df <- tibble(g = c(2, 2, 2, 1), x = c(1, 2, 3, 1)) + expect_identical(slice_head(df, n = 2, by = g), df[c(1, 2, 4),]) + expect_identical(slice_tail(df, n = 2, by = g), df[c(2, 3, 4),]) +}) diff --git a/tests/testthat/test-summarise.R b/tests/testthat/test-summarise.R index c42b39629c..31f85f3f47 100644 --- a/tests/testthat/test-summarise.R +++ b/tests/testthat/test-summarise.R @@ -257,6 +257,76 @@ test_that("summarise() silently skips when all results are NULL (#5708)", { expect_error(summarise(df, x = if(g == 1) 42)) }) +# .by ---------------------------------------------------------------------- + +test_that("can group transiently using `.by`", { + df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3)) + + out <- summarise(df, x = mean(x), .by = g) + + expect_identical(out$g, c(1, 2)) + expect_identical(out$x, c(3, 2)) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("transient grouping retains bare data.frame class", { + df <- data.frame(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3)) + out <- summarise(df, x = mean(x), .by = g) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("transient grouping drops data frame attributes", { + # Because `summarise()` theoretically creates a "new" data frame + + # With data.frames or tibbles + df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 1)) + tbl <- as_tibble(df) + + attr(df, "foo") <- "bar" + attr(tbl, "foo") <- "bar" + + out <- summarise(df, x = mean(x), .by = g) + expect_null(attr(out, "foo")) + + out <- summarise(tbl, x = mean(x), .by = g) + expect_null(attr(out, "foo")) +}) + +test_that("transient grouping orders by first appearance", { + df <- tibble(g = c(2, 1, 2, 0), x = c(4, 2, 8, 5)) + + out <- summarise(df, x = mean(x), .by = g) + + expect_identical(out$g, c(2, 1, 0)) + expect_identical(out$x, c(6, 2, 5)) +}) + +test_that("can't use `.by` with `.groups`", { + df <- tibble(x = 1) + + expect_snapshot(error = TRUE, { + summarise(df, .by = x, .groups = "drop") + }) +}) + +test_that("catches `.by` with grouped-df", { + df <- tibble(x = 1) + gdf <- group_by(df, x) + + expect_snapshot(error = TRUE, { + summarise(gdf, .by = x) + }) +}) + +test_that("catches `.by` with rowwise-df", { + df <- tibble(x = 1) + rdf <- rowwise(df) + + expect_snapshot(error = TRUE, { + summarise(rdf, .by = x) + }) +}) + # errors ------------------------------------------------------------------- test_that("summarise() preserves the call stack on error (#5308)", {