diff --git a/NEWS.md b/NEWS.md
index 001ec4e5e1..b07057f90c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,47 @@
# dplyr (development version)
+* `.by` is a new experimental inline alternative to `group_by()` that supports
+ _temporary_ grouping in the following key dplyr verbs: `mutate()`,
+ `summarise()`, `filter()`, and the `slice()` family (#6528).
+
+ Rather than:
+
+ ```
+ starwars %>%
+ group_by(species, homeworld) %>%
+ summarise(mean_height = mean(height))
+ ```
+
+ You can now write:
+
+ ```
+ starwars %>%
+ summarise(
+ mean_height = mean(height),
+ .by = c(species, homeworld)
+ )
+ ```
+
+ The most useful reason to do this is because grouping with `.by` is
+ _temporary_ and only affects the verb it is being applied to. An ungrouped
+ data frame went into the `summarise()` call, so an ungrouped data frame will
+ come out; with `.by`, you never need to remember to `ungroup()` afterwards.
+
+ Additionally, using `summarise()` or `slice()` with `.by` will never sort the
+ results by the group key, unlike with `group_by()`. Instead, the results are
+ returned using the existing ordering of the groups from the original data. We
+ feel this is more predictable, better maintains any ordering you might have
+ already applied with a previous call to `arrange()`, and provides a way to
+ maintain the current ordering without having to resort to factors.
+
+ This exciting feature was inspired by
+ [data.table](https://CRAN.R-project.org/package=data.table), where the
+ equivalent syntax looks like:
+
+ ```
+ starwars[, .(mean_height = mean(height)), by = .(species, homeworld)]
+ ```
+
* `summarise()` now correctly recycles named 0-column data frames (#6509).
* `.cols` and `.fns` are now required arguments in `across()`, `c_across()`,
diff --git a/R/by.R b/R/by.R
new file mode 100644
index 0000000000..7d291e07e2
--- /dev/null
+++ b/R/by.R
@@ -0,0 +1,126 @@
+#' Helper for consistent documentation of `.by`
+#'
+#' Use `@inheritParams args_by` to consistently document `.by`.
+#'
+#' @param .by `r lifecycle::badge("experimental")`
+#'
+#' <[`tidy-select`][dplyr_tidy_select]> Optionally, a selection of columns to
+#' temporarily group by using an inline alternative to [group_by()]. For
+#' details and examples, see [?dplyr_by][dplyr_by].
+#'
+#' @name args_by
+#' @keywords internal
+NULL
+
+#' Temporary grouping with `.by`
+#'
+#' ```{r, echo = FALSE, results = "asis"}
+#' result <- rlang::with_options(
+#' knitr::knit_child("man/rmd/by.Rmd"),
+#' dplyr.summarise.inform = TRUE
+#' )
+#' cat(result, sep = "\n")
+#' ```
+#'
+#' @name dplyr_by
+NULL
+
+compute_by <- function(by,
+ data,
+ ...,
+ by_arg = "by",
+ data_arg = "data",
+ error_call = caller_env()) {
+ check_dots_empty0(...)
+
+ error_call <- dplyr_error_call(error_call)
+
+ by <- enquo(by)
+ check_by(by, data, by_arg = by_arg, data_arg = data_arg, error_call = error_call)
+
+ if (is_grouped_df(data)) {
+ type <- "grouped"
+ names <- group_vars(data)
+ data <- group_data(data)
+ } else if (is_rowwise_df(data)) {
+ type <- "rowwise"
+ names <- group_vars(data)
+ data <- group_data(data)
+ } else {
+ by <- eval_select_by(by, data, error_call = error_call)
+
+ if (length(by) == 0L) {
+ # `by = NULL` or empty selection
+ type <- "ungrouped"
+ names <- by
+ data <- group_data(data)
+ data <- as_tibble(data)
+ } else {
+ type <- "grouped"
+ names <- by
+ data <- compute_by_groups(data, by, error_call = error_call)
+ }
+ }
+
+ new_by(type = type, names = names, data = data)
+}
+
+compute_by_groups <- function(data, names, error_call = caller_env()) {
+ data <- dplyr_col_select(data, names, error_call = error_call)
+ info <- vec_group_loc(data)
+
+ size <- vec_size(info)
+
+ out <- dplyr_new_list(info$key)
+ out[[".rows"]] <- new_list_of(info$loc, ptype = integer())
+ out <- new_tibble(out, nrow = size)
+
+ out
+}
+
+check_by <- function(by,
+ data,
+ ...,
+ by_arg = "by",
+ data_arg = "data",
+ error_call = caller_env()) {
+ check_dots_empty0(...)
+
+ if (quo_is_null(by)) {
+ return(invisible(NULL))
+ }
+
+ if (is_grouped_df(data)) {
+ message <- paste0(
+ "Can't supply {.arg {by_arg}} when ",
+ "{.arg {data_arg}} is a grouped data frame."
+ )
+ cli::cli_abort(message, call = error_call)
+ }
+
+ if (is_rowwise_df(data)) {
+ message <- paste0(
+ "Can't supply {.arg {by_arg}} when ",
+ "{.arg {data_arg}} is a rowwise data frame."
+ )
+ cli::cli_abort(message, call = error_call)
+ }
+
+ invisible(NULL)
+}
+
+eval_select_by <- function(by,
+ data,
+ error_call = caller_env()) {
+ out <- tidyselect::eval_select(
+ expr = by,
+ data = data,
+ allow_rename = FALSE,
+ error_call = error_call
+ )
+ names(out)
+}
+
+new_by <- function(type, names, data) {
+ structure(list(type = type, names = names, data = data), class = "dplyr_by")
+}
diff --git a/R/conditions.R b/R/conditions.R
index 341e944084..d892362c7f 100644
--- a/R/conditions.R
+++ b/R/conditions.R
@@ -47,7 +47,7 @@ cnd_bullet_cur_group_label <- function(what = "error") {
}
cnd_bullet_rowwise_unlist <- function() {
- if (peek_mask()$is_rowwise_df()) {
+ if (peek_mask()$is_rowwise()) {
glue_data(peek_error_context(), "Did you mean: `{error_name} = list({error_expression})` ?")
}
}
@@ -131,9 +131,9 @@ dot_as_label <- function(expr) {
mask_type <- function(mask = peek_mask()) {
if (mask$get_size() > 0) {
- if (mask$is_grouped_df()) {
+ if (mask$is_grouped()) {
return("grouped")
- } else if (mask$is_rowwise_df()) {
+ } else if (mask$is_rowwise()) {
return("rowwise")
}
}
diff --git a/R/data-mask.R b/R/data-mask.R
index 95f6d7bbed..2d5f63a446 100644
--- a/R/data-mask.R
+++ b/R/data-mask.R
@@ -1,7 +1,7 @@
DataMask <- R6Class("DataMask",
public = list(
- initialize = function(data, verb, error_call) {
- rows <- group_rows(data)
+ initialize = function(data, by, verb, error_call) {
+ rows <- by$data$.rows
if (length(rows) == 0) {
# Specially handle case of zero groups
rows <- new_list_of(list(integer()), ptype = integer())
@@ -16,22 +16,23 @@ DataMask <- R6Class("DataMask",
abort("Can't transform a data frame with duplicate names.", call = error_call)
}
names(data) <- names_bindings
+
private$size <- nrow(data)
private$current_data <- dplyr_new_list(data)
- private$chops <- .Call(dplyr_lazy_vec_chop_impl, data, rows)
- private$mask <- .Call(dplyr_data_masks_setup, private$chops, data, rows)
+ private$grouped <- by$type == "grouped"
+ private$rowwise <- by$type == "rowwise"
- private$grouped_df <- is_grouped_df(data)
- private$rowwise_df <- is_rowwise_df(data)
+ private$chops <- .Call(dplyr_lazy_vec_chop_impl, data, rows, private$grouped, private$rowwise)
+ private$mask <- .Call(dplyr_data_masks_setup, private$chops, data, rows)
- private$keys <- group_keys(data)
- private$group_vars <- group_vars(data)
+ private$keys <- group_keys0(by$data)
+ private$by_names <- by$names
private$verb <- verb
},
add_one = function(name, chunks, result) {
- if (self$is_rowwise_df()){
+ if (self$is_rowwise()){
is_scalar_list <- function(.x) {
vec_is_list(.x) && length(.x) == 1L
}
@@ -78,7 +79,7 @@ DataMask <- R6Class("DataMask",
# `across(.fns = NULL)`. We should remove this when we defunct those.
cols <- self$current_cols(vars)
- if (self$is_rowwise_df()) {
+ if (self$is_rowwise()) {
cols <- map2(cols, names(cols), function(col, name) {
if (vec_is_list(private$current_data[[name]])) {
col <- list(col)
@@ -117,7 +118,7 @@ DataMask <- R6Class("DataMask",
},
current_non_group_vars = function() {
- setdiff(self$current_vars(), private$group_vars)
+ setdiff(self$current_vars(), private$by_names)
},
get_current_group = function() {
@@ -172,12 +173,12 @@ DataMask <- R6Class("DataMask",
})
},
- is_grouped_df = function() {
- private$grouped_df
+ is_grouped = function() {
+ private$grouped
},
- is_rowwise_df = function() {
- private$rowwise_df
+ is_rowwise = function() {
+ private$rowwise
},
get_keys = function() {
@@ -215,8 +216,8 @@ DataMask <- R6Class("DataMask",
# ptypes of all the variables
current_data = list(),
- # names of the grouping variables
- group_vars = character(),
+ # names of the `by` variables
+ by_names = character(),
# list of indices, one integer vector per group
rows = NULL,
@@ -228,8 +229,8 @@ DataMask <- R6Class("DataMask",
size = NULL,
# Type of data frame
- grouped_df = NULL,
- rowwise_df = NULL,
+ grouped = NULL,
+ rowwise = NULL,
verb = character()
)
diff --git a/R/filter.R b/R/filter.R
index 92350e1f29..f7d3518141 100644
--- a/R/filter.R
+++ b/R/filter.R
@@ -48,6 +48,7 @@
#'
#' @family single table verbs
#' @inheritParams arrange
+#' @inheritParams args_by
#' @param ... <[`data-masking`][dplyr_data_masking]> Expressions that return a
#' logical value, and are defined in terms of the variables in `.data`.
#' If multiple expressions are included, they are combined with the `&` operator.
@@ -105,23 +106,37 @@
#' .data[[vars[[2]]]] > cond[[2]]
#' )
#' # Learn more in ?dplyr_data_masking
-filter <- function(.data, ..., .preserve = FALSE) {
+filter <- function(.data, ..., .by = NULL, .preserve = FALSE) {
+ by <- enquo(.by)
+
+ if (!quo_is_null(by) && !is_false(.preserve)) {
+ abort("Can't supply both `.by` and `.preserve`.")
+ }
+
UseMethod("filter")
}
#' @export
-filter.data.frame <- function(.data, ..., .preserve = FALSE) {
- loc <- filter_rows(.data, ...)
+filter.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) {
+ loc <- filter_rows(.data, ..., .by = {{ .by }})
dplyr_row_slice(.data, loc, preserve = .preserve)
}
-filter_rows <- function(.data, ..., error_call = caller_env()) {
+filter_rows <- function(.data, ..., .by = NULL, error_call = caller_env()) {
error_call <- dplyr_error_call(error_call)
dots <- dplyr_quosures(...)
check_filter(dots, error_call = error_call)
- mask <- DataMask$new(.data, "filter", error_call = error_call)
+ by <- compute_by(
+ by = {{ .by }},
+ data = .data,
+ by_arg = ".by",
+ data_arg = ".data",
+ error_call = error_call
+ )
+
+ mask <- DataMask$new(.data, by, "filter", error_call = error_call)
on.exit(mask$forget(), add = TRUE)
dots <- filter_expand(dots, mask = mask, error_call = error_call)
diff --git a/R/group-by.R b/R/group-by.R
index e10c4258d9..8e501cbc72 100644
--- a/R/group-by.R
+++ b/R/group-by.R
@@ -244,9 +244,13 @@ add_computed_columns <- function(.data,
if (any(needs_mutate)) {
# TODO: use less of a hack
if (inherits(.data, "data.frame")) {
+ bare_data <- ungroup(.data)
+ by <- compute_by(by = NULL, data = bare_data)
+
cols <- mutate_cols(
- ungroup(.data),
+ bare_data,
dplyr_quosures(!!!vars),
+ by = by,
error_call = error_call
)
diff --git a/R/group-data.R b/R/group-data.R
index 5b64e4bc8b..1184bea642 100644
--- a/R/group-data.R
+++ b/R/group-data.R
@@ -97,8 +97,13 @@ group_keys.data.frame <- function(.tbl, ...) {
.tbl <- group_by(.tbl, ...)
}
out <- group_data(.tbl)
- .Call(`dplyr_group_keys`, out)
+ group_keys0(out)
}
+group_keys0 <- function(x) {
+ # Compute keys directly from `group_data()` results
+ .Call(`dplyr_group_keys`, x)
+}
+
#' @rdname group_data
#' @export
group_rows <- function(.data) {
diff --git a/R/mutate.R b/R/mutate.R
index 50c11a4a3d..a3d2c24a93 100644
--- a/R/mutate.R
+++ b/R/mutate.R
@@ -147,6 +147,9 @@ mutate <- function(.data, ...) {
}
#' @rdname mutate
+#'
+#' @inheritParams args_by
+#'
#' @param .keep
#' Control which columns from `.data` are retained in the output. Grouping
#' columns and columns created by `...` are always kept.
@@ -167,12 +170,15 @@ mutate <- function(.data, ...) {
#' @export
mutate.data.frame <- function(.data,
...,
+ .by = NULL,
.keep = c("all", "used", "unused", "none"),
.before = NULL,
.after = NULL) {
keep <- arg_match(.keep)
- cols <- mutate_cols(.data, dplyr_quosures(...))
+ by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data")
+
+ cols <- mutate_cols(.data, dplyr_quosures(...), by)
used <- attr(cols, "used")
out <- dplyr_col_modify(.data, cols)
@@ -184,7 +190,7 @@ mutate.data.frame <- function(.data,
cols <- compact_null(cols)
cols_data <- names(.data)
- cols_group <- group_vars(.data)
+ cols_group <- by$names
cols_expr <- names(cols)
cols_expr_modified <- intersect(cols_expr, cols_data)
@@ -218,13 +224,13 @@ mutate.data.frame <- function(.data,
# Helpers -----------------------------------------------------------------
-mutate_cols <- function(.data, dots, error_call = caller_env()) {
+mutate_cols <- function(data, dots, by, error_call = caller_env()) {
# Collect dots before setting up error handlers (#6178)
force(dots)
error_call <- dplyr_error_call(error_call)
- mask <- DataMask$new(.data, "mutate", error_call = error_call)
+ mask <- DataMask$new(data, by, "mutate", error_call = error_call)
old_current_column <- context_peek_bare("column")
on.exit(context_poke("column", old_current_column), add = TRUE)
@@ -240,7 +246,7 @@ mutate_cols <- function(.data, dots, error_call = caller_env()) {
poke_error_context(dots, i, mask = mask)
context_poke("column", old_current_column)
- new_columns <- mutate_col(dots[[i]], .data, mask, new_columns)
+ new_columns <- mutate_col(dots[[i]], data, mask, new_columns)
},
error = dplyr_error_handler(
dots = dots,
@@ -307,7 +313,7 @@ mutate_col <- function(dot, data, mask, new_columns) {
chunks <- mask$resolve(name)
}
- if (inherits(data, "rowwise_df") && vec_is_list(result)) {
+ if (mask$is_rowwise() && vec_is_list(result)) {
sizes <- list_sizes(result)
wrong <- which(sizes != 1)
if (length(wrong)) {
diff --git a/R/slice.R b/R/slice.R
index d7b023352b..de6a271a83 100644
--- a/R/slice.R
+++ b/R/slice.R
@@ -20,6 +20,7 @@
#' operation, use [filter()] and [row_number()].
#'
#' @family single table verbs
+#' @inheritParams args_by
#' @inheritParams arrange
#' @inheritParams filter
#' @param ... For `slice()`: <[`data-masking`][dplyr_data_masking]> Integer row
@@ -118,52 +119,62 @@
#' filter(mtcars, row_number() == 1L)
#' filter(mtcars, row_number() == n())
#' filter(mtcars, between(row_number(), 5, n()))
-slice <- function(.data, ..., .preserve = FALSE) {
+slice <- function(.data, ..., .by = NULL, .preserve = FALSE) {
+ by <- enquo(.by)
+
+ if (!quo_is_null(by) && !is_false(.preserve)) {
+ abort("Can't supply both `.by` and `.preserve`.")
+ }
+
UseMethod("slice")
}
#' @export
-slice.data.frame <- function(.data, ..., .preserve = FALSE) {
- loc <- slice_rows(.data, ..., error_call = current_env())
+slice.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) {
+ loc <- slice_rows(.data, ..., .by = {{ .by }})
dplyr_row_slice(.data, loc, preserve = .preserve)
}
#' @export
#' @rdname slice
-slice_head <- function(.data, ..., n, prop) {
+slice_head <- function(.data, ..., n, prop, by = NULL) {
check_slice_dots(..., n = n, prop = prop)
UseMethod("slice_head")
}
#' @export
-slice_head.data.frame <- function(.data, ..., n, prop) {
+slice_head.data.frame <- function(.data, ..., n, prop, by = NULL) {
size <- get_slice_size(n = n, prop = prop)
idx <- function(n) {
seq2(1, size(n))
}
dplyr_local_error_call()
- slice(.data, idx(dplyr::n()))
+ dplyr_local_slice_by_arg("by")
+
+ slice(.data, idx(dplyr::n()), .by = {{ by }})
}
#' @export
#' @rdname slice
-slice_tail <- function(.data, ..., n, prop) {
+slice_tail <- function(.data, ..., n, prop, by = NULL) {
check_slice_dots(..., n = n, prop = prop)
UseMethod("slice_tail")
}
#' @export
-slice_tail.data.frame <- function(.data, ..., n, prop) {
+slice_tail.data.frame <- function(.data, ..., n, prop, by = NULL) {
size <- get_slice_size(n = n, prop = prop)
idx <- function(n) {
seq2(n - size(n) + 1, n)
}
dplyr_local_error_call()
- slice(.data, idx(dplyr::n()))
+ dplyr_local_slice_by_arg("by")
+
+ slice(.data, idx(dplyr::n()), .by = {{ by }})
}
#' @export
@@ -178,7 +189,7 @@ slice_tail.data.frame <- function(.data, ..., n, prop) {
#' If `FALSE`, `NA` values are sorted to the end (like in [arrange()]), so
#' they will only be included if there are insufficient non-missing values to
#' reach `n`/`prop`.
-slice_min <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) {
+slice_min <- function(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE) {
check_required(order_by)
check_slice_dots(..., n = n, prop = prop)
check_bool(with_ties)
@@ -188,28 +199,34 @@ slice_min <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = F
}
#' @export
-slice_min.data.frame <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) {
+slice_min.data.frame <- function(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE) {
size <- get_slice_size(n = n, prop = prop)
- dplyr_local_error_call()
- slice(.data, local({
- n <- dplyr::n()
- order_by <- {{ order_by }}
- vec_assert(order_by, size = n)
-
- slice_rank_idx(
- order_by,
- size(n),
- direction = "asc",
- with_ties = with_ties,
- na_rm = na_rm
- )
- }))
+ dplyr_local_error_call()
+ dplyr_local_slice_by_arg("by")
+
+ slice(
+ .data,
+ .by = {{ by }},
+ local({
+ n <- dplyr::n()
+ order_by <- {{ order_by }}
+ vec_assert(order_by, size = n)
+
+ slice_rank_idx(
+ order_by,
+ size(n),
+ direction = "asc",
+ with_ties = with_ties,
+ na_rm = na_rm
+ )
+ })
+ )
}
#' @export
#' @rdname slice
-slice_max <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) {
+slice_max <- function(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE) {
check_required(order_by)
check_slice_dots(..., n = n, prop = prop)
check_bool(with_ties)
@@ -219,24 +236,30 @@ slice_max <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = F
}
#' @export
-slice_max.data.frame <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) {
+slice_max.data.frame <- function(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE) {
size <- get_slice_size(n = n, prop = prop)
dplyr_local_error_call()
- slice(.data, local({
- n <- dplyr::n()
- order_by <- {{ order_by }}
-
- vec_assert(order_by, size = n)
-
- slice_rank_idx(
- order_by,
- size(n),
- direction = "desc",
- with_ties = with_ties,
- na_rm = na_rm
- )
- }))
+ dplyr_local_slice_by_arg("by")
+
+ slice(
+ .data,
+ .by = {{ by }},
+ local({
+ n <- dplyr::n()
+ order_by <- {{ order_by }}
+
+ vec_assert(order_by, size = n)
+
+ slice_rank_idx(
+ order_by,
+ size(n),
+ direction = "desc",
+ with_ties = with_ties,
+ na_rm = na_rm
+ )
+ })
+ )
}
#' @export
@@ -246,7 +269,7 @@ slice_max.data.frame <- function(.data, order_by, ..., n, prop, with_ties = TRUE
#' @param weight_by <[`data-masking`][dplyr_data_masking]> Sampling weights.
#' This must evaluate to a vector of non-negative numbers the same length as
#' the input. Weights are automatically standardised to sum to 1.
-slice_sample <- function(.data, ..., n, prop, weight_by = NULL, replace = FALSE) {
+slice_sample <- function(.data, ..., n, prop, by = NULL, weight_by = NULL, replace = FALSE) {
check_slice_dots(..., n = n, prop = prop)
check_bool(replace)
@@ -254,31 +277,46 @@ slice_sample <- function(.data, ..., n, prop, weight_by = NULL, replace = FALSE)
}
#' @export
-slice_sample.data.frame <- function(.data, ..., n, prop, weight_by = NULL, replace = FALSE) {
+slice_sample.data.frame <- function(.data, ..., n, prop, by = NULL, weight_by = NULL, replace = FALSE) {
size <- get_slice_size(n = n, prop = prop, allow_outsize = replace)
dplyr_local_error_call()
- slice(.data, local({
- weight_by <- {{ weight_by }}
+ dplyr_local_slice_by_arg("by")
- n <- dplyr::n()
- if (!is.null(weight_by)) {
- weight_by <- vec_assert(weight_by, size = n, arg = "weight_by")
- }
- sample_int(n, size(n), replace = replace, wt = weight_by)
- }))
+ slice(
+ .data,
+ .by = {{ by }},
+ local({
+ weight_by <- {{ weight_by }}
+
+ n <- dplyr::n()
+ if (!is.null(weight_by)) {
+ weight_by <- vec_assert(weight_by, size = n, arg = "weight_by")
+ }
+ sample_int(n, size(n), replace = replace, wt = weight_by)
+ })
+ )
}
# helpers -----------------------------------------------------------------
-slice_rows <- function(.data, ..., error_call = caller_env()) {
+slice_rows <- function(.data, ..., .by = NULL, error_call = caller_env()) {
error_call <- dplyr_error_call(error_call)
+ by <- compute_by(
+ by = {{ .by }},
+ data = .data,
+ by_arg = the$slice_by_arg,
+ data_arg = ".data",
+ error_call = error_call
+ )
+
dots <- enquos(...)
if (is_empty(dots)) {
return(TRUE)
}
- mask <- DataMask$new(.data, "slice", error_call = error_call)
+
+ mask <- DataMask$new(.data, by, "slice", error_call = error_call)
on.exit(mask$forget(), add = TRUE)
chunks <- slice_eval(mask, dots, error_call = error_call)
@@ -500,6 +538,14 @@ slice_rank_idx <- function(
which[order(ranks[which])]
}
+on_load({
+ # Default used by `slice()`
+ the$slice_by_arg <- ".by"
+})
+dplyr_local_slice_by_arg <- function(by_arg, frame = caller_env()) {
+ local_bindings(slice_by_arg = by_arg, .env = the, .frame = frame)
+}
+
# Backports for R 3.5.0 utils
...length2 <- function(frame = caller_env()) {
length(env_get(frame, "..."))
diff --git a/R/summarise.R b/R/summarise.R
index 5aa2f090a3..bad974ac9a 100644
--- a/R/summarise.R
+++ b/R/summarise.R
@@ -33,6 +33,7 @@
#'
#' @export
#' @inheritParams arrange
+#' @inheritParams args_by
#' @param ... <[`data-masking`][dplyr_data_masking]> Name-value pairs of summary
#' functions. The name will be the name of the variable in the result.
#'
@@ -116,7 +117,13 @@
#' var <- "mass"
#' summarise(starwars, avg = mean(.data[[var]], na.rm = TRUE))
#' # Learn more in ?dplyr_data_masking
-summarise <- function(.data, ..., .groups = NULL) {
+summarise <- function(.data, ..., .by = NULL, .groups = NULL) {
+ by <- enquo(.by)
+
+ if (!quo_is_null(by) && !is.null(.groups)) {
+ abort("Can't supply both `.by` and `.groups`.")
+ }
+
UseMethod("summarise")
}
#' @rdname summarise
@@ -124,19 +131,32 @@ summarise <- function(.data, ..., .groups = NULL) {
summarize <- summarise
#' @export
-summarise.data.frame <- function(.data, ..., .groups = NULL) {
- cols <- summarise_cols(.data, dplyr_quosures(...))
- out <- summarise_build(.data, cols)
+summarise.data.frame <- function(.data, ..., .by = NULL, .groups = NULL) {
+ by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data")
+
+ cols <- summarise_cols(.data, dplyr_quosures(...), by)
+ out <- summarise_build(by, cols)
+
+ if (!is_tibble(.data)) {
+ # The `by` group data we build from is always a tibble,
+ # so we have to manually downcast as needed
+ out <- as.data.frame(out)
+ }
+
if (identical(.groups, "rowwise")) {
out <- rowwise_df(out, character())
}
+
out
}
#' @export
-summarise.grouped_df <- function(.data, ..., .groups = NULL) {
- cols <- summarise_cols(.data, dplyr_quosures(...))
- out <- summarise_build(.data, cols)
+summarise.grouped_df <- function(.data, ..., .by = NULL, .groups = NULL) {
+ # Will always error if `.by != NULL` b/c you can't use it with grouped/rowwise dfs.
+ by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data")
+
+ cols <- summarise_cols(.data, dplyr_quosures(...), by)
+ out <- summarise_build(by, cols)
verbose <- summarise_verbose(.groups, caller_env())
if (is.null(.groups)) {
@@ -147,7 +167,7 @@ summarise.grouped_df <- function(.data, ..., .groups = NULL) {
}
}
- group_vars <- group_vars(.data)
+ group_vars <- by$names
if (identical(.groups, "drop_last")) {
n <- length(group_vars)
if (n > 1) {
@@ -177,12 +197,15 @@ summarise.grouped_df <- function(.data, ..., .groups = NULL) {
}
#' @export
-summarise.rowwise_df <- function(.data, ..., .groups = NULL) {
- cols <- summarise_cols(.data, dplyr_quosures(...))
- out <- summarise_build(.data, cols)
+summarise.rowwise_df <- function(.data, ..., .by = NULL, .groups = NULL) {
+ # Will always error if `.by != NULL` b/c you can't use it with grouped/rowwise dfs.
+ by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data")
+
+ cols <- summarise_cols(.data, dplyr_quosures(...), by)
+ out <- summarise_build(by, cols)
verbose <- summarise_verbose(.groups, caller_env())
- group_vars <- group_vars(.data)
+ group_vars <- by$names
if (is.null(.groups) || identical(.groups, "keep")) {
if (verbose && length(group_vars)) {
new_groups <- glue_collapse(paste0("'", group_vars, "'"), sep = ", ")
@@ -202,10 +225,10 @@ summarise.rowwise_df <- function(.data, ..., .groups = NULL) {
out
}
-summarise_cols <- function(.data, dots, error_call = caller_env()) {
+summarise_cols <- function(data, dots, by, error_call = caller_env()) {
error_call <- dplyr_error_call(error_call)
- mask <- DataMask$new(.data, "summarise", error_call = error_call)
+ mask <- DataMask$new(data, by, "summarise", error_call = error_call)
old_current_column <- context_peek_bare("column")
warnings_state <- env(warnings = list())
@@ -343,8 +366,8 @@ summarise_eval_one <- function(quo, mask) {
list(chunks = chunks_k, types = types_k, results = result_k)
}
-summarise_build <- function(.data, cols) {
- out <- group_keys(.data)
+summarise_build <- function(by, cols) {
+ out <- group_keys0(by$data)
if (!cols$all_one) {
out <- vec_slice(out, rep(seq_len(nrow(out)), cols$sizes))
}
@@ -396,9 +419,19 @@ summarise_bullets <- function(cnd, ...) {
# messaging ---------------------------------------------------------------
summarise_verbose <- function(.groups, .env) {
- is.null(.groups) &&
- is_reference(topenv(.env), global_env()) &&
- !identical(getOption("dplyr.summarise.inform"), FALSE)
+ if (!is.null(.groups)) {
+ # User supplied `.groups`
+ return(FALSE)
+ }
+
+ inform <- getOption("dplyr.summarise.inform")
+
+ if (is_true(inform) || is_false(inform)) {
+ # User supplied global option
+ return(inform)
+ }
+
+ is_reference(topenv(.env), global_env())
}
summarise_inform <- function(..., .env = parent.frame()) {
diff --git a/R/transmute.R b/R/transmute.R
index d6024b841c..106cde9860 100644
--- a/R/transmute.R
+++ b/R/transmute.R
@@ -39,7 +39,10 @@ transmute.data.frame <- function(.data, ...) {
dots <- check_transmute_args(...)
dots <- dplyr_quosures(!!!dots)
- cols <- mutate_cols(.data, dots)
+ # We don't expose `.by` because `transmute()` is superseded
+ by <- compute_by(by = NULL, data = .data)
+
+ cols <- mutate_cols(.data, dots, by)
out <- dplyr_col_modify(.data, cols)
@@ -53,7 +56,7 @@ transmute.data.frame <- function(.data, ...) {
cols_expr <- names(cols)
# Retain untouched group variables up front
- cols_group <- group_vars(.data)
+ cols_group <- by$names
cols_group <- setdiff(cols_group, cols_expr)
cols_retain <- c(cols_group, cols_expr)
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 69b824e1bc..ce0f44a6c0 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -41,6 +41,7 @@ reference:
contents:
- count
- group_by
+ - dplyr_by
- rowwise
- summarise
- 'n'
diff --git a/man/args_by.Rd b/man/args_by.Rd
new file mode 100644
index 0000000000..cdb9aa9e9d
--- /dev/null
+++ b/man/args_by.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/by.R
+\name{args_by}
+\alias{args_by}
+\title{Helper for consistent documentation of \code{.by}}
+\arguments{
+\item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+
+<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to
+temporarily group by using an inline alternative to \code{\link[=group_by]{group_by()}}. For
+details and examples, see \link[=dplyr_by]{?dplyr_by}.}
+}
+\description{
+Use \verb{@inheritParams args_by} to consistently document \code{.by}.
+}
+\keyword{internal}
diff --git a/man/dplyr_by.Rd b/man/dplyr_by.Rd
new file mode 100644
index 0000000000..9c9a85de1a
--- /dev/null
+++ b/man/dplyr_by.Rd
@@ -0,0 +1,212 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/by.R
+\name{dplyr_by}
+\alias{dplyr_by}
+\title{Temporary grouping with \code{.by}}
+\description{
+There are two ways to group in dplyr:
+\itemize{
+\item Persistent grouping with \code{\link[=group_by]{group_by()}}
+\item Temporary grouping with \code{.by}
+}
+
+This help page is dedicated to explaining where and why you might want to use the latter.
+Grouping radically affects the computation of the dplyr verb you use it with, and one of the goals of \code{.by} is to allow you to place that grouping specification alongside the code that actually uses it.
+As an added benefit, with \code{.by} you no longer need to remember to \code{\link[=ungroup]{ungroup()}} after \code{\link[=summarise]{summarise()}}, and \code{summarise()} won't ever message you about how it's handling the groups!
+
+This great idea comes from \href{https://CRAN.R-project.org/package=data.table}{data.table}, which allows you to specify \code{by} alongside modifications in \code{j}, like: \code{dt[, .(x = mean(x)), by = g]}.
+\subsection{Supported verbs}{
+\itemize{
+\item \code{\link[=mutate]{mutate()}}
+\item \code{\link[=summarise]{summarise()}}
+\item \code{\link[=filter]{filter()}}
+\item \code{\link[=slice]{slice()}} and its variants, such as \code{\link[=slice_head]{slice_head()}}
+}
+}
+
+\subsection{Differences between \code{.by} and \code{group_by()}}{\tabular{ll}{
+ \code{.by} \tab \code{group_by()} \cr
+ Grouping only affects a single verb \tab Grouping is persistent across multiple verbs \cr
+ Selects variables with \link[=dplyr_tidy_select]{tidy-select} \tab Computes expressions with \link[=dplyr_data_masking]{data-masking} \cr
+ Summaries use existing order of group keys \tab Summaries sort group keys in ascending order \cr
+}
+
+}
+
+\subsection{Using \code{.by}}{
+
+Let's take a look at the two grouping approaches using this \code{expenses} data set, which tracks costs accumulated across various \code{id}s and \code{region}s:
+
+\if{html}{\out{
}}\preformatted{expenses <- tibble(
+ id = c(1, 2, 1, 3, 1, 2, 3),
+ region = c("A", "A", "A", "B", "B", "A", "A"),
+ cost = c(25, 20, 19, 12, 9, 6, 6)
+)
+expenses
+#> # A tibble: 7 x 3
+#> id region cost
+#>
+#> 1 1 A 25
+#> 2 2 A 20
+#> 3 1 A 19
+#> 4 3 B 12
+#> 5 1 B 9
+#> 6 2 A 6
+#> 7 3 A 6
+}\if{html}{\out{
}}
+
+Imagine that you wanted to compute the average cost per region.
+You'd probably write something like this:
+
+\if{html}{\out{}}\preformatted{expenses \%>\%
+ group_by(region) \%>\%
+ summarise(cost = mean(cost))
+#> # A tibble: 2 x 2
+#> region cost
+#>
+#> 1 A 15.2
+#> 2 B 10.5
+}\if{html}{\out{
}}
+
+Instead, you can now specify the grouping \emph{inline} within the verb:
+
+\if{html}{\out{}}\preformatted{expenses \%>\%
+ summarise(cost = mean(cost), .by = region)
+#> # A tibble: 2 x 2
+#> region cost
+#>
+#> 1 A 15.2
+#> 2 B 10.5
+}\if{html}{\out{
}}
+
+Grouping with \code{.by} is temporary, meaning that since \code{expenses} was an ungrouped data frame, the result after applying \code{.by} will also always be an ungrouped data frame, regardless of the number of grouping columns.
+
+\if{html}{\out{}}\preformatted{expenses \%>\%
+ summarise(cost = mean(cost), .by = c(id, region))
+#> # A tibble: 5 x 3
+#> id region cost
+#>
+#> 1 1 A 22
+#> 2 2 A 13
+#> 3 3 B 12
+#> 4 1 B 9
+#> 5 3 A 6
+}\if{html}{\out{
}}
+
+Compare that with \code{group_by() \%>\% summarise()}, where \code{summarise()} generally peels off 1 layer of grouping by default, typically with a message that it is doing so:
+
+\if{html}{\out{}}\preformatted{expenses \%>\%
+ group_by(id, region) \%>\%
+ summarise(cost = mean(cost))
+#> `summarise()` has grouped output by 'id'. You can override using the `.groups`
+#> argument.
+#> # A tibble: 5 x 3
+#> # Groups: id [3]
+#> id region cost
+#>
+#> 1 1 A 22
+#> 2 1 B 9
+#> 3 2 A 13
+#> 4 3 A 6
+#> 5 3 B 12
+}\if{html}{\out{
}}
+
+Because \code{.by} grouping is temporary, you don't need to worry about ungrouping, and it never needs to emit a message to remind you what it is doing with the groups.
+
+Note that with \code{.by} we specified multiple columns to group by using the \link[=dplyr_tidy_select]{tidy-select} syntax \code{c(id, region)}.
+If you have a character vector of column names you'd like to group by, you can do so with \code{.by = all_of(my_cols)}.
+It will group by the columns in the order they were provided.
+
+To prevent surprising results, you can't use \code{.by} on an existing grouped data frame:
+
+\if{html}{\out{}}\preformatted{expenses \%>\%
+ group_by(id) \%>\%
+ summarise(cost = mean(cost), .by = c(id, region))
+#> Error in `summarise()`:
+#> ! Can't supply `.by` when `.data` is a grouped data frame.
+}\if{html}{\out{
}}
+
+So far we've focused on the usage of \code{.by} with \code{summarise()}, but \code{.by} works with a number of other dplyr verbs.
+For example, you could append the mean cost per region onto the original data frame as a new column rather than computing a summary:
+
+\if{html}{\out{}}\preformatted{expenses \%>\%
+ mutate(cost_by_region = mean(cost), .by = region)
+#> # A tibble: 7 x 4
+#> id region cost cost_by_region
+#>
+#> 1 1 A 25 15.2
+#> 2 2 A 20 15.2
+#> 3 1 A 19 15.2
+#> 4 3 B 12 10.5
+#> 5 1 B 9 10.5
+#> 6 2 A 6 15.2
+#> 7 3 A 6 15.2
+}\if{html}{\out{
}}
+
+Or you could slice out the maximum cost per combination of id and region:
+
+\if{html}{\out{}}\preformatted{expenses \%>\%
+ slice_max(cost, n = 1, by = c(id, region))
+#> # A tibble: 5 x 3
+#> id region cost
+#>
+#> 1 1 A 25
+#> 2 2 A 20
+#> 3 3 B 12
+#> 4 1 B 9
+#> 5 3 A 6
+}\if{html}{\out{
}}
+}
+
+\subsection{Result ordering}{
+
+When used with \code{.by}, \code{summarise()} and \code{slice()} both maintain the ordering of the existing data.
+This is different from \code{group_by()}, which has always sorted the group keys in ascending order.
+
+\if{html}{\out{}}\preformatted{df <- tibble(
+ month = c("jan", "jan", "feb", "feb", "mar"),
+ temp = c(20, 25, 18, 20, 40)
+)
+
+# Uses ordering by "first appearance" in the original data
+df \%>\%
+ summarise(average_temp = mean(temp), .by = month)
+#> # A tibble: 3 x 2
+#> month average_temp
+#>
+#> 1 jan 22.5
+#> 2 feb 19
+#> 3 mar 40
+
+# Sorts in ascending order
+df \%>\%
+ group_by(month) \%>\%
+ summarise(average_temp = mean(temp))
+#> # A tibble: 3 x 2
+#> month average_temp
+#>
+#> 1 feb 19
+#> 2 jan 22.5
+#> 3 mar 40
+}\if{html}{\out{
}}
+
+If you need sorted group keys, we recommend that you explicitly use \code{\link[=arrange]{arrange()}} either before or after the call to \code{summarise()} or \code{slice()}.
+This also gives you full access to all of \code{arrange()}'s features, such as \code{desc()} and the \code{.locale} argument.
+}
+
+\subsection{Verbs without \code{.by} support}{
+
+If a dplyr verb doesn't support \code{.by}, then that typically means that the verb isn't inherently affected by grouping.
+For example, \code{\link[=pull]{pull()}} and \code{\link[=rename]{rename()}} don't support \code{.by}, because specifying columns to group by would not affect their implementations.
+
+That said, there are a few exceptions to this where sometimes a dplyr verb doesn't support \code{.by}, but \emph{does} have special support for grouped data frames created by \code{\link[=group_by]{group_by()}}.
+This is typically because the verbs are required to retain the grouping columns, for example:
+\itemize{
+\item \code{\link[=select]{select()}} always retains grouping columns, with a message if any aren't specified in the \code{select()} call.
+\item \code{\link[=distinct]{distinct()}} and \code{\link[=count]{count()}} place unspecified grouping columns at the front of the data frame before computing their results.
+\item \code{\link[=arrange]{arrange()}} has a \code{.by_group} argument to optionally order by grouping columns first.
+}
+
+If \code{group_by()} didn't exist, then these verbs would not have special support for grouped data frames.
+}
+}
diff --git a/man/filter.Rd b/man/filter.Rd
index 431013b4e9..6bd7bf7469 100644
--- a/man/filter.Rd
+++ b/man/filter.Rd
@@ -4,7 +4,7 @@
\alias{filter}
\title{Keep rows that match a condition}
\usage{
-filter(.data, ..., .preserve = FALSE)
+filter(.data, ..., .by = NULL, .preserve = FALSE)
}
\arguments{
\item{.data}{A data frame, data frame extension (e.g. a tibble), or a
@@ -16,6 +16,12 @@ logical value, and are defined in terms of the variables in \code{.data}.
If multiple expressions are included, they are combined with the \code{&} operator.
Only rows for which all conditions evaluate to \code{TRUE} are kept.}
+\item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+
+<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to
+temporarily group by using an inline alternative to \code{\link[=group_by]{group_by()}}. For
+details and examples, see \link[=dplyr_by]{?dplyr_by}.}
+
\item{.preserve}{Relevant when the \code{.data} input is grouped.
If \code{.preserve = FALSE} (the default), the grouping structure
is recalculated based on the resulting data, otherwise the grouping is kept as is.}
diff --git a/man/mutate.Rd b/man/mutate.Rd
index 018a302be1..666446c3a8 100644
--- a/man/mutate.Rd
+++ b/man/mutate.Rd
@@ -10,6 +10,7 @@ mutate(.data, ...)
\method{mutate}{data.frame}(
.data,
...,
+ .by = NULL,
.keep = c("all", "used", "unused", "none"),
.before = NULL,
.after = NULL
@@ -32,6 +33,12 @@ if ungrouped).
\item A data frame or tibble, to create multiple columns in the output.
}}
+\item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+
+<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to
+temporarily group by using an inline alternative to \code{\link[=group_by]{group_by()}}. For
+details and examples, see \link[=dplyr_by]{?dplyr_by}.}
+
\item{.keep}{Control which columns from \code{.data} are retained in the output. Grouping
columns and columns created by \code{...} are always kept.
\itemize{
diff --git a/man/rmd/by.Rmd b/man/rmd/by.Rmd
new file mode 100644
index 0000000000..65756e8685
--- /dev/null
+++ b/man/rmd/by.Rmd
@@ -0,0 +1,153 @@
+---
+output: html_document
+editor_options:
+ chunk_output_type: console
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+There are two ways to group in dplyr:
+
+- Persistent grouping with [group_by()]
+
+- Temporary grouping with `.by`
+
+This help page is dedicated to explaining where and why you might want to use the latter.
+Grouping radically affects the computation of the dplyr verb you use it with, and one of the goals of `.by` is to allow you to place that grouping specification alongside the code that actually uses it.
+As an added benefit, with `.by` you no longer need to remember to [ungroup()] after [summarise()], and `summarise()` won't ever message you about how it's handling the groups!
+
+This great idea comes from [data.table](https://CRAN.R-project.org/package=data.table), which allows you to specify `by` alongside modifications in `j`, like: `dt[, .(x = mean(x)), by = g]`.
+
+### Supported verbs
+
+- [mutate()]
+
+- [summarise()]
+
+- [filter()]
+
+- [slice()] and its variants, such as [slice_head()]
+
+### Differences between `.by` and `group_by()`
+
+| `.by` | `group_by()` |
+|----------------------------------|-------------------------------------|
+| Grouping only affects a single verb | Grouping is persistent across multiple verbs |
+| Selects variables with [tidy-select][dplyr_tidy_select] | Computes expressions with [data-masking][dplyr_data_masking] |
+| Summaries use existing order of group keys | Summaries sort group keys in ascending order |
+
+### Using `.by`
+
+Let's take a look at the two grouping approaches using this `expenses` data set, which tracks costs accumulated across various `id`s and `region`s:
+
+```{r}
+expenses <- tibble(
+ id = c(1, 2, 1, 3, 1, 2, 3),
+ region = c("A", "A", "A", "B", "B", "A", "A"),
+ cost = c(25, 20, 19, 12, 9, 6, 6)
+)
+expenses
+```
+
+Imagine that you wanted to compute the average cost per region.
+You'd probably write something like this:
+
+```{r}
+expenses %>%
+ group_by(region) %>%
+ summarise(cost = mean(cost))
+```
+
+Instead, you can now specify the grouping *inline* within the verb:
+
+```{r}
+expenses %>%
+ summarise(cost = mean(cost), .by = region)
+```
+
+Grouping with `.by` is temporary, meaning that since `expenses` was an ungrouped data frame, the result after applying `.by` will also always be an ungrouped data frame, regardless of the number of grouping columns.
+
+```{r}
+expenses %>%
+ summarise(cost = mean(cost), .by = c(id, region))
+```
+
+Compare that with `group_by() %>% summarise()`, where `summarise()` generally peels off 1 layer of grouping by default, typically with a message that it is doing so:
+
+```{r}
+expenses %>%
+ group_by(id, region) %>%
+ summarise(cost = mean(cost))
+```
+
+Because `.by` grouping is temporary, you don't need to worry about ungrouping, and it never needs to emit a message to remind you what it is doing with the groups.
+
+Note that with `.by` we specified multiple columns to group by using the [tidy-select][dplyr_tidy_select] syntax `c(id, region)`.
+If you have a character vector of column names you'd like to group by, you can do so with `.by = all_of(my_cols)`.
+It will group by the columns in the order they were provided.
+
+To prevent surprising results, you can't use `.by` on an existing grouped data frame:
+
+```{r, error=TRUE}
+expenses %>%
+ group_by(id) %>%
+ summarise(cost = mean(cost), .by = c(id, region))
+```
+
+So far we've focused on the usage of `.by` with `summarise()`, but `.by` works with a number of other dplyr verbs.
+For example, you could append the mean cost per region onto the original data frame as a new column rather than computing a summary:
+
+```{r}
+expenses %>%
+ mutate(cost_by_region = mean(cost), .by = region)
+```
+
+Or you could slice out the maximum cost per combination of id and region:
+
+```{r}
+expenses %>%
+ slice_max(cost, n = 1, by = c(id, region))
+```
+
+### Result ordering
+
+When used with `.by`, `summarise()` and `slice()` both maintain the ordering of the existing data.
+This is different from `group_by()`, which has always sorted the group keys in ascending order.
+
+```{r}
+df <- tibble(
+ month = c("jan", "jan", "feb", "feb", "mar"),
+ temp = c(20, 25, 18, 20, 40)
+)
+
+# Uses ordering by "first appearance" in the original data
+df %>%
+ summarise(average_temp = mean(temp), .by = month)
+
+# Sorts in ascending order
+df %>%
+ group_by(month) %>%
+ summarise(average_temp = mean(temp))
+```
+
+If you need sorted group keys, we recommend that you explicitly use [arrange()] either before or after the call to `summarise()` or `slice()`.
+This also gives you full access to all of `arrange()`'s features, such as `desc()` and the `.locale` argument.
+
+### Verbs without `.by` support
+
+If a dplyr verb doesn't support `.by`, then that typically means that the verb isn't inherently affected by grouping.
+For example, [pull()] and [rename()] don't support `.by`, because specifying columns to group by would not affect their implementations.
+
+That said, there are a few exceptions to this where sometimes a dplyr verb doesn't support `.by`, but *does* have special support for grouped data frames created by [group_by()].
+This is typically because the verbs are required to retain the grouping columns, for example:
+
+- [select()] always retains grouping columns, with a message if any aren't specified in the `select()` call.
+
+- [distinct()] and [count()] place unspecified grouping columns at the front of the data frame before computing their results.
+
+- [arrange()] has a `.by_group` argument to optionally order by grouping columns first.
+
+If `group_by()` didn't exist, then these verbs would not have special support for grouped data frames.
+
diff --git a/man/slice.Rd b/man/slice.Rd
index c75a4fd59b..cfb89c59de 100644
--- a/man/slice.Rd
+++ b/man/slice.Rd
@@ -9,17 +9,35 @@
\alias{slice_sample}
\title{Subset rows using their positions}
\usage{
-slice(.data, ..., .preserve = FALSE)
+slice(.data, ..., .by = NULL, .preserve = FALSE)
-slice_head(.data, ..., n, prop)
+slice_head(.data, ..., n, prop, by = NULL)
-slice_tail(.data, ..., n, prop)
+slice_tail(.data, ..., n, prop, by = NULL)
-slice_min(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE)
+slice_min(
+ .data,
+ order_by,
+ ...,
+ n,
+ prop,
+ by = NULL,
+ with_ties = TRUE,
+ na_rm = FALSE
+)
-slice_max(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE)
+slice_max(
+ .data,
+ order_by,
+ ...,
+ n,
+ prop,
+ by = NULL,
+ with_ties = TRUE,
+ na_rm = FALSE
+)
-slice_sample(.data, ..., n, prop, weight_by = NULL, replace = FALSE)
+slice_sample(.data, ..., n, prop, by = NULL, weight_by = NULL, replace = FALSE)
}
\arguments{
\item{.data}{A data frame, data frame extension (e.g. a tibble), or a
@@ -35,6 +53,12 @@ Indices beyond the number of rows in the input are silently ignored.
For \verb{slice_*()}, these arguments are passed on to methods.}
+\item{.by, by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+
+<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to
+temporarily group by using an inline alternative to \code{\link[=group_by]{group_by()}}. For
+details and examples, see \link[=dplyr_by]{?dplyr_by}.}
+
\item{.preserve}{Relevant when the \code{.data} input is grouped.
If \code{.preserve = FALSE} (the default), the grouping structure
is recalculated based on the resulting data, otherwise the grouping is kept as is.}
diff --git a/man/summarise.Rd b/man/summarise.Rd
index 99f09dc56d..7c2055fb20 100644
--- a/man/summarise.Rd
+++ b/man/summarise.Rd
@@ -5,9 +5,9 @@
\alias{summarize}
\title{Summarise each group to fewer rows}
\usage{
-summarise(.data, ..., .groups = NULL)
+summarise(.data, ..., .by = NULL, .groups = NULL)
-summarize(.data, ..., .groups = NULL)
+summarize(.data, ..., .by = NULL, .groups = NULL)
}
\arguments{
\item{.data}{A data frame, data frame extension (e.g. a tibble), or a
@@ -24,6 +24,12 @@ The value can be:
\item A data frame, to add multiple columns from a single expression.
}}
+\item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}}
+
+<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to
+temporarily group by using an inline alternative to \code{\link[=group_by]{group_by()}}. For
+details and examples, see \link[=dplyr_by]{?dplyr_by}.}
+
\item{.groups}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Grouping structure of the result.
\itemize{
\item "drop_last": dropping the last level of grouping. This was the
diff --git a/src/chop.cpp b/src/chop.cpp
index 44a89645d5..06c73a9825 100644
--- a/src/chop.cpp
+++ b/src/chop.cpp
@@ -66,7 +66,10 @@ void dplyr_lazy_vec_chop_ungrouped(SEXP chops_env, SEXP data) {
UNPROTECT(1);
}
-SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows) {
+SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows, SEXP ffi_grouped, SEXP ffi_rowwise) {
+ bool grouped = static_cast(LOGICAL_ELT(ffi_grouped, 0));
+ bool rowwise = static_cast(LOGICAL_ELT(ffi_rowwise, 0));
+
// a first environment to hide `.indices` and `.current_group`
// this is for example used by funs::
SEXP indices_env = PROTECT(new_environment(2, R_EmptyEnv));
@@ -75,9 +78,9 @@ SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows) {
// then an environment to hold the chops of the columns
SEXP chops_env = PROTECT(new_environment(XLENGTH(data), indices_env));
- if (Rf_inherits(data, "grouped_df")) {
+ if (grouped) {
dplyr_lazy_vec_chop_grouped(chops_env, rows, data, false);
- } else if (Rf_inherits(data, "rowwise_df")) {
+ } else if (rowwise) {
dplyr_lazy_vec_chop_grouped(chops_env, rows, data, true);
} else {
dplyr_lazy_vec_chop_ungrouped(chops_env, data);
diff --git a/src/dplyr.h b/src/dplyr.h
index 2517a539ed..d00e601f12 100644
--- a/src/dplyr.h
+++ b/src/dplyr.h
@@ -112,7 +112,7 @@ SEXP dplyr_group_keys(SEXP group_data);
SEXP dplyr_mask_remove(SEXP env_private, SEXP s_name);
SEXP dplyr_mask_add(SEXP env_private, SEXP s_name, SEXP ptype, SEXP chunks);
-SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows);
+SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows, SEXP ffi_grouped, SEXP ffi_rowwise);
SEXP dplyr_data_masks_setup(SEXP chops, SEXP data, SEXP rows);
SEXP env_resolved(SEXP env, SEXP names);
void add_mask_binding(SEXP name, SEXP env_bindings, SEXP env_chops);
diff --git a/src/init.cpp b/src/init.cpp
index ba354ad591..7bf0b26f92 100644
--- a/src/init.cpp
+++ b/src/init.cpp
@@ -114,7 +114,7 @@ static const R_CallMethodDef CallEntries[] = {
{"dplyr_mask_remove", (DL_FUNC)& dplyr_mask_remove, 2},
{"dplyr_mask_add", (DL_FUNC)& dplyr_mask_add, 4},
- {"dplyr_lazy_vec_chop_impl", (DL_FUNC)& dplyr_lazy_vec_chop, 2},
+ {"dplyr_lazy_vec_chop_impl", (DL_FUNC)& dplyr_lazy_vec_chop, 4},
{"dplyr_data_masks_setup", (DL_FUNC)& dplyr_data_masks_setup, 3},
{"env_resolved", (DL_FUNC)& env_resolved, 2},
diff --git a/tests/testthat/_snaps/by.md b/tests/testthat/_snaps/by.md
new file mode 100644
index 0000000000..343d99012e
--- /dev/null
+++ b/tests/testthat/_snaps/by.md
@@ -0,0 +1,33 @@
+# throws tidyselect errors
+
+ Code
+ compute_by(by = y, data = df)
+ Condition
+ Error:
+ ! Can't subset columns that don't exist.
+ x Column `y` doesn't exist.
+
+# can't set `.by` with a grouped-df
+
+ Code
+ compute_by(x, gdf)
+ Condition
+ Error:
+ ! Can't supply `by` when `data` is a grouped data frame.
+
+# can't set `.by` with a rowwise-df
+
+ Code
+ compute_by(x, rdf)
+ Condition
+ Error:
+ ! Can't supply `by` when `data` is a rowwise data frame.
+
+# can tweak the error args
+
+ Code
+ compute_by(x, gdf, by_arg = "x", data_arg = "dat")
+ Condition
+ Error:
+ ! Can't supply `x` when `dat` is a grouped data frame.
+
diff --git a/tests/testthat/_snaps/filter.md b/tests/testthat/_snaps/filter.md
index 128e727f1e..4c4f7fd619 100644
--- a/tests/testthat/_snaps/filter.md
+++ b/tests/testthat/_snaps/filter.md
@@ -229,3 +229,27 @@
x y
1 1 1
+# can't use `.by` with `.preserve`
+
+ Code
+ filter(df, .by = x, .preserve = TRUE)
+ Condition
+ Error in `filter()`:
+ ! Can't supply both `.by` and `.preserve`.
+
+# catches `.by` with grouped-df
+
+ Code
+ filter(gdf, .by = x)
+ Condition
+ Error in `filter()`:
+ ! Can't supply `.by` when `.data` is a grouped data frame.
+
+# catches `.by` with rowwise-df
+
+ Code
+ filter(rdf, .by = x)
+ Condition
+ Error in `filter()`:
+ ! Can't supply `.by` when `.data` is a rowwise data frame.
+
diff --git a/tests/testthat/_snaps/mutate.md b/tests/testthat/_snaps/mutate.md
index 1e3edd3e00..2211198702 100644
--- a/tests/testthat/_snaps/mutate.md
+++ b/tests/testthat/_snaps/mutate.md
@@ -59,6 +59,22 @@
! `y` must be size 1, not 2.
i Did you mean: `y = list(x)` ?
+# catches `.by` with grouped-df
+
+ Code
+ mutate(gdf, .by = x)
+ Condition
+ Error in `mutate()`:
+ ! Can't supply `.by` when `.data` is a grouped data frame.
+
+# catches `.by` with rowwise-df
+
+ Code
+ mutate(rdf, .by = x)
+ Condition
+ Error in `mutate()`:
+ ! Can't supply `.by` when `.data` is a rowwise data frame.
+
# mutate() deals with 0 groups (#5534)
Code
diff --git a/tests/testthat/_snaps/slice.md b/tests/testthat/_snaps/slice.md
index d7d90c88ee..750561e94f 100644
--- a/tests/testthat/_snaps/slice.md
+++ b/tests/testthat/_snaps/slice.md
@@ -47,6 +47,30 @@
Caused by error in `1 + ""`:
! non-numeric argument to binary operator
+# can't use `.by` with `.preserve`
+
+ Code
+ slice(df, .by = x, .preserve = TRUE)
+ Condition
+ Error in `slice()`:
+ ! Can't supply both `.by` and `.preserve`.
+
+# catches `.by` with grouped-df
+
+ Code
+ slice(gdf, .by = x)
+ Condition
+ Error in `slice()`:
+ ! Can't supply `.by` when `.data` is a grouped data frame.
+
+# catches `.by` with rowwise-df
+
+ Code
+ slice(rdf, .by = x)
+ Condition
+ Error in `slice()`:
+ ! Can't supply `.by` when `.data` is a rowwise data frame.
+
# slice_helpers() call get_slice_size()
Code
@@ -196,6 +220,34 @@
* ..2 = 2
i Did you forget to name an argument?
+# slice_helper `by` errors use correct error context and correct `by_arg`
+
+ Code
+ slice_head(gdf, n = 1, by = x)
+ Condition
+ Error in `slice_head()`:
+ ! Can't supply `by` when `.data` is a grouped data frame.
+ Code
+ slice_tail(gdf, n = 1, by = x)
+ Condition
+ Error in `slice_tail()`:
+ ! Can't supply `by` when `.data` is a grouped data frame.
+ Code
+ slice_min(gdf, order_by = x, by = x)
+ Condition
+ Error in `slice_min()`:
+ ! Can't supply `by` when `.data` is a grouped data frame.
+ Code
+ slice_max(gdf, order_by = x, by = x)
+ Condition
+ Error in `slice_max()`:
+ ! Can't supply `by` when `.data` is a grouped data frame.
+ Code
+ slice_sample(gdf, n = 1, by = x)
+ Condition
+ Error in `slice_sample()`:
+ ! Can't supply `by` when `.data` is a grouped data frame.
+
# slice_min/max() check size of `order_by=` (#5922)
Code
diff --git a/tests/testthat/_snaps/summarise.md b/tests/testthat/_snaps/summarise.md
index c9fd3c2a86..21f084e581 100644
--- a/tests/testthat/_snaps/summarise.md
+++ b/tests/testthat/_snaps/summarise.md
@@ -1,3 +1,27 @@
+# can't use `.by` with `.groups`
+
+ Code
+ summarise(df, .by = x, .groups = "drop")
+ Condition
+ Error in `summarise()`:
+ ! Can't supply both `.by` and `.groups`.
+
+# catches `.by` with grouped-df
+
+ Code
+ summarise(gdf, .by = x)
+ Condition
+ Error in `summarise()`:
+ ! Can't supply `.by` when `.data` is a grouped data frame.
+
+# catches `.by` with rowwise-df
+
+ Code
+ summarise(rdf, .by = x)
+ Condition
+ Error in `summarise()`:
+ ! Can't supply `.by` when `.data` is a rowwise data frame.
+
# summarise() gives meaningful errors
Code
diff --git a/tests/testthat/test-across.R b/tests/testthat/test-across.R
index b387ea284f..b662b7b5fa 100644
--- a/tests/testthat/test-across.R
+++ b/tests/testthat/test-across.R
@@ -913,7 +913,8 @@ test_that("expand_across() expands lambdas", {
index = 1
)
- DataMask$new(mtcars, "mutate", call("caller"))
+ by <- compute_by(by = NULL, data = mtcars, error_call = call("caller"))
+ DataMask$new(mtcars, by, "mutate", call("caller"))
expect_equal(
map(expand_across(quo), quo_get_expr),
@@ -934,7 +935,8 @@ test_that("expand_if_across() expands lambdas", {
index = 1
)
- DataMask$new(mtcars, "mutate", call("caller"))
+ by <- compute_by(by = NULL, data = mtcars, error_call = call("caller"))
+ DataMask$new(mtcars, by, "mutate", call("caller"))
expect_equal(
map(expand_if_across(quo), quo_squash),
diff --git a/tests/testthat/test-by.R b/tests/testthat/test-by.R
new file mode 100644
index 0000000000..de4dc1a41d
--- /dev/null
+++ b/tests/testthat/test-by.R
@@ -0,0 +1,100 @@
+test_that("computes group data when `by` is set", {
+ df <- tibble(x = c(1, 1, 2, 2, 1))
+
+ out <- compute_by(by = x, data = df)
+ expect_identical(out$type, "grouped")
+ expect_identical(out$names, "x")
+
+ expect_identical(
+ out$data,
+ tibble(x = c(1, 2), ".rows" := list_of(c(1L, 2L, 5L), c(3L, 4L)))
+ )
+})
+
+test_that("computes `by` group data in order of appearance", {
+ df <- tibble(
+ x = c(5, 4, 5, 5),
+ y = c(2, 3, 1, 2)
+ )
+
+ out <- compute_by(by = c(x, y), data = df)
+
+ expect <- tibble(
+ x = c(5, 4, 5),
+ y = c(2, 3, 1),
+ ".rows" := list_of(c(1L, 4L), 2L, 3L)
+ )
+
+ expect_identical(out$data, expect)
+})
+
+test_that("extracts existing data when `by = NULL`", {
+ df <- data.frame(x = c(1, 1, 2, 2, 1))
+ out <- compute_by(by = NULL, data = df)
+ expect_identical(out$type, "ungrouped")
+ expect_identical(out$names, character())
+ # `compute_by()` is always type stable on `$data` and returns a bare tibble
+ expect_identical(out$data, as_tibble(group_data(df)))
+
+ df <- tibble(x = c(1, 1, 2, 2, 1))
+ out <- compute_by(by = NULL, data = df)
+ expect_identical(out$type, "ungrouped")
+ expect_identical(out$names, character())
+ expect_identical(out$data, group_data(df))
+
+ gdf <- group_by(df, x)
+ out <- compute_by(by = NULL, data = gdf)
+ expect_identical(out$type, "grouped")
+ expect_identical(out$names, "x")
+ expect_identical(out$data, group_data(gdf))
+
+ rdf <- rowwise(df)
+ out <- compute_by(by = NULL, data = rdf)
+ expect_identical(out$type, "rowwise")
+ expect_identical(out$names, character())
+ expect_identical(out$data, group_data(rdf))
+})
+
+test_that("empty selection results in ungrouped group data", {
+ df <- tibble(x = 1)
+
+ out <- compute_by(by = c(), data = df)
+ expect_identical(out$type, "ungrouped")
+ expect_identical(out$names, character())
+ expect_identical(out$data, group_data(df))
+})
+
+test_that("throws tidyselect errors", {
+ df <- tibble(x = 1)
+
+ expect_snapshot(error = TRUE, {
+ compute_by(by = y, data = df)
+ })
+})
+
+test_that("can't set `.by` with a grouped-df", {
+ df <- tibble(x = 1:5)
+ gdf <- group_by(df, x)
+
+ expect_snapshot(error = TRUE, {
+ compute_by(x, gdf)
+ })
+})
+
+test_that("can't set `.by` with a rowwise-df", {
+ df <- tibble(x = 1:5)
+ rdf <- rowwise(df)
+
+ expect_snapshot(error = TRUE, {
+ compute_by(x, rdf)
+ })
+})
+
+test_that("can tweak the error args", {
+ df <- tibble(x = 1:5)
+ gdf <- group_by(df, x)
+
+ expect_snapshot(error = TRUE, {
+ compute_by(x, gdf, by_arg = "x", data_arg = "dat")
+ })
+})
diff --git a/tests/testthat/test-filter.R b/tests/testthat/test-filter.R
index f7c33d3cc7..ac78adc544 100644
--- a/tests/testthat/test-filter.R
+++ b/tests/testthat/test-filter.R
@@ -587,3 +587,62 @@ test_that("if_any() and if_all() work", {
filter(df, x1 > 6 | x2 > 6)
)
})
+
+# .by -------------------------------------------------------------------------
+
+test_that("can group transiently using `.by`", {
+ df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 10, 1, 2, 3))
+
+ out <- filter(df, x > mean(x), .by = g)
+
+ expect_identical(out$g, c(1, 2))
+ expect_identical(out$x, c(10, 3))
+ expect_s3_class(out, class(df), exact = TRUE)
+})
+
+test_that("transient grouping retains bare data.frame class", {
+ df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 10, 1, 2, 3))
+ out <- filter(df, x > mean(x), .by = g)
+ expect_s3_class(out, class(df), exact = TRUE)
+})
+
+test_that("transient grouping retains data frame attributes", {
+ # With data.frames or tibbles
+ df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 1))
+ tbl <- as_tibble(df)
+
+ attr(df, "foo") <- "bar"
+ attr(tbl, "foo") <- "bar"
+
+ out <- filter(df, x > mean(x), .by = g)
+ expect_identical(attr(out, "foo"), "bar")
+
+ out <- filter(tbl, x > mean(x), .by = g)
+ expect_identical(attr(out, "foo"), "bar")
+})
+
+test_that("can't use `.by` with `.preserve`", {
+ df <- tibble(x = 1)
+
+ expect_snapshot(error = TRUE, {
+ filter(df, .by = x, .preserve = TRUE)
+ })
+})
+
+test_that("catches `.by` with grouped-df", {
+ df <- tibble(x = 1)
+ gdf <- group_by(df, x)
+
+ expect_snapshot(error = TRUE, {
+ filter(gdf, .by = x)
+ })
+})
+
+test_that("catches `.by` with rowwise-df", {
+ df <- tibble(x = 1)
+ rdf <- rowwise(df)
+
+ expect_snapshot(error = TRUE, {
+ filter(rdf, .by = x)
+ })
+})
diff --git a/tests/testthat/test-mutate.R b/tests/testthat/test-mutate.R
index b0631a1557..baed61ef78 100644
--- a/tests/testthat/test-mutate.R
+++ b/tests/testthat/test-mutate.R
@@ -3,16 +3,22 @@ test_that("empty mutate returns input", {
gf <- group_by(df, x)
expect_equal(mutate(df), df)
+ expect_equal(mutate(df, .by = x), df)
expect_equal(mutate(gf), gf)
expect_equal(mutate(df, !!!list()), df)
+ expect_equal(mutate(df, !!!list(), .by = x), df)
expect_equal(mutate(gf, !!!list()), gf)
})
test_that("rownames preserved", {
df <- data.frame(x = c(1, 2), row.names = c("a", "b"))
+
df <- mutate(df, y = 2)
expect_equal(row.names(df), c("a", "b"))
+
+ df <- mutate(df, y = 2, .by = x)
+ expect_equal(row.names(df), c("a", "b"))
})
test_that("mutations applied progressively", {
@@ -74,9 +80,8 @@ test_that("assignments don't overwrite variables (#315)", {
expect_equal(out, tibble(x = 1, y = 2, z = 10))
})
-test_that("can mutate a data frame with zero columns and `NULL` column names", {
+test_that("can mutate a data frame with zero columns", {
df <- new_data_frame(n = 2L)
- colnames(df) <- NULL
expect_equal(mutate(df, x = 1), data.frame(x = c(1, 1)))
})
@@ -382,6 +387,65 @@ test_that("DataMask$add() forces chunks (#4677)", {
expect_equal(df$log_e_bf01, log(1 / 0.244))
})
+# .by -------------------------------------------------------------------------
+
+test_that("can group transiently using `.by`", {
+ df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3))
+
+ out <- mutate(df, x = mean(x), .by = g)
+
+ expect_identical(out$g, df$g)
+ expect_identical(out$x, c(3, 3, 2, 3, 2))
+ expect_s3_class(out, class(df), exact = TRUE)
+})
+
+test_that("transient grouping retains bare data.frame class", {
+ df <- data.frame(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3))
+ out <- mutate(df, x = mean(x), .by = g)
+ expect_s3_class(out, class(df), exact = TRUE)
+})
+
+test_that("transient grouping retains data frame attributes (#6100)", {
+ # With data.frames or tibbles
+ df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 1))
+ tbl <- as_tibble(df)
+
+ attr(df, "foo") <- "bar"
+ attr(tbl, "foo") <- "bar"
+
+ out <- mutate(df, x = mean(x), .by = g)
+ expect_identical(attr(out, "foo"), "bar")
+
+ out <- mutate(tbl, x = mean(x), .by = g)
+ expect_identical(attr(out, "foo"), "bar")
+})
+
+test_that("can `NULL` out the `.by` column", {
+ df <- tibble(x = 1:3)
+
+ expect_identical(
+ mutate(df, x = NULL, .by = x),
+ new_tibble(list(), nrow = 3)
+ )
+})
+
+test_that("catches `.by` with grouped-df", {
+ df <- tibble(x = 1)
+ gdf <- group_by(df, x)
+
+ expect_snapshot(error = TRUE, {
+ mutate(gdf, .by = x)
+ })
+})
+
+test_that("catches `.by` with rowwise-df", {
+ df <- tibble(x = 1)
+ rdf <- rowwise(df)
+
+ expect_snapshot(error = TRUE, {
+ mutate(rdf, .by = x)
+ })
+})
# .before, .after, .keep ------------------------------------------------------
diff --git a/tests/testthat/test-pick.R b/tests/testthat/test-pick.R
index 8441cb82cf..48a439042a 100644
--- a/tests/testthat/test-pick.R
+++ b/tests/testthat/test-pick.R
@@ -503,7 +503,8 @@ test_that("`pick()` can be used inside `group_by()` wrappers", {
test_that("`pick()` doesn't expand across anonymous function boundaries", {
df <- tibble(x = 1, y = 2)
- mask <- DataMask$new(df, verb = "mutate", error_call = current_env())
+ by <- compute_by(by = NULL, data = df, error_call = current_env())
+ mask <- DataMask$new(df, by, verb = "mutate", error_call = current_env())
# With inline `function() { }` calls (this also handles native R anonymous functions)
quo <- dplyr_quosures(z = function() pick(y, x))$z
@@ -516,7 +517,8 @@ test_that("`pick()` doesn't expand across anonymous function boundaries", {
test_that("`pick()` expands embedded quosures", {
df <- tibble(x = 1, y = 2)
- mask <- DataMask$new(df, verb = "mutate", error_call = current_env())
+ by <- compute_by(by = NULL, data = df, error_call = current_env())
+ mask <- DataMask$new(df, by, verb = "mutate", error_call = current_env())
wrapper <- function(x) {
dplyr_quosures(z = dense_rank({{x}}))
diff --git a/tests/testthat/test-slice.R b/tests/testthat/test-slice.R
index 8e78cde13d..95ba3350d7 100644
--- a/tests/testthat/test-slice.R
+++ b/tests/testthat/test-slice.R
@@ -91,6 +91,72 @@ test_that("user errors are correctly labelled", {
})
})
+test_that("can group transiently using `.by`", {
+ df <- tibble(g = c(1, 1, 2), x = c(1, 2, 3))
+
+ out <- slice(df, n(), .by = g)
+
+ expect_identical(out$g, c(1, 2))
+ expect_identical(out$x, c(2, 3))
+ expect_s3_class(out, class(df), exact = TRUE)
+})
+
+test_that("transient grouping retains bare data.frame class", {
+ df <- tibble(g = c(1, 1, 2), x = c(1, 2, 3))
+ out <- slice(df, n(), .by = g)
+ expect_s3_class(out, class(df), exact = TRUE)
+})
+
+test_that("transient grouping retains data frame attributes", {
+ # With data.frames or tibbles
+ df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 3))
+ tbl <- as_tibble(df)
+
+ attr(df, "foo") <- "bar"
+ attr(tbl, "foo") <- "bar"
+
+ out <- slice(df, n(), .by = g)
+ expect_identical(attr(out, "foo"), "bar")
+
+ out <- slice(tbl, n(), .by = g)
+ expect_identical(attr(out, "foo"), "bar")
+})
+
+test_that("transient grouping orders by first appearance", {
+ df <- tibble(g = c(2, 1, 2, 0), x = c(4, 2, 8, 5))
+
+ out <- slice(df, which(x == max(x)), .by = g)
+
+ expect_identical(out$g, c(2, 1, 0))
+ expect_identical(out$x, c(8, 2, 5))
+})
+
+test_that("can't use `.by` with `.preserve`", {
+ df <- tibble(x = 1)
+
+ expect_snapshot(error = TRUE, {
+ slice(df, .by = x, .preserve = TRUE)
+ })
+})
+
+test_that("catches `.by` with grouped-df", {
+ df <- tibble(x = 1)
+ gdf <- group_by(df, x)
+
+ expect_snapshot(error = TRUE, {
+ slice(gdf, .by = x)
+ })
+})
+
+test_that("catches `.by` with rowwise-df", {
+ df <- tibble(x = 1)
+ rdf <- rowwise(df)
+
+ expect_snapshot(error = TRUE, {
+ slice(rdf, .by = x)
+ })
+})
+
# Slice variants ----------------------------------------------------------
test_that("slice_helpers() call get_slice_size()", {
@@ -221,6 +287,19 @@ test_that("slice_helpers do call slice() and benefit from dispatch (#6084)", {
expect_warning(sample_frac(nf, .5), "noisy")
})
+test_that("slice_helper `by` errors use correct error context and correct `by_arg`", {
+ df <- tibble(x = 1)
+ gdf <- group_by(df, x)
+
+ expect_snapshot(error = TRUE, {
+ slice_head(gdf, n = 1, by = x)
+ slice_tail(gdf, n = 1, by = x)
+ slice_min(gdf, order_by = x, by = x)
+ slice_max(gdf, order_by = x, by = x)
+ slice_sample(gdf, n = 1, by = x)
+ })
+})
+
# slice_min/slice_max -----------------------------------------------------
test_that("min and max return ties by default", {
@@ -287,6 +366,13 @@ test_that("slice_min/max() can order by multiple variables (#6176)", {
expect_equal(slice_max(df, tibble(x, y), n = 1)$id, 2)
})
+test_that("slice_min/max() work with `by`", {
+ df <- tibble(g = c(2, 2, 1, 1), x = c(1, 2, 3, 1))
+
+ expect_identical(slice_min(df, x, by = g), df[c(1, 4),])
+ expect_identical(slice_max(df, x, by = g), df[c(2, 3),])
+})
+
test_that("slice_min/max() check size of `order_by=` (#5922)", {
expect_snapshot(error = TRUE, {
slice_min(data.frame(x = 1:10), 1:6)
@@ -360,6 +446,11 @@ test_that("slice_sample() handles negative n= and prop= (#6402)", {
expect_equal(nrow(slice_sample(df, prop = -2)), 0)
})
+test_that("slice_sample() works with `by`", {
+ df <- tibble(g = c(2, 2, 2, 1), x = c(1, 2, 3, 1))
+ expect_identical(slice_sample(df, n = 2, by = g)$g, c(2, 2, 1))
+})
+
# slice_head/slice_tail ---------------------------------------------------
test_that("slice_head/slice_tail keep positive values", {
@@ -404,3 +495,9 @@ test_that("slice_head/slice_tail handle infinite n/prop", {
expect_identical(slice_head(df, prop = -Inf), df[0, ])
expect_identical(slice_tail(df, prop = -Inf), df[0, ])
})
+
+test_that("slice_head/slice_tail work with `by`", {
+ df <- tibble(g = c(2, 2, 2, 1), x = c(1, 2, 3, 1))
+ expect_identical(slice_head(df, n = 2, by = g), df[c(1, 2, 4),])
+ expect_identical(slice_tail(df, n = 2, by = g), df[c(2, 3, 4),])
+})
diff --git a/tests/testthat/test-summarise.R b/tests/testthat/test-summarise.R
index c42b39629c..31f85f3f47 100644
--- a/tests/testthat/test-summarise.R
+++ b/tests/testthat/test-summarise.R
@@ -257,6 +257,76 @@ test_that("summarise() silently skips when all results are NULL (#5708)", {
expect_error(summarise(df, x = if(g == 1) 42))
})
+# .by ----------------------------------------------------------------------
+
+test_that("can group transiently using `.by`", {
+ df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3))
+
+ out <- summarise(df, x = mean(x), .by = g)
+
+ expect_identical(out$g, c(1, 2))
+ expect_identical(out$x, c(3, 2))
+ expect_s3_class(out, class(df), exact = TRUE)
+})
+
+test_that("transient grouping retains bare data.frame class", {
+ df <- data.frame(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3))
+ out <- summarise(df, x = mean(x), .by = g)
+ expect_s3_class(out, class(df), exact = TRUE)
+})
+
+test_that("transient grouping drops data frame attributes", {
+ # Because `summarise()` theoretically creates a "new" data frame
+
+ # With data.frames or tibbles
+ df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 1))
+ tbl <- as_tibble(df)
+
+ attr(df, "foo") <- "bar"
+ attr(tbl, "foo") <- "bar"
+
+ out <- summarise(df, x = mean(x), .by = g)
+ expect_null(attr(out, "foo"))
+
+ out <- summarise(tbl, x = mean(x), .by = g)
+ expect_null(attr(out, "foo"))
+})
+
+test_that("transient grouping orders by first appearance", {
+ df <- tibble(g = c(2, 1, 2, 0), x = c(4, 2, 8, 5))
+
+ out <- summarise(df, x = mean(x), .by = g)
+
+ expect_identical(out$g, c(2, 1, 0))
+ expect_identical(out$x, c(6, 2, 5))
+})
+
+test_that("can't use `.by` with `.groups`", {
+ df <- tibble(x = 1)
+
+ expect_snapshot(error = TRUE, {
+ summarise(df, .by = x, .groups = "drop")
+ })
+})
+
+test_that("catches `.by` with grouped-df", {
+ df <- tibble(x = 1)
+ gdf <- group_by(df, x)
+
+ expect_snapshot(error = TRUE, {
+ summarise(gdf, .by = x)
+ })
+})
+
+test_that("catches `.by` with rowwise-df", {
+ df <- tibble(x = 1)
+ rdf <- rowwise(df)
+
+ expect_snapshot(error = TRUE, {
+ summarise(rdf, .by = x)
+ })
+})
+
# errors -------------------------------------------------------------------
test_that("summarise() preserves the call stack on error (#5308)", {