Performance issues with data_filter() when tidyverse is loaded? (#651)

strengejacke · etiennebacher · web-flow · commit 98a5a78088b9 · 2025-09-09T18:48:21.000+02:00
* Performance issues with `data_filter()` when tidyverse is loaded? Fixes #650
* add test
* news
* same for data_arrange
* news
* also data_duplicated
* also data_unique
* version
* Update NEWS.md

  Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>

---------

Co-authored-by: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 1.2.0.6
+Version: 1.2.0.7
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531")),
diff --git a/NEWS.md b/NEWS.md
@@ -21,6 +21,9 @@ CHANGES
 * `display()` methods now support the `{tinytable}` package. Use `format = "tt"`
   to export tables as `tinytable` objects (#646).
 
+* Improved performance of `data_arrange()`, `data_duplicated()`, `data_filter()`
+  and `data_unique()` when the input is a grouped `tibble` (#651).
+
 BUG FIXES
 
 * Fixed an issue when `demean()`ing nested structures with more than 2 grouping
diff --git a/R/data_arrange.R b/R/data_arrange.R
@@ -33,9 +33,19 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) {
     return(data)
   }
 
+  original_x <- data
+
   # Input validation check
   data <- .coerce_to_dataframe(data)
 
+  # Remove tidyverse attributes, will add them back at the end
+  if (inherits(original_x, "tbl_df")) {
+    tbl_input <- TRUE
+    data <- as.data.frame(data, stringsAsFactors = FALSE)
+  } else {
+    tbl_input <- FALSE
+  }
+
   # find which vars should be decreasing
   desc <- select[startsWith(select, "-")]
   desc <- gsub("^-", "", desc)
@@ -95,15 +105,27 @@ data_arrange.default <- function(data, select = NULL, safe = TRUE) {
     rownames(out) <- NULL
   }
 
+  # add back custom attributes
+  out <- .replace_attrs(out, attributes(original_x))
+
   out
 }
 
 
 #' @export
 data_arrange.grouped_df <- function(data, select = NULL, safe = TRUE) {
+  original_x <- data
   grps <- attr(data, "groups", exact = TRUE)
   grps <- grps[[".rows"]]
 
+  # Remove tidyverse attributes, will add them back at the end
+  if (inherits(data, "tbl_df")) {
+    tbl_input <- TRUE
+    data <- as.data.frame(data, stringsAsFactors = FALSE)
+  } else {
+    tbl_input <- FALSE
+  }
+
   out <- lapply(grps, function(x) {
     data_arrange.default(data[x, ], select = select, safe = safe)
   })
@@ -114,5 +136,13 @@ data_arrange.grouped_df <- function(data, select = NULL, safe = TRUE) {
     rownames(out) <- NULL
   }
 
+  # add back tidyverse attributes
+  if (isTRUE(tbl_input)) {
+    class(out) <- c("tbl_df", "tbl", "data.frame")
+  }
+
+  # add back custom attributes
+  out <- .replace_attrs(out, attributes(original_x))
+
   out
 }
diff --git a/R/data_duplicated.R b/R/data_duplicated.R
@@ -86,6 +86,8 @@ data_duplicated.grouped_df <- function(data,
   grps <- attr(data, "groups", exact = TRUE)
   grps <- grps[[".rows"]]
 
+  data <- as.data.frame(data)
+
   out <- lapply(grps, function(x) {
     data_duplicated.data.frame(data[x, ], select = select)
   })
diff --git a/R/data_match.R b/R/data_match.R
@@ -183,6 +183,15 @@ data_filter <- function(x, ...) {
 #' @export
 data_filter.data.frame <- function(x, ...) {
   out <- x
+
+  # convert tibble to data.frame
+  if (inherits(x, "tbl_df")) {
+    out <- as.data.frame(out, stringsAsFactors = FALSE)
+    tbl_input <- TRUE
+  } else {
+    tbl_input <- FALSE
+  }
+
   dots <- match.call(expand.dots = FALSE)[["..."]]
 
   if (any(nzchar(names(dots), keepNA = TRUE))) {
@@ -275,15 +284,30 @@ data_filter.data.frame <- function(x, ...) {
 
   # add back custom attributes
   out <- .replace_attrs(out, attributes(x))
+
+  # add back tidyverse attributes
+  if (isTRUE(tbl_input)) {
+    class(out) <- c("tbl_df", "tbl", "data.frame")
+  }
+
   out
 }
 
 
 #' @export
 data_filter.grouped_df <- function(x, ...) {
+  original_x <- x
   grps <- attr(x, "groups", exact = TRUE)
   grps <- grps[[".rows"]]
 
+  # Remove tidyverse attributes, will add them back at the end
+  if (inherits(x, "tbl_df")) {
+    tbl_input <- TRUE
+    x <- as.data.frame(x, stringsAsFactors = FALSE)
+  } else {
+    tbl_input <- FALSE
+  }
+
   dots <- match.call(expand.dots = FALSE)[["..."]]
   out <- lapply(grps, function(grp) {
     arguments <- list(x[grp, ])
@@ -297,6 +321,14 @@ data_filter.grouped_df <- function(x, ...) {
     rownames(out) <- NULL
   }
 
+  # add back tidyverse attributes
+  if (isTRUE(tbl_input)) {
+    class(out) <- c("tbl_df", "tbl", "data.frame")
+  }
+
+  # add back custom attributes
+  out <- .replace_attrs(out, attributes(original_x))
+
   out
 }
 
diff --git a/R/data_unique.R b/R/data_unique.R
@@ -122,7 +122,7 @@ data_unique.grouped_df <- function(data,
   grps <- attr(data, "groups", exact = TRUE)
   grps <- grps[[".rows"]]
 
-  data2 <- data_ungroup(data)
+  data2 <- as.data.frame(data_ungroup(data))
 
   out <- lapply(grps, function(x) {
     data_unique.data.frame(data2[x, ], select = select, keep = keep, verbose = verbose)
diff --git a/tests/testthat/test-data_match.R b/tests/testthat/test-data_match.R
@@ -345,3 +345,22 @@ test_that("data_filter, slicing works with functions", {
   )
   # styler: on
 })
+
+
+test_that("data_filter works with tibbles", {
+  skip_if_not_installed("tibble")
+  skip_if_not_installed("dplyr")
+  data(mtcars)
+
+  # ungrouped tibble input: the filtered result must still be a tbl_df
+  d <- tibble::as_tibble(mtcars)
+  out <- data_filter(d, mpg > 15)
+  expect_s3_class(out, "tbl_df")
+
+  # grouped tibble input: result keeps tbl_df class and the "groups" attribute
+  d <- tibble::as_tibble(mtcars)
+  d <- dplyr::group_by(d, cyl)
+  out <- data_filter(d, mpg > 15)
+  expect_s3_class(out, "tbl_df")
+  expect_named(attr(out, "groups"), c("cyl", ".rows"))
+})