add sort_by.data.table (#6679)

rikivillalba · MichaelChirico · web-flow · commit 3aee4303860d · 2025-02-25T11:35:27.000-08:00
* add sort_by.data.table

* style

* add R >= 4.4.0 guard for sort_by method

* Tidy up NEWS

* additional tests, test id -> 2301.

* man entry on setorder.Rd

* style

* style

* only run tests if sort_by exists

* clarify sort_by R >= 4.4.0

---------

Co-authored-by: Michael Chirico <chiricom@google.com>
Co-authored-by: Michael Chirico <michaelchirico4@gmail.com>
diff --git a/NAMESPACE b/NAMESPACE
@@ -206,3 +206,6 @@ S3method(format_list_item, data.frame)
 
 export(fdroplevels, setdroplevels)
 S3method(droplevels, data.table)
+
+# sort_by added in R 4.4.0, #6662, https://stat.ethz.ch/pipermail/r-announce/2024/000701.html
+if (getRversion() >= "4.4.0") S3method(sort_by, data.table)  
diff --git a/NEWS.md b/NEWS.md
@@ -2,7 +2,9 @@
 
 # data.table [v1.17.99](https://github.com/Rdatatable/data.table/milestone/35)  (in development)
 
+## NEW FEATURES
 
+1. New `sort_by()` method for data.tables, [#6662](https://github.com/Rdatatable/data.table/issues/6662). It uses `forder()` to improve upon the data.frame method and also match `DT[order(...)]` behavior with respect to locale. Thanks @rikivillalba for the suggestion and PR.
 
 # data.table [v1.17.0](https://github.com/Rdatatable/data.table/milestone/34)  (20 Feb 2025)
 
diff --git a/R/data.table.R b/R/data.table.R
@@ -2526,6 +2526,18 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR
   }
 }
 
+sort_by.data.table <- function(x, y, ...) # sort_by() S3 method for data.table: returns the rows of x ordered by the keys in y
+{
+  if (!cedta()) return(NextMethod()) # nocov
+  if (inherits(y, "formula"))
+    y <- .formula2varlist(y, x) # formula interface (e.g. ~a + b): turn the formula into a list of sort keys, using x
+  if (!is.list(y))
+    y <- list(y) # a single non-list key is wrapped so that c() below builds one argument per key
+  # use forder instead of base 'order'; keys are passed unnamed (presumably so they cannot match forder's named arguments) and ... is forwarded as-is
+  o <- do.call(forder, c(unname(y), list(...)))
+  x[o, , drop=FALSE]
+}
+
 # TO DO, add more warnings e.g. for by.data.table(), telling user what the data.table syntax is but letting them dispatch to data.frame if they want
 
 copy = function(x) {
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -21041,3 +21041,31 @@ test(2304.100, set(copy(DT), i=2L, j=c("L1", "L2"), value=list(list(NULL), list(
 
 # the integer overflow in #6729 is only noticeable with UBSan
 test(2305, { fread(testDir("issue_6729.txt.bz2")); TRUE })
+
+if (exists("sort_by", "package:base")) {
+  # sort_by.data.table: the generic only exists in base from R 4.4.0, so these tests are skipped when it is absent
+  DT1 = data.table(a=c(1, 3, 2, NA, 3), b=4:0)
+  DT2 = data.table(a=c("c", "a", "B")) # data.table sorts in C-locale, and sort_by should do the same when cedta()
+  DT3 = data.table(a=c(1, 2, 3), b=list(c("a", "b", "", NA), c(1, 3, 2, 0), c(TRUE, TRUE, FALSE, NA))) # list column
+
+  # sort_by.data.table: basics (formula interface; NA sorts last by default)
+  test(2306.01, sort_by(DT1, ~a + b), data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L)))
+  test(2306.02, sort_by(DT1, ~I(a + b)), data.table(a=c(3, 2, 1, 3, NA), b=c(0L, 2L, 4L, 3L, 1L)))
+  test(2306.03, sort_by(DT2, ~a), data.table(a=c("B", "a", "c")))
+
+  # sort_by.data.table: list columns.
+  # NOTE 1: .formula2varlist works well with list columns.
+  # NOTE 2: 4 elements in a DT of 3 rows because forderv takes a list column as a DT.
+  test(2306.04, sort_by(DT3, ~b), DT3[order(b)]) # should be consistent with DT[order(...)].
+
+  # sort_by.data.table: additional C-locale sorting (called from inside DT[...] on .SD)
+  test(2306.10, DT2[, sort_by(.SD, a)], data.table(a=c("B", "a", "c")))
+  test(2306.11, DT2[, sort_by(.SD, ~a)], data.table(a=c("B", "a", "c")))
+
+  # sort_by.data.table: various working interfaces (list, data.table, .() and formula all give the same order)
+  test(2306.20, sort_by(DT1, list(DT1$a, DT1$b)), data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L)))
+  test(2306.21, sort_by(DT1, DT1[, .(a, b)]), data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L)))
+  test(2306.22, DT1[, sort_by(.SD, .(a, b))], data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L)))
+  test(2306.23, DT1[, sort_by(.SD, ~a + b)], data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L)))
+  test(2306.24, DT1[, sort_by(.SD, ~.(a, b))], data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L)))
+}
diff --git a/man/setorder.Rd b/man/setorder.Rd
@@ -5,6 +5,7 @@
 \alias{fastorder}
 \alias{forder}
 \alias{forderv}
+\alias{sort_by}
 
 \title{Fast row reordering of a data.table by reference}
 \description{
@@ -32,6 +33,7 @@ setorderv(x, cols = colnames(x), order=1L, na.last=FALSE)
 # optimised to use data.table's internal fast order
 # x[order(., na.last=TRUE)]
 # x[order(., decreasing=TRUE)]
+# sort_by(x, ., na.last=TRUE, decreasing=FALSE)    # R >= 4.4.0
 }
 \arguments{
 \item{x}{ A \code{data.table}. }
@@ -46,7 +48,7 @@ when \code{b} is of type \code{character} as well. }
 \code{order} must be either \code{1} or equal to that of \code{cols}. If
 \code{length(order) == 1}, it is recycled to \code{length(cols)}. }
 \item{na.last}{ \code{logical}. If \code{TRUE}, missing values in the data are placed last; if \code{FALSE}, they are placed first; if \code{NA} they are removed.
-\code{na.last=NA} is valid only for \code{x[order(., na.last)]} and its
+\code{na.last=NA} is valid only for \code{x[order(., na.last)]} and the related \code{sort_by(x, .)} (\R >= 4.4.0), where its
 default is \code{TRUE}. \code{setorder} and \code{setorderv} only accept
 \code{TRUE}/\code{FALSE} with default \code{FALSE}. }
 }
@@ -71,8 +73,8 @@ sets the \code{sorted} attribute.
 
 \code{na.last} argument, by default, is \code{FALSE} for \code{setorder} and
 \code{setorderv} to be consistent with \code{data.table}'s \code{setkey} and
-is \code{TRUE} for \code{x[order(.)]} to be consistent with \code{base::order}.
-Only \code{x[order(.)]} can have \code{na.last = NA} as it is a subset operation
+is \code{TRUE} for \code{x[order(.)]} and \code{sort_by(x, .)} (\R >= 4.4.0) to be consistent with \code{base::order}.
+Only \code{x[order(.)]} (and related \code{sort_by(x, .)}) can have \code{na.last = NA} as it is a subset operation
 as opposed to \code{setorder} or \code{setorderv} which reorders the data.table
 by reference.
 
@@ -96,6 +98,11 @@ was started in. By contrast, \code{"america" < "BRAZIL"} is always \code{FALSE}
 
 If \code{setorder} results in reordering of the rows of a keyed \code{data.table},
 then its key will be set to \code{NULL}.
+
+Starting from \R 4.4.0, \code{sort_by(x, y, \dots)} dispatches to the \code{data.table} S3 method of the generic \code{sort_by}.
+It accepts the same formula and list interfaces as the data.frame method of \code{sort_by}, but internally uses \code{data.table}'s fast ordering,
+hence it behaves the same as \code{x[order(.)]} and takes the same optional named arguments, with the same defaults.
+
 }
 \value{
 The input is modified by reference, and returned (invisibly) so it can be used