diff --git a/NAMESPACE b/NAMESPACE index 2341ac9356..dc3cfe7d0d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -206,3 +206,6 @@ S3method(format_list_item, data.frame) export(fdroplevels, setdroplevels) S3method(droplevels, data.table) + +# sort_by added in R 4.4.0, #6662, https://stat.ethz.ch/pipermail/r-announce/2024/000701.html +if (getRversion() >= "4.4.0") S3method(sort_by, data.table) diff --git a/NEWS.md b/NEWS.md index 3f72429fb2..f22d066253 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,9 @@ # data.table [v1.17.99](https://github.com/Rdatatable/data.table/milestone/35) (in development) +## NEW FEATURES +1. New `sort_by()` method for data.tables, [#6662](https://github.com/Rdatatable/data.table/issues/6662). It uses `forder()` to improve upon the data.frame method and also match `DT[order(...)]` behavior with respect to locale. Thanks @rikivillalba for the suggestion and PR. # data.table [v1.17.0](https://github.com/Rdatatable/data.table/milestone/34) (20 Feb 2025) diff --git a/R/data.table.R b/R/data.table.R index 34e9958074..d1f6798100 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2526,6 +2526,18 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR } } +sort_by.data.table <- function(x, y, ...) +{ + if (!cedta()) return(NextMethod()) # nocov + if (inherits(y, "formula")) + y <- .formula2varlist(y, x) + if (!is.list(y)) + y <- list(y) + # use forder instead of base 'order' + o <- do.call(forder, c(unname(y), list(...))) + x[o, , drop=FALSE] +} + # TO DO, add more warnings e.g. 
for by.data.table(), telling user what the data.table syntax is but letting them dispatch to data.frame if they want copy = function(x) { diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 1c3c358a19..e4231b5fa8 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -21041,3 +21041,31 @@ test(2304.100, set(copy(DT), i=2L, j=c("L1", "L2"), value=list(list(NULL), list( # the integer overflow in #6729 is only noticeable with UBSan test(2305, { fread(testDir("issue_6729.txt.bz2")); TRUE }) + +if (exists("sort_by", "package:base")) { + # sort_by.data.table + DT1 = data.table(a=c(1, 3, 2, NA, 3), b=4:0) + DT2 = data.table(a=c("c", "a", "B")) # data.table uses C-locale and should sort_by if cedta() + DT3 = data.table(a=c(1, 2, 3), b=list(c("a", "b", "", NA), c(1, 3, 2, 0), c(TRUE, TRUE, FALSE, NA))) # list column + + # sort_by.data.table: basics + test(2306.01, sort_by(DT1, ~a + b), data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L))) + test(2306.02, sort_by(DT1, ~I(a + b)), data.table(a=c(3, 2, 1, 3, NA), b=c(0L, 2L, 4L, 3L, 1L))) + test(2306.03, sort_by(DT2, ~a), data.table(a=c("B", "a", "c"))) + + # sort_by.data.table: list columns. + # NOTE 1: .formula2varlist works well with list columns. + # NOTE 2: 4 elem in DT of 3 row because forderv takes a list column as a DT. + test(2306.04, sort_by(DT3, ~b), DT3[order(b)]) # should be consistent. 
+ + # sort_by.data.table: additional C-locale sorting + test(2306.10, DT2[, sort_by(.SD, a)], data.table(a=c("B", "a", "c"))) + test(2306.11, DT2[, sort_by(.SD, ~a)], data.table(a=c("B", "a", "c"))) + + # sort_by.data.table: various working interfaces + test(2306.20, sort_by(DT1, list(DT1$a, DT1$b)), data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L))) + test(2306.21, sort_by(DT1, DT1[, .(a, b)]), data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L))) + test(2306.22, DT1[, sort_by(.SD, .(a, b))], data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L))) + test(2306.23, DT1[, sort_by(.SD, ~a + b)], data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L))) + test(2306.24, DT1[, sort_by(.SD, ~.(a, b))], data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L))) +} diff --git a/man/setorder.Rd b/man/setorder.Rd index e1cdc40bba..e862567cbd 100644 --- a/man/setorder.Rd +++ b/man/setorder.Rd @@ -5,6 +5,7 @@ \alias{fastorder} \alias{forder} \alias{forderv} +\alias{sort_by} \title{Fast row reordering of a data.table by reference} \description{ @@ -32,6 +33,7 @@ setorderv(x, cols = colnames(x), order=1L, na.last=FALSE) # optimised to use data.table's internal fast order # x[order(., na.last=TRUE)] # x[order(., decreasing=TRUE)] +# sort_by(x, ., na.last=TRUE, decreasing=FALSE) # R >= 4.4.0 } \arguments{ \item{x}{ A \code{data.table}. } @@ -46,7 +48,7 @@ when \code{b} is of type \code{character} as well. } \code{order} must be either \code{1} or equal to that of \code{cols}. If \code{length(order) == 1}, it is recycled to \code{length(cols)}. } \item{na.last}{ \code{logical}. If \code{TRUE}, missing values in the data are placed last; if \code{FALSE}, they are placed first; if \code{NA} they are removed. -\code{na.last=NA} is valid only for \code{x[order(., na.last)]} and its +\code{na.last=NA} is valid only for \code{x[order(., na.last)]} and related \code{sort_by(x, .)} (\eqn{\R \ge 4.4.0}) and its default is \code{TRUE}. 
\code{setorder} and \code{setorderv} only accept \code{TRUE}/\code{FALSE} with default \code{FALSE}. } } @@ -71,8 +73,8 @@ sets the \code{sorted} attribute. \code{na.last} argument, by default, is \code{FALSE} for \code{setorder} and \code{setorderv} to be consistent with \code{data.table}'s \code{setkey} and -is \code{TRUE} for \code{x[order(.)]} to be consistent with \code{base::order}. -Only \code{x[order(.)]} can have \code{na.last = NA} as it is a subset operation +is \code{TRUE} for \code{x[order(.)]} and \code{sort_by(x, .)} (\eqn{\R \ge 4.4.0}) to be consistent with \code{base::order}. +Only \code{x[order(.)]} (and related \code{sort_by(x, .)}) can have \code{na.last = NA} as it is a subset operation as opposed to \code{setorder} or \code{setorderv} which reorders the data.table by reference. @@ -96,6 +98,11 @@ was started in. By contrast, \code{"america" < "BRAZIL"} is always \code{FALSE} If \code{setorder} results in reordering of the rows of a keyed \code{data.table}, then its key will be set to \code{NULL}. + +Starting from \R 4.4.0, \code{sort_by(x, y, \dots)} is the S3 method for the generic \code{sort_by} for \code{data.table}s. +It uses the same formula or list interfaces as data.frame's \code{sort_by} but internally uses \code{data.table}'s fast ordering, +hence it behaves the same as \code{x[order(.)]} and takes the same optional named arguments and their defaults. + } \value{ The input is modified by reference, and returned (invisibly) so it can be used