Skip to content

Commit 3aee430

Browse files
add sort_by.data.table (#6679)
* add sort_by.data.table * style * add R >= 4.4.0 guard for sort_by method * Tidy up NEWS * additional tests, test id -> 2301. * man entry on setorder.Rd * style * style * only run tests if sort_by exists * clarify sort_by R>=4.4.0 --------- Co-authored-by: Michael Chirico <[email protected]> Co-authored-by: Michael Chirico <[email protected]>
1 parent 5a04ba8 commit 3aee430

File tree

5 files changed

+55
-3
lines changed

5 files changed

+55
-3
lines changed

NAMESPACE

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,3 +206,6 @@ S3method(format_list_item, data.frame)
206206

207207
export(fdroplevels, setdroplevels)
208208
S3method(droplevels, data.table)
209+
210+
# sort_by added in R 4.4.0, #6662, https://stat.ethz.ch/pipermail/r-announce/2024/000701.html
211+
if (getRversion() >= "4.4.0") S3method(sort_by, data.table)

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22

33
# data.table [v1.17.99](https://github.com/Rdatatable/data.table/milestone/35) (in development)
44

5+
## NEW FEATURES
56

7+
1. New `sort_by()` method for data.tables, [#6662](https://github.com/Rdatatable/data.table/issues/6662). It uses `forder()` to improve upon the data.frame method and also match `DT[order(...)]` behavior with respect to locale. Thanks @rikivillalba for the suggestion and PR.
68

79
# data.table [v1.17.0](https://github.com/Rdatatable/data.table/milestone/34) (20 Feb 2025)
810

R/data.table.R

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2526,6 +2526,18 @@ split.data.table = function(x, f, drop = FALSE, by, sorted = FALSE, keep.by = TR
25262526
}
25272527
}
25282528

2529+
# S3 method of the sort_by() generic for data.table objects.
#   x:   the data.table whose rows are reordered.
#   y:   the sort specification: a formula, a list (or data.table) of sort
#        keys, or a single vector.
#   ...: further arguments passed on to forder() together with the sort keys.
# Returns x subset by the row permutation that forder() computes.
sort_by.data.table <- function(x, y, ...)
2530+
{
2531+
# Defer to the next sort_by() method when cedta() is FALSE
# (presumably: the calling environment is not data.table-aware -- confirm against cedta()).
if (!cedta()) return(NextMethod()) # nocov
2532+
# A formula is converted into a list of sort variables, with x supplied as the data.
if (inherits(y, "formula"))
2533+
y <- .formula2varlist(y, x)
2534+
# A single non-list key is wrapped so that do.call() below always receives a list of keys.
if (!is.list(y))
2535+
y <- list(y)
2536+
# Compute the row order with data.table's forder() instead of base 'order'.
# The keys are passed unnamed -- presumably so that names in y cannot be matched
# against forder()'s own named arguments (TODO confirm); '...' is appended as-is.
2537+
o <- do.call(forder, c(unname(y), list(...)))
2538+
# Return the rows of x in the computed order.
x[o, , drop=FALSE]
2539+
}
2540+
25292541
# TO DO, add more warnings e.g. for by.data.table(), telling user what the data.table syntax is but letting them dispatch to data.frame if they want
25302542

25312543
copy = function(x) {

inst/tests/tests.Rraw

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21041,3 +21041,31 @@ test(2304.100, set(copy(DT), i=2L, j=c("L1", "L2"), value=list(list(NULL), list(
2104121041

2104221042
# the integer overflow in #6729 is only noticeable with UBSan
2104321043
test(2305, { fread(testDir("issue_6729.txt.bz2")); TRUE })
21044+
21045+
# Tests for the sort_by() method for data.table.
# They run only when base provides a sort_by() generic to dispatch on.
if (exists("sort_by", "package:base")) {
21046+
# sort_by.data.table: fixtures shared by the tests below
21047+
DT1 = data.table(a=c(1, 3, 2, NA, 3), b=4:0)
21048+
DT2 = data.table(a=c("c", "a", "B")) # mixed-case strings: the tests below expect C-locale order ("B" before "a") from sort_by when cedta()
21049+
DT3 = data.table(a=c(1, 2, 3), b=list(c("a", "b", "", NA), c(1, 3, 2, 0), c(TRUE, TRUE, FALSE, NA))) # b is a list column with 3 elements of length 4
21050+
21051+
# sort_by.data.table: basics
# (2306.01: by a then b, with the NA in a expected last; 2306.02: by the computed sum a + b; 2306.03: C-locale order of strings)
21052+
test(2306.01, sort_by(DT1, ~a + b), data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L)))
21053+
test(2306.02, sort_by(DT1, ~I(a + b)), data.table(a=c(3, 2, 1, 3, NA), b=c(0L, 2L, 4L, 3L, 1L)))
21054+
test(2306.03, sort_by(DT2, ~a), data.table(a=c("B", "a", "c")))
21055+
21056+
# sort_by.data.table: list columns.
21057+
# NOTE 1: .formula2varlist works well with list columns.
21058+
# NOTE 2: each element of b has 4 entries although DT3 has 3 rows; per the original note this is because forderv takes a list column as a DT (TODO confirm).
21059+
test(2306.04, sort_by(DT3, ~b), DT3[order(b)]) # sort_by should give the same result as DT[order(...)].
21060+
21061+
# sort_by.data.table: additional C-locale sorting, calling sort_by on .SD inside DT[, j]
21062+
test(2306.10, DT2[, sort_by(.SD, a)], data.table(a=c("B", "a", "c")))
21063+
test(2306.11, DT2[, sort_by(.SD, ~a)], data.table(a=c("B", "a", "c")))
21064+
21065+
# sort_by.data.table: various working interfaces
# (plain list, data.table, .() inside j, formula inside j, formula wrapping .()); all expect the same result as 2306.01
21066+
test(2306.20, sort_by(DT1, list(DT1$a, DT1$b)), data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L)))
21067+
test(2306.21, sort_by(DT1, DT1[, .(a, b)]), data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L)))
21068+
test(2306.22, DT1[, sort_by(.SD, .(a, b))], data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L)))
21069+
test(2306.23, DT1[, sort_by(.SD, ~a + b)], data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L)))
21070+
test(2306.24, DT1[, sort_by(.SD, ~.(a, b))], data.table(a=c(1, 2, 3, 3, NA), b=c(4L, 2L, 0L, 3L, 1L)))
21071+
}

man/setorder.Rd

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
\alias{fastorder}
66
\alias{forder}
77
\alias{forderv}
8+
\alias{sort_by}
89

910
\title{Fast row reordering of a data.table by reference}
1011
\description{
@@ -32,6 +33,7 @@ setorderv(x, cols = colnames(x), order=1L, na.last=FALSE)
3233
# optimised to use data.table's internal fast order
3334
# x[order(., na.last=TRUE)]
3435
# x[order(., decreasing=TRUE)]
36+
# sort_by(x, ., na.last=TRUE, decreasing=FALSE) # R >= 4.4.0
3537
}
3638
\arguments{
3739
\item{x}{ A \code{data.table}. }
@@ -46,7 +48,7 @@ when \code{b} is of type \code{character} as well. }
4648
\code{order} must be either \code{1} or equal to that of \code{cols}. If
4749
\code{length(order) == 1}, it is recycled to \code{length(cols)}. }
4850
\item{na.last}{ \code{logical}. If \code{TRUE}, missing values in the data are placed last; if \code{FALSE}, they are placed first; if \code{NA} they are removed.
49-
\code{na.last=NA} is valid only for \code{x[order(., na.last)]} and its
51+
\code{na.last=NA} is valid only for \code{x[order(., na.last)]} and related \code{sort_by(x, .)} (\R \eqn{\ge}{>=} 4.4.0) and its
5052
default is \code{TRUE}. \code{setorder} and \code{setorderv} only accept
5153
\code{TRUE}/\code{FALSE} with default \code{FALSE}. }
5254
}
@@ -71,8 +73,8 @@ sets the \code{sorted} attribute.
7173

7274
\code{na.last} argument, by default, is \code{FALSE} for \code{setorder} and
7375
\code{setorderv} to be consistent with \code{data.table}'s \code{setkey} and
74-
is \code{TRUE} for \code{x[order(.)]} to be consistent with \code{base::order}.
75-
Only \code{x[order(.)]} can have \code{na.last = NA} as it is a subset operation
76+
is \code{TRUE} for \code{x[order(.)]} and \code{sort_by(x, .)} (\R \eqn{\ge}{>=} 4.4.0) to be consistent with \code{base::order}.
77+
Only \code{x[order(.)]} (and related \code{sort_by(x, .)}) can have \code{na.last = NA} as it is a subset operation
7678
as opposed to \code{setorder} or \code{setorderv} which reorders the data.table
7779
by reference.
7880
@@ -96,6 +98,11 @@ was started in. By contrast, \code{"america" < "BRAZIL"} is always \code{FALSE}
9698
9799
If \code{setorder} results in reordering of the rows of a keyed \code{data.table},
98100
then its key will be set to \code{NULL}.
101+
102+
Starting from \R 4.4.0, \code{sort_by(x, y, \dots)} is the S3 method of the generic \code{sort_by} for \code{data.table}s.
103+
It uses the same formula or list interfaces as data.frame's \code{sort_by} but internally uses \code{data.table}'s fast ordering,
104+
hence it behaves the same as \code{x[order(.)]} and takes the same optional named arguments and their defaults.
105+
99106
}
100107
\value{
101108
The input is modified by reference, and returned (invisibly) so it can be used

0 commit comments

Comments
 (0)