Skip to content

Commit dccca8f

Browse files
Merge branch 'master' into issue_2606
2 parents 597a5b3 + 67670e9 commit dccca8f

File tree

19 files changed

+1750
-27
lines changed

19 files changed

+1750
-27
lines changed

.ci/.lintr.R

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ linters = c(dt_linters, all_linters(
2121
message = "Use messagef to avoid fragmented translations.",
2222
warning = "Use warningf to avoid fragmented translations.",
2323
stop = "Use stopf to avoid fragmented translations.",
24+
rev = "Use frev internally, or setfrev if by-reference is safe.",
2425
NULL
2526
)),
2627
# undesirable_function_linter(modify_defaults(

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ export(nafill)
5959
export(setnafill)
6060
export(.Last.updated)
6161
export(fcoalesce)
62+
export(mergelist, setmergelist)
6263
export(cbindlist, setcbindlist)
6364
export(substitute2)
6465
#export(DT) # mtcars |> DT(i,j,by) #4872 #5472
@@ -208,6 +209,7 @@ S3method(format_list_item, data.frame)
208209

209210
export(fdroplevels, setdroplevels)
210211
S3method(droplevels, data.table)
212+
export(frev)
211213

212214
# sort_by added in R 4.4.0, #6662, https://stat.ethz.ch/pipermail/r-announce/2024/000701.html
213215
if (getRversion() >= "4.4.0") S3method(sort_by, data.table)

NEWS.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,13 @@
4646

4747
10. `data.table()` and `as.data.table()` with `keep.rownames=TRUE` now extract row names from named vectors, matching `data.frame()` behavior. Names from the first named vector in the input are used to create the row names column (default name `"rn"` or custom name via `keep.rownames="column_name"`), [#1916](https://github.com/Rdatatable/data.table/issues/1916). Thanks to @richierocks for the feature request and @Mukulyadav2004 for the implementation.
4848

49-
11. `tables()` now supports a `recursive=TRUE` argument to detect `data.table` objects nested within plain lists, such as those produced by `split()` or manual list construction, [#2606](https://github.com/Rdatatable/data.table/issues/2606). The recursive search skips data.frame and data.table objects to avoid descending into list-columns. Nested data.tables are reported with intuitive R-like names using $ and [[ ]] notation. Thanks to @MichaelChirico for the suggestion and @venom1204 for the implementation.
49+
11. New `frev(x)` as a faster analogue to `base::rev()` for atomic vectors/lists, [#5885](https://github.com/Rdatatable/data.table/issues/5885). It is twice as fast as `base::rev()` on large inputs, and faster still with more threads. Thanks to Benjamin Schwendinger for suggesting and implementing.
50+
51+
12. New `cbindlist()` and `setcbindlist()` for concatenating a `list` of data.tables column-wise, evocative of the analogous `do.call(rbind, l)` <-> `rbindlist(l)`, [#2576](https://github.com/Rdatatable/data.table/issues/2576). `setcbindlist()` does so without making any copies. Thanks @MichaelChirico for the FR, @jangorecki for the PR, and @MichaelChirico for extensive reviews and fine-tuning.
52+
53+
13. New `mergelist()` and `setmergelist()` similarly work _a la_ `Reduce()` to recursively merge a `list` of data.tables, [#599](https://github.com/Rdatatable/data.table/issues/599). Different join modes (_left_, _inner_, _full_, _right_, _semi_, _anti_, and _cross_) are supported through the `how` argument; duplicate handling goes through the `mult` argument. `setmergelist()` carefully avoids copies where one is not needed, e.g. in a 1:1 left join. Thanks Patrick Nicholson for the FR (in 2013!), @jangorecki for the PR, and @MichaelChirico for extensive reviews and fine-tuning.
54+
55+
14. `tables()` now supports a `recursive=TRUE` argument to detect `data.table` objects nested within plain lists, such as those produced by `split()` or manual list construction, [#2606](https://github.com/Rdatatable/data.table/issues/2606). The recursive search skips data.frame and data.table objects to avoid descending into list-columns. Nested data.tables are reported with intuitive R-like names using $ and [[ ]] notation. Thanks to @MichaelChirico for the suggestion and @venom1204 for the implementation.
5056

5157
### BUG FIXES
5258

R/as.data.table.R

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,11 @@ as.data.table.table = function(x, keep.rownames=FALSE, key=NULL, ...) {
3636
# prevent #4179 & just cut out here
3737
if (any(dim(x) == 0L)) return(null.data.table())
3838
# Fix for bug #43 - order of columns are different when doing as.data.table(with(DT, table(x, y)))
39-
val = rev(dimnames(provideDimnames(x)))
39+
val = frev(dimnames(provideDimnames(x)))
4040
if (is.null(names(val)) || !any(nzchar(names(val))))
41-
setattr(val, 'names', paste0("V", rev(seq_along(val))))
41+
setattr(val, 'names', paste0("V", frev(seq_along(val))))
4242
ans = data.table(do.call(CJ, c(val, sorted=FALSE)), N = as.vector(x), key=key)
43-
setcolorder(ans, c(rev(head(names(ans), -1L)), "N"))
43+
setcolorder(ans, c(frev(head(names(ans), -1L)), "N"))
4444
ans
4545
}
4646

@@ -104,18 +104,18 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va
104104
dnx[nulldnx] = lapply(dx[nulldnx], seq_len) #3636
105105
dnx
106106
} else dnx
107-
val = rev(val)
107+
setfrev(val)
108108
if (is.null(names(val)) || !any(nzchar(names(val))))
109-
setattr(val, 'names', paste0("V", rev(seq_along(val))))
109+
setattr(val, 'names', paste0("V", frev(seq_along(val))))
110110
if (value.name %chin% names(val))
111-
stopf("Argument 'value.name' should not overlap with column names in result: %s", brackify(rev(names(val))))
111+
stopf("Argument 'value.name' should not overlap with column names in result: %s", brackify(frev(names(val))))
112112
N = NULL
113113
ans = do.call(CJ, c(val, sorted=FALSE))
114114
set(ans, j="N", value=as.vector(x))
115115
if (isTRUE(na.rm))
116116
ans = ans[!is.na(N)]
117117
setnames(ans, "N", value.name)
118-
dims = rev(head(names(ans), -1L))
118+
dims = frev(head(names(ans), -1L))
119119
setcolorder(ans, c(dims, value.name))
120120
if (isTRUE(sorted) && is.null(key)) key = dims
121121
setkeyv(ans, key)

R/bmerge.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
110110
}
111111
if (x_merge_type=="integer64" || i_merge_type=="integer64") {
112112
nm = c(iname, xname)
113-
if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; nm=rev(nm) } # w is which to coerce
113+
if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; setfrev(nm) } # w is which to coerce
114114
if (wclass=="integer" || (wclass=="double" && fitsInInt64(w[[wc]]))) {
115115
from_detail = if (wclass == "double") gettext(" (which has integer64 representation, e.g. no fractions)") else ""
116116
coerce_col(w, wc, wclass, "integer64", nm[1L], nm[2L], from_detail, verbose=verbose)

R/cedta.R

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11

2-
cedta.override = NULL # If no need arises, will deprecate.
3-
42
cedta.pkgEvalsUserCode = c("gWidgetsWWW","statET","FastRWeb","slidify","rmarkdown","knitr","ezknitr","IRkernel", "rtvs")
53
# These packages run user code in their own environment and thus do not
64
# themselves Depend or Import data.table. knitr's eval is passed envir=globalenv() so doesn't
@@ -72,7 +70,6 @@ cedta = function(n=2L) {
7270
(all(c("FUN", "X") %chin% ls(parent.frame(n))) ||
7371
.any_sd_queries_in_stack(sc))) ||
7472
(nsname %chin% cedta.pkgEvalsUserCode && .any_eval_calls_in_stack()) ||
75-
nsname %chin% cedta.override ||
7673
isTRUE(ns$.datatable.aware) || # As of Sep 2018: RCAS, caretEnsemble, dtplyr, rstanarm, rbokeh, CEMiTool, rqdatatable, RImmPort, BPRMeth, rlist
7774
tryCatch("data.table" %chin% get(".Depends",paste("package",nsname,sep=":"),inherits=FALSE),error=function(e)FALSE) # both ns$.Depends and get(.Depends,ns) are not sufficient
7875
if (!ans && getOption("datatable.verbose")) {

R/data.table.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ replace_dot_alias = function(e) {
221221
}
222222
return(x)
223223
}
224-
if (!mult %chin% c("first", "last", "all")) stopf("mult argument can only be 'first', 'last' or 'all'")
224+
if (!mult %chin% c("first", "last", "all", "error")) stopf("mult argument can only be 'first', 'last', 'all' or 'error'")
225225
missingroll = missing(roll)
226226
if (length(roll)!=1L || is.na(roll)) stopf("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'")
227227
if (is.character(roll)) {
@@ -520,6 +520,7 @@ replace_dot_alias = function(e) {
520520
}
521521
i = .shallow(i, retain.key = TRUE)
522522
ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose)
523+
if (mult == "error") mult = "all" ## error should have been raised inside bmerge() call above already, if it wasn't continue as mult="all"
523524
xo = ans$xo ## to make it available for further use.
524525
# temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this
525526
# 'setorder', as there's another 'setorder' in generating 'irows' below...

R/mergelist.R

Lines changed: 102 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ cbindlist_impl_ = function(l, copy) {
99
}
1010

1111
cbindlist = function(l) cbindlist_impl_(l, copy=TRUE)
12-
setcbindlist = function(l) cbindlist_impl_(l, copy=FALSE)
12+
setcbindlist = function(l) invisible(cbindlist_impl_(l, copy=FALSE))
1313

1414
# when 'on' is missing then use keys, used only for inner and full join
1515
onkeys = function(x, y) {
@@ -157,9 +157,9 @@ mergepair = function(lhs, rhs, on, how, mult, lhs.cols=names(lhs), rhs.cols=name
157157
stopf("'on' is missing and necessary key is not present")
158158
}
159159
if (any(bad.on <- !on %chin% names(lhs)))
160-
stopf("'on' argument specifies columns to join [%s] that are not present in %s table [%s]", brackify(on[bad.on]), "LHS", brackify(names(lhs)))
160+
stopf("'on' argument specifies columns to join %s that are not present in %s table %s", brackify(on[bad.on]), "LHS", brackify(names(lhs)))
161161
if (any(bad.on <- !on %chin% names(rhs)))
162-
stopf("'on' argument specifies columns to join [%s] that are not present in %s table [%s]", brackify(on[bad.on]), "RHS", brackify(names(rhs)))
162+
stopf("'on' argument specifies columns to join %s that are not present in %s table %s", brackify(on[bad.on]), "RHS", brackify(names(rhs)))
163163
} else if (is.null(on)) {
164164
on = character() ## cross join only
165165
}
@@ -203,7 +203,7 @@ mergepair = function(lhs, rhs, on, how, mult, lhs.cols=names(lhs), rhs.cols=name
203203
copy_x = TRUE
204204
## ensure no duplicated column names in merge results
205205
if (any(dup.i <- names(out.i) %chin% names(out.x)))
206-
stopf("merge result has duplicated column names [%s], use 'cols' argument or rename columns in 'l' tables", brackify(names(out.i)[dup.i]))
206+
stopf("merge result has duplicated column names %s, use 'cols' argument or rename columns in 'l' tables", brackify(names(out.i)[dup.i]))
207207
}
208208

209209
## stack i and x
@@ -257,6 +257,104 @@ mergepair = function(lhs, rhs, on, how, mult, lhs.cols=names(lhs), rhs.cols=name
257257
setDT(out)
258258
}
259259

260+
mergelist_impl_ = function(l, on, cols, how, mult, join.many, copy) {
261+
verbose = getOption("datatable.verbose")
262+
if (verbose)
263+
p = proc.time()[[3L]]
264+
265+
if (!is.list(l) || is.data.frame(l))
266+
stopf("'%s' must be a list", "l")
267+
if (!all(vapply_1b(l, is.data.table)))
268+
stopf("Every element of 'l' list must be data.table objects")
269+
if (!all(idx <- lengths(l) > 0L))
270+
stopf("Tables in 'l' must all have columns, but these entries have 0: %s", brackify(which(!idx)))
271+
if (any(idx <- vapply_1i(l, function(x) anyDuplicated(names(x))) > 0L))
272+
stopf("Column names in individual 'l' entries must be unique, but these have some duplicates: %s", brackify(which(idx)))
273+
274+
n = length(l)
275+
if (n < 2L) {
276+
out = if (n) l[[1L]] else as.data.table(l)
277+
if (copy) out = copy(out)
278+
if (verbose)
279+
catf("mergelist: merging %d table(s), took %.3fs\n", n, proc.time()[[3L]]-p)
280+
return(out)
281+
}
282+
283+
if (!is.list(join.many))
284+
join.many = rep(list(join.many), n - 1L)
285+
if (length(join.many) != n - 1L || !all(vapply_1b(join.many, isTRUEorFALSE)))
286+
stopf("'join.many' must be TRUE or FALSE, or a list of such whose length must be length(l)-1L")
287+
288+
if (missing(mult))
289+
mult = NULL
290+
if (!is.list(mult))
291+
mult = rep(list(mult), n - 1L)
292+
if (length(mult) != n - 1L || !all(vapply_1b(mult, function(x) is.null(x) || (is.character(x) && length(x) == 1L && !anyNA(x) && x %chin% c("error", "all", "first", "last")))))
293+
stopf("'mult' must be one of [error, all, first, last] or NULL, or a list of such whose length must be length(l)-1L")
294+
295+
if (!is.list(how))
296+
how = rep(list(how), n-1L)
297+
if (length(how)!=n-1L || !all(vapply_1b(how, function(x) is.character(x) && length(x)==1L && !anyNA(x) && x %chin% c("left", "inner", "full", "right", "semi", "anti", "cross"))))
298+
stopf("'how' must be one of [left, inner, full, right, semi, anti, cross], or a list of such whose length must be length(l)-1L")
299+
300+
if (is.null(cols)) {
301+
cols = vector("list", n)
302+
} else {
303+
if (!is.list(cols))
304+
stopf("'%s' must be a list", "cols")
305+
if (length(cols) != n)
306+
stopf("'cols' must be same length as 'l' (%d != %d)", length(cols), n)
307+
skip = vapply_1b(cols, is.null)
308+
if (!all(vapply_1b(cols[!skip], function(x) is.character(x) && !anyNA(x) && !anyDuplicated(x))))
309+
stopf("'cols' must be a list of non-zero length, non-NA, non-duplicated, character vectors, or eventually NULLs (all columns)")
310+
if (any(mapply(function(x, icols) !all(icols %chin% names(x)), l[!skip], cols[!skip])))
311+
stopf("'cols' specify columns not present in corresponding table")
312+
}
313+
314+
if (missing(on) || is.null(on)) {
315+
on = vector("list", n - 1L)
316+
} else {
317+
if (!is.list(on))
318+
on = rep(list(on), n - 1L)
319+
if (length(on) != n-1L || !all(vapply_1b(on, function(x) is.character(x) && !anyNA(x) && !anyDuplicated(x)))) ## length checked in dtmerge
320+
stopf("'on' must be non-NA, non-duplicated, character vector, or a list of such which length must be length(l)-1L")
321+
}
322+
323+
l.mem = lapply(l, vapply, address, "")
324+
out = l[[1L]]
325+
out.cols = cols[[1L]]
326+
for (join.i in seq_len(n - 1L)) {
327+
rhs.i = join.i + 1L
328+
out = mergepair(
329+
lhs = out, rhs = l[[rhs.i]],
330+
on = on[[join.i]],
331+
how = how[[join.i]], mult = mult[[join.i]],
332+
lhs.cols = out.cols, rhs.cols = cols[[rhs.i]],
333+
copy = FALSE, ## avoid any copies inside, will copy once below
334+
join.many = join.many[[join.i]],
335+
verbose = verbose
336+
)
337+
out.cols = copy(names(out))
338+
}
339+
out.mem = vapply_1c(out, address)
340+
if (copy)
341+
.Call(CcopyCols, out, colnamesInt(out, names(out.mem)[out.mem %chin% unique(unlist(l.mem, recursive=FALSE))]))
342+
if (verbose)
343+
catf("mergelist: merging %d tables, took %.3fs\n", n, proc.time()[[3L]] - p)
344+
out
345+
}
346+
347+
mergelist = function(l, on, cols=NULL, how=c("left", "inner", "full", "right", "semi", "anti", "cross"), mult, join.many=getOption("datatable.join.many")) {
348+
if (missing(how) || is.null(how))
349+
how = match.arg(how)
350+
mergelist_impl_(l, on, cols, how, mult, join.many, copy=TRUE)
351+
}
352+
setmergelist = function(l, on, cols=NULL, how=c("left", "inner", "full", "right", "semi", "anti", "cross"), mult, join.many=getOption("datatable.join.many")) {
353+
if (missing(how) || is.null(how))
354+
how = match.arg(how)
355+
invisible(mergelist_impl_(l, on, cols, how, mult, join.many, copy=FALSE))
356+
}
357+
260358
# Previously, we had a custom C implementation here, which is ~2x faster,
261359
# but this is fast enough we don't bother maintaining a new routine.
262360
# Hopefully in the future rep() can recognize the ALTREP and use that, too.

R/utils.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ which.last = function(x)
8686
if (!is.logical(x)) {
8787
stopf("x not boolean")
8888
}
89-
length(x) - match(TRUE, rev(x)) + 1L
89+
length(x) - match(TRUE, frev(x)) + 1L
9090
}
9191

9292
require_bit64_if_needed = function(DT) {
@@ -226,7 +226,7 @@ fctr = function(x, levels=unique(x), ..., sort=FALSE, rev=FALSE) {
226226
if (!isTRUEorFALSE(rev))
227227
stopf("argument 'rev' must be TRUE or FALSE")
228228
if (sort) levels = sort(levels)
229-
if (rev) levels = rev(levels)
229+
if (rev) levels = frev(levels)
230230
factor(x, levels=levels, ...)
231231
}
232232

R/wrappers.R

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,6 @@ fitsInInt32 = function(x) .Call(CfitsInInt32R, x)
2121
fitsInInt64 = function(x) .Call(CfitsInInt64R, x)
2222

2323
coerceAs = function(x, as, copy=TRUE) .Call(CcoerceAs, x, as, copy)
24+
25+
frev = function(x) .Call(Cfrev, x, TRUE)
26+
setfrev = function(x) invisible(.Call(Cfrev, x, FALSE))

0 commit comments

Comments
 (0)