Skip to content

Commit 8e26521

Browse files
committed
Merge branch 'DataCodeIntegration' of https://github.com/Rdatatable/data.table into DataCodeIntegration
2 parents 83b0de0 + 0223a04 commit 8e26521

31 files changed

+2190
-272
lines changed

.ci/.lintr.R

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ linters = c(dt_linters, all_linters(
2121
message = "Use messagef to avoid fragmented translations.",
2222
warning = "Use warningf to avoid fragmented translations.",
2323
stop = "Use stopf to avoid fragmented translations.",
24+
rev = "Use frev internally, or setfrev if by-reference is safe.",
2425
NULL
2526
)),
2627
# undesirable_function_linter(modify_defaults(

.ci/atime/tests.R

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,5 +277,14 @@ test.list <- atime::atime_test_list(
277277
Slow = "73d79edf8ff8c55163e90631072192301056e336", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/8397dc3c993b61a07a81c786ca68c22bc589befc)
278278
Fast = "8397dc3c993b61a07a81c786ca68c22bc589befc"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7019/commits) that removes inefficiency
279279

280+
"isoweek improved in #7144" = atime::atime_test(
281+
setup = {
282+
set.seed(349)
283+
x = sample(Sys.Date() - 0:5000, N, replace=TRUE)
284+
},
285+
expr = data.table::isoweek(x),
286+
Slow = "548410d23dd74b625e8ea9aeb1a5d2e9dddd2927", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/548410d23dd74b625e8ea9aeb1a5d2e9dddd2927)
287+
Fast = "c0b32a60466bed0e63420ec105bc75c34590865e"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7144/commits) that uses a much faster implementation
288+
280289
tests=extra.test.list)
281290
# nolint end: undesirable_operator_linter.

NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ export(nafill)
5959
export(setnafill)
6060
export(.Last.updated)
6161
export(fcoalesce)
62+
export(mergelist, setmergelist)
6263
export(cbindlist, setcbindlist)
6364
export(substitute2)
6465
#export(DT) # mtcars |> DT(i,j,by) #4872 #5472
@@ -208,6 +209,7 @@ S3method(format_list_item, data.frame)
208209

209210
export(fdroplevels, setdroplevels)
210211
S3method(droplevels, data.table)
212+
export(frev)
211213

212214
# sort_by added in R 4.4.0, #6662, https://stat.ethz.ch/pipermail/r-announce/2024/000701.html
213215
if (getRversion() >= "4.4.0") S3method(sort_by, data.table)

NEWS.md

Lines changed: 63 additions & 31 deletions
Large diffs are not rendered by default.

R/IDateTime.R

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -342,19 +342,20 @@ yday = function(x) convertDate(as.IDate(x), "yday")
342342
wday = function(x) convertDate(as.IDate(x), "wday")
343343
mday = function(x) convertDate(as.IDate(x), "mday")
344344
week = function(x) convertDate(as.IDate(x), "week")
345-
isoweek = function(x) {
345+
# TODO(#3279): Investigate if improved as.IDate() makes our below implementation faster than this
346+
isoweek = function(x) as.integer(format(as.IDate(x), "%V"))
346347
# ISO 8601-conformant week, as described at
347348
# https://en.wikipedia.org/wiki/ISO_week_date
348349
# Approach:
349350
# * Find nearest Thursday to each element of x
350351
# * Find the number of weeks having passed between
351352
# January 1st of the year of the nearest Thursdays and x
352353

353-
x = as.IDate(x) # number of days since 1 Jan 1970 (a Thurs)
354-
nearest_thurs = as.IDate(7L * (as.integer(x + 3L) %/% 7L))
355-
year_start = as.IDate(format(nearest_thurs, '%Y-01-01'))
356-
1L + (nearest_thurs - year_start) %/% 7L
357-
}
354+
# x = as.IDate(x) # number of days since 1 Jan 1970 (a Thurs)
355+
# nearest_thurs = as.IDate(7L * (as.integer(x + 3L) %/% 7L))
356+
# year_start = as.IDate(format(nearest_thurs, '%Y-01-01'))
357+
# 1L + (nearest_thurs - year_start) %/% 7L
358+
358359

359360
month = function(x) convertDate(as.IDate(x), "month")
360361
quarter = function(x) convertDate(as.IDate(x), "quarter")

R/as.data.table.R

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,18 +36,21 @@ as.data.table.table = function(x, keep.rownames=FALSE, key=NULL, ...) {
3636
# prevent #4179 & just cut out here
3737
if (any(dim(x) == 0L)) return(null.data.table())
3838
# Fix for bug #43 - order of columns are different when doing as.data.table(with(DT, table(x, y)))
39-
val = rev(dimnames(provideDimnames(x)))
39+
val = frev(dimnames(provideDimnames(x)))
4040
if (is.null(names(val)) || !any(nzchar(names(val))))
41-
setattr(val, 'names', paste0("V", rev(seq_along(val))))
41+
setattr(val, 'names', paste0("V", frev(seq_along(val))))
4242
ans = data.table(do.call(CJ, c(val, sorted=FALSE)), N = as.vector(x), key=key)
43-
setcolorder(ans, c(rev(head(names(ans), -1L)), "N"))
43+
setcolorder(ans, c(frev(head(names(ans), -1L)), "N"))
4444
ans
4545
}
4646

4747
as.data.table.matrix = function(x, keep.rownames=FALSE, key=NULL, ...) {
4848
if (!identical(keep.rownames, FALSE)) {
4949
# can specify col name to keep.rownames, #575
5050
ans = data.table(rn=rownames(x), x, keep.rownames=FALSE)
51+
# auto-inferred name 'x' is not back-compatible & inconsistent, #7145
52+
if (ncol(x) == 1L && is.null(colnames(x)))
53+
setnames(ans, 'x', 'V1', skip_absent=TRUE)
5154
if (is.character(keep.rownames))
5255
setnames(ans, 'rn', keep.rownames[1L])
5356
return(ans)
@@ -101,18 +104,18 @@ as.data.table.array = function(x, keep.rownames=FALSE, key=NULL, sorted=TRUE, va
101104
dnx[nulldnx] = lapply(dx[nulldnx], seq_len) #3636
102105
dnx
103106
} else dnx
104-
val = rev(val)
107+
setfrev(val)
105108
if (is.null(names(val)) || !any(nzchar(names(val))))
106-
setattr(val, 'names', paste0("V", rev(seq_along(val))))
109+
setattr(val, 'names', paste0("V", frev(seq_along(val))))
107110
if (value.name %chin% names(val))
108-
stopf("Argument 'value.name' should not overlap with column names in result: %s", brackify(rev(names(val))))
111+
stopf("Argument 'value.name' should not overlap with column names in result: %s", brackify(frev(names(val))))
109112
N = NULL
110113
ans = do.call(CJ, c(val, sorted=FALSE))
111114
set(ans, j="N", value=as.vector(x))
112115
if (isTRUE(na.rm))
113116
ans = ans[!is.na(N)]
114117
setnames(ans, "N", value.name)
115-
dims = rev(head(names(ans), -1L))
118+
dims = frev(head(names(ans), -1L))
116119
setcolorder(ans, c(dims, value.name))
117120
if (isTRUE(sorted) && is.null(key)) key = dims
118121
setkeyv(ans, key)
@@ -133,16 +136,33 @@ as.data.table.list = function(x,
133136
missing.check.names = missing(check.names)
134137
origListNames = if (missing(.named)) names(x) else NULL # as.data.table called directly, not from inside data.table() which provides .named, #3854
135138
empty_atomic = FALSE
139+
140+
# Handle keep.rownames for vectors (mimicking data.frame behavior)
141+
rownames_ = NULL
142+
check_rownames = !isFALSE(keep.rownames)
143+
136144
for (i in seq_len(n)) {
137145
xi = x[[i]]
138146
if (is.null(xi)) next # eachncol already initialized to 0 by integer() above
147+
if (check_rownames && is.null(rownames_)) {
148+
if (is.null(dim(xi))) {
149+
if (!is.null(nm <- names(xi))) {
150+
rownames_ = nm
151+
x[[i]] = unname(xi)
152+
}
153+
} else {
154+
if (!is.null(nm <- rownames(xi))) {
155+
rownames_ = nm
156+
}
157+
}
158+
}
139159
if (!is.null(dim(xi)) && missing.check.names) check.names=TRUE
140160
if ("POSIXlt" %chin% class(xi)) {
141161
warningf("POSIXlt column type detected and converted to POSIXct. We do not recommend use of POSIXlt at all because it uses 40 bytes to store one date.")
142162
xi = x[[i]] = as.POSIXct(xi)
143163
} else if (is.matrix(xi) || is.data.frame(xi)) {
144164
if (!is.data.table(xi)) {
145-
if (is.matrix(xi) && NCOL(xi)<=1L && is.null(colnames(xi))) { # 1 column matrix naming #4124
165+
if (is.matrix(xi) && NCOL(xi)==1L && is.null(colnames(xi)) && isFALSE(getOption('datatable.old.matrix.autoname'))) { # 1 column matrix naming #4124
146166
xi = x[[i]] = c(xi)
147167
} else {
148168
xi = x[[i]] = as.data.table(xi, keep.rownames=keep.rownames) # we will never allow a matrix to be a column; always unpack the columns
@@ -200,6 +220,18 @@ as.data.table.list = function(x,
200220
}
201221
if (any(vnames==".SD")) stopf("A column may not be called .SD. That has special meaning.")
202222
if (check.names) vnames = make.names(vnames, unique=TRUE)
223+
224+
# Add rownames column when vector names were found
225+
if (!is.null(rownames_)) {
226+
rn_name = if (is.character(keep.rownames)) keep.rownames[1L] else "rn"
227+
if (!is.na(idx <- chmatch(rn_name, vnames)[1L])) {
228+
ans = c(list(ans[[idx]]), ans[-idx])
229+
vnames = c(vnames[idx], vnames[-idx])
230+
} else {
231+
ans = c(list(recycle(rownames_, nrow)), ans)
232+
vnames = c(rn_name, vnames)
233+
}
234+
}
203235
setattr(ans, "names", vnames)
204236
setDT(ans, key=key) # copy ensured above; also, setDT handles naming
205237
if (length(origListNames)==length(ans)) setattr(ans, "names", origListNames) # PR 3854 and tests 2058.15-17

R/bmerge.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ bmerge = function(i, x, icols, xcols, roll, rollends, nomatch, mult, ops, verbos
110110
}
111111
if (x_merge_type=="integer64" || i_merge_type=="integer64") {
112112
nm = c(iname, xname)
113-
if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; nm=rev(nm) } # w is which to coerce
113+
if (x_merge_type=="integer64") { w=i; wc=icol; wclass=i_merge_type; } else { w=x; wc=xcol; wclass=x_merge_type; setfrev(nm) } # w is which to coerce
114114
if (wclass=="integer" || (wclass=="double" && fitsInInt64(w[[wc]]))) {
115115
from_detail = if (wclass == "double") gettext(" (which has integer64 representation, e.g. no fractions)") else ""
116116
coerce_col(w, wc, wclass, "integer64", nm[1L], nm[2L], from_detail, verbose=verbose)

R/cedta.R

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11

2-
cedta.override = NULL # If no need arises, will deprecate.
3-
42
cedta.pkgEvalsUserCode = c("gWidgetsWWW","statET","FastRWeb","slidify","rmarkdown","knitr","ezknitr","IRkernel", "rtvs")
53
# These packages run user code in their own environment and thus do not
64
# themselves Depend or Import data.table. knitr's eval is passed envir=globalenv() so doesn't
@@ -39,6 +37,16 @@ cedta.pkgEvalsUserCode = c("gWidgetsWWW","statET","FastRWeb","slidify","rmarkdow
3937
}
4038
# nocov end
4139

40+
.any_sd_queries_in_stack = function(calls) {
41+
for (ii in length(calls):1) { # nolint: seq_linter. As above.
42+
if (!calls[[ii]] %iscall% "[") next
43+
the_lhs = calls[[ii]][[2L]]
44+
if (!is.name(the_lhs) || the_lhs != ".SD") next
45+
return(TRUE)
46+
}
47+
FALSE
48+
}
49+
4250
# cedta = Calling Environment Data.Table-Aware
4351
cedta = function(n=2L) {
4452
# Calling Environment Data Table Aware
@@ -52,14 +60,16 @@ cedta = function(n=2L) {
5260
return(TRUE)
5361
}
5462
nsname = getNamespaceName(ns)
63+
sc = sys.calls()
5564
ans = nsname=="data.table" ||
5665
"data.table" %chin% names(getNamespaceImports(ns)) || # most common and recommended cases first for speed
5766
(nsname=="utils" &&
5867
(exists("debugger.look", parent.frame(n+1L)) ||
59-
(length(sc<-sys.calls())>=8L && sc[[length(sc)-7L]] %iscall% 'example')) ) || # 'example' for #2972
60-
(nsname=="base" && all(c("FUN", "X") %chin% ls(parent.frame(n)))) || # lapply
68+
(length(sc)>=8L && sc[[length(sc)-7L]] %iscall% 'example')) ) || # 'example' for #2972
69+
(nsname=="base" && # lapply
70+
(all(c("FUN", "X") %chin% ls(parent.frame(n))) ||
71+
.any_sd_queries_in_stack(sc))) ||
6172
(nsname %chin% cedta.pkgEvalsUserCode && .any_eval_calls_in_stack()) ||
62-
nsname %chin% cedta.override ||
6373
isTRUE(ns$.datatable.aware) || # As of Sep 2018: RCAS, caretEnsemble, dtplyr, rstanarm, rbokeh, CEMiTool, rqdatatable, RImmPort, BPRMeth, rlist
6474
tryCatch("data.table" %chin% get(".Depends",paste("package",nsname,sep=":"),inherits=FALSE),error=function(e)FALSE) # both ns$.Depends and get(.Depends,ns) are not sufficient
6575
if (!ans && getOption("datatable.verbose")) {

R/data.table.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ replace_dot_alias = function(e) {
221221
}
222222
return(x)
223223
}
224-
if (!mult %chin% c("first", "last", "all")) stopf("mult argument can only be 'first', 'last' or 'all'")
224+
if (!mult %chin% c("first", "last", "all", "error")) stopf("mult argument can only be 'first', 'last', 'all' or 'error'")
225225
missingroll = missing(roll)
226226
if (length(roll)!=1L || is.na(roll)) stopf("roll must be a single TRUE, FALSE, positive/negative integer/double including +Inf and -Inf or 'nearest'")
227227
if (is.character(roll)) {
@@ -520,6 +520,7 @@ replace_dot_alias = function(e) {
520520
}
521521
i = .shallow(i, retain.key = TRUE)
522522
ans = bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult, ops, verbose=verbose)
523+
if (mult == "error") mult = "all" ## error should have been raised inside bmerge() call above already, if it wasn't continue as mult="all"
523524
xo = ans$xo ## to make it available for further use.
524525
# temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this
525526
# 'setorder', as there's another 'setorder' in generating 'irows' below...

R/mergelist.R

Lines changed: 102 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ cbindlist_impl_ = function(l, copy) {
99
}
1010

1111
cbindlist = function(l) cbindlist_impl_(l, copy=TRUE)
12-
setcbindlist = function(l) cbindlist_impl_(l, copy=FALSE)
12+
setcbindlist = function(l) invisible(cbindlist_impl_(l, copy=FALSE))
1313

1414
# when 'on' is missing then use keys, used only for inner and full join
1515
onkeys = function(x, y) {
@@ -157,9 +157,9 @@ mergepair = function(lhs, rhs, on, how, mult, lhs.cols=names(lhs), rhs.cols=name
157157
stopf("'on' is missing and necessary key is not present")
158158
}
159159
if (any(bad.on <- !on %chin% names(lhs)))
160-
stopf("'on' argument specifies columns to join [%s] that are not present in %s table [%s]", brackify(on[bad.on]), "LHS", brackify(names(lhs)))
160+
stopf("'on' argument specifies columns to join %s that are not present in %s table %s", brackify(on[bad.on]), "LHS", brackify(names(lhs)))
161161
if (any(bad.on <- !on %chin% names(rhs)))
162-
stopf("'on' argument specifies columns to join [%s] that are not present in %s table [%s]", brackify(on[bad.on]), "RHS", brackify(names(rhs)))
162+
stopf("'on' argument specifies columns to join %s that are not present in %s table %s", brackify(on[bad.on]), "RHS", brackify(names(rhs)))
163163
} else if (is.null(on)) {
164164
on = character() ## cross join only
165165
}
@@ -203,7 +203,7 @@ mergepair = function(lhs, rhs, on, how, mult, lhs.cols=names(lhs), rhs.cols=name
203203
copy_x = TRUE
204204
## ensure no duplicated column names in merge results
205205
if (any(dup.i <- names(out.i) %chin% names(out.x)))
206-
stopf("merge result has duplicated column names [%s], use 'cols' argument or rename columns in 'l' tables", brackify(names(out.i)[dup.i]))
206+
stopf("merge result has duplicated column names %s, use 'cols' argument or rename columns in 'l' tables", brackify(names(out.i)[dup.i]))
207207
}
208208

209209
## stack i and x
@@ -257,6 +257,104 @@ mergepair = function(lhs, rhs, on, how, mult, lhs.cols=names(lhs), rhs.cols=name
257257
setDT(out)
258258
}
259259

260+
mergelist_impl_ = function(l, on, cols, how, mult, join.many, copy) {
261+
verbose = getOption("datatable.verbose")
262+
if (verbose)
263+
p = proc.time()[[3L]]
264+
265+
if (!is.list(l) || is.data.frame(l))
266+
stopf("'%s' must be a list", "l")
267+
if (!all(vapply_1b(l, is.data.table)))
268+
stopf("Every element of 'l' list must be data.table objects")
269+
if (!all(idx <- lengths(l) > 0L))
270+
stopf("Tables in 'l' must all have columns, but these entries have 0: %s", brackify(which(!idx)))
271+
if (any(idx <- vapply_1i(l, function(x) anyDuplicated(names(x))) > 0L))
272+
stopf("Column names in individual 'l' entries must be unique, but these have some duplicates: %s", brackify(which(idx)))
273+
274+
n = length(l)
275+
if (n < 2L) {
276+
out = if (n) l[[1L]] else as.data.table(l)
277+
if (copy) out = copy(out)
278+
if (verbose)
279+
catf("mergelist: merging %d table(s), took %.3fs\n", n, proc.time()[[3L]]-p)
280+
return(out)
281+
}
282+
283+
if (!is.list(join.many))
284+
join.many = rep(list(join.many), n - 1L)
285+
if (length(join.many) != n - 1L || !all(vapply_1b(join.many, isTRUEorFALSE)))
286+
stopf("'join.many' must be TRUE or FALSE, or a list of such whose length must be length(l)-1L")
287+
288+
if (missing(mult))
289+
mult = NULL
290+
if (!is.list(mult))
291+
mult = rep(list(mult), n - 1L)
292+
if (length(mult) != n - 1L || !all(vapply_1b(mult, function(x) is.null(x) || (is.character(x) && length(x) == 1L && !anyNA(x) && x %chin% c("error", "all", "first", "last")))))
293+
stopf("'mult' must be one of [error, all, first, last] or NULL, or a list of such whose length must be length(l)-1L")
294+
295+
if (!is.list(how))
296+
how = rep(list(how), n-1L)
297+
if (length(how)!=n-1L || !all(vapply_1b(how, function(x) is.character(x) && length(x)==1L && !anyNA(x) && x %chin% c("left", "inner", "full", "right", "semi", "anti", "cross"))))
298+
stopf("'how' must be one of [left, inner, full, right, semi, anti, cross], or a list of such whose length must be length(l)-1L")
299+
300+
if (is.null(cols)) {
301+
cols = vector("list", n)
302+
} else {
303+
if (!is.list(cols))
304+
stopf("'%s' must be a list", "cols")
305+
if (length(cols) != n)
306+
stopf("'cols' must be same length as 'l' (%d != %d)", length(cols), n)
307+
skip = vapply_1b(cols, is.null)
308+
if (!all(vapply_1b(cols[!skip], function(x) is.character(x) && !anyNA(x) && !anyDuplicated(x))))
309+
stopf("'cols' must be a list of non-zero length, non-NA, non-duplicated, character vectors, or eventually NULLs (all columns)")
310+
if (any(mapply(function(x, icols) !all(icols %chin% names(x)), l[!skip], cols[!skip])))
311+
stopf("'cols' specify columns not present in corresponding table")
312+
}
313+
314+
if (missing(on) || is.null(on)) {
315+
on = vector("list", n - 1L)
316+
} else {
317+
if (!is.list(on))
318+
on = rep(list(on), n - 1L)
319+
if (length(on) != n-1L || !all(vapply_1b(on, function(x) is.character(x) && !anyNA(x) && !anyDuplicated(x)))) ## length checked in dtmerge
320+
stopf("'on' must be non-NA, non-duplicated, character vector, or a list of such which length must be length(l)-1L")
321+
}
322+
323+
l.mem = lapply(l, vapply, address, "")
324+
out = l[[1L]]
325+
out.cols = cols[[1L]]
326+
for (join.i in seq_len(n - 1L)) {
327+
rhs.i = join.i + 1L
328+
out = mergepair(
329+
lhs = out, rhs = l[[rhs.i]],
330+
on = on[[join.i]],
331+
how = how[[join.i]], mult = mult[[join.i]],
332+
lhs.cols = out.cols, rhs.cols = cols[[rhs.i]],
333+
copy = FALSE, ## avoid any copies inside, will copy once below
334+
join.many = join.many[[join.i]],
335+
verbose = verbose
336+
)
337+
out.cols = copy(names(out))
338+
}
339+
out.mem = vapply_1c(out, address)
340+
if (copy)
341+
.Call(CcopyCols, out, colnamesInt(out, names(out.mem)[out.mem %chin% unique(unlist(l.mem, recursive=FALSE))]))
342+
if (verbose)
343+
catf("mergelist: merging %d tables, took %.3fs\n", n, proc.time()[[3L]] - p)
344+
out
345+
}
346+
347+
mergelist = function(l, on, cols=NULL, how=c("left", "inner", "full", "right", "semi", "anti", "cross"), mult, join.many=getOption("datatable.join.many")) {
348+
if (missing(how) || is.null(how))
349+
how = match.arg(how)
350+
mergelist_impl_(l, on, cols, how, mult, join.many, copy=TRUE)
351+
}
352+
setmergelist = function(l, on, cols=NULL, how=c("left", "inner", "full", "right", "semi", "anti", "cross"), mult, join.many=getOption("datatable.join.many")) {
353+
if (missing(how) || is.null(how))
354+
how = match.arg(how)
355+
invisible(mergelist_impl_(l, on, cols, how, mult, join.many, copy=FALSE))
356+
}
357+
260358
# Previously, we had a custom C implementation here, which is ~2x faster,
261359
# but this is fast enough we don't bother maintaining a new routine.
262360
# Hopefully in the future rep() can recognize the ALTREP and use that, too.

0 commit comments

Comments
 (0)