From 7049f9b64b5086dca855747c782581368f07c252 Mon Sep 17 00:00:00 2001 From: Benjamin Schwendinger Date: Sun, 26 Oct 2025 16:35:39 +0100 Subject: [PATCH] add setDT, setalloccol --- NEWS.md | 2 ++ R/as.data.table.R | 6 +++--- R/between.R | 6 +++--- R/data.table.R | 49 +++++++++++++++++++++++-------------------- R/fcast.R | 8 +++---- R/fmelt.R | 2 +- R/foverlaps.R | 2 +- R/frank.R | 13 +++++++++--- R/fread.R | 2 +- R/frollapply.R | 2 +- R/mergelist.R | 4 ++-- R/setkey.R | 4 ++-- R/test.data.table.R | 2 +- R/transpose.R | 2 +- R/utils.R | 2 +- R/xts.R | 2 +- inst/tests/tests.Rraw | 46 ++++++++++++++++++++++++++++++++++++++-- man/setDT.Rd | 7 +++++-- man/truelength.Rd | 7 +++++-- src/assign.c | 18 +++++++++++++--- src/data.table.h | 2 +- src/utils.c | 4 ++-- 22 files changed, 132 insertions(+), 60 deletions(-) diff --git a/NEWS.md b/NEWS.md index 43dd6308e9..9a1c02416c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -34,6 +34,8 @@ 5. Negative and missing values of `n` argument of adaptive rolling functions trigger an error. +6. `setDT()` and `setalloccol()` gain `duplicateShared` argument (default `TRUE`). When `TRUE`, columns that are shared with other objects are duplicated to avoid unintended modification of the original data, [#2683](https://github.com/Rdatatable/data.table/issues/2683). Previously, shared columns were not duplicated, which could lead to unexpected side effects. `frank()` now uses this internally to preserve names from the input vector and avoid modifying shared vectors, [#4240](https://github.com/Rdatatable/data.table/issues/4240). Thanks to @jangorecki, @BenoitLondon, and @MichaelChirico for the report and @ben-schwen for the fix. + ### NOTICE OF INTENDED FUTURE POTENTIAL BREAKING CHANGES 1. `data.table(x=1, )`, where `` is an expression resulting in a 1-column matrix without column names, will eventually have names `x` and `V2`, not `x` and `V1`, consistent with `data.table(x=1, )` where `` results in an atomic vector, for example `data.table(x=1, cbind(1))` and `data.table(x=1, 1)` will both have columns named `x` and `V2`. In this release, the matrix case continues to be named `V1`, but the new behavior can be activated by setting `options(datatable.old.matrix.autoname)` to `FALSE`. See point 5 under Bug Fixes for more context; this change will provide more internal consistency as well as more consistency with `data.frame()`. diff --git a/R/as.data.table.R b/R/as.data.table.R index f849aa8c10..146d580caf 100644 --- a/R/as.data.table.R +++ b/R/as.data.table.R @@ -69,7 +69,7 @@ as.data.table.matrix = function(x, keep.rownames=FALSE, key=NULL, ...) { for (i in ic) value[[i]] = as.vector(x[, i]) # to drop any row.names that would otherwise be retained inside every column of the data.table } col_labels = dimnames(x)[[2L]] - setDT(value) + setDT(value, duplicateShared=FALSE) if (length(col_labels) == ncols) { if (any(empty <- !nzchar(col_labels))) col_labels[empty] = paste0("V", ic[empty]) @@ -233,7 +233,7 @@ as.data.table.list = function(x, } } setattr(ans, "names", vnames) - setDT(ans, key=key) # copy ensured above; also, setDT handles naming + setDT(ans, key=key, duplicateShared=FALSE) # copy ensured above; also, setDT handles naming if (length(origListNames)==length(ans)) setattr(ans, "names", origListNames) # PR 3854 and tests 2058.15-17 ans } @@ -282,7 +282,7 @@ as.data.table.data.frame = function(x, keep.rownames=FALSE, key=NULL, ...) { # fix for #1078 and #1128, see .resetclass() for explanation. setattr(ans, "class", .resetclass(x, "data.frame")) - setalloccol(ans) + setalloccol(ans, duplicateShared=FALSE) setkeyv(ans, key) ans } diff --git a/R/between.R b/R/between.R index 63877313d6..e1f60e1052 100644 --- a/R/between.R +++ b/R/between.R @@ -66,8 +66,8 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE, # issue FR #707 # is x[i] found anywhere within [lower, upper] range? inrange = function(x,lower,upper,incbounds=TRUE) { - query = setDT(list(x=x)) - subject = setDT(list(l=lower, u=upper)) + query = setDT(list(x=x), duplicateShared=FALSE) + subject = setDT(list(l=lower, u=upper), duplicateShared=FALSE) ops = if (incbounds) c(4L, 2L) else c(5L, 3L) # >=,<= and >,< verbose = isTRUE(getOption("datatable.verbose")) if (verbose) {last.started.at=proc.time();catf("forderv(query) took ... ");flush.console()} @@ -81,7 +81,7 @@ inrange = function(x,lower,upper,incbounds=TRUE) { ) xo = ans$xo options(datatable.verbose=FALSE) - setDT(ans[c("starts", "lens")], key=c("starts", "lens")) + setDT(ans[c("starts", "lens")], key=c("starts", "lens"), duplicateShared=FALSE) options(datatable.verbose=verbose) if (verbose) {last.started.at=proc.time();catf("Generating final logical vector ... ");flush.console()} .Call(Cinrange, idx <- vector("logical", length(x)), xo, ans[["starts"]], ans[["lens"]]) diff --git a/R/data.table.R b/R/data.table.R index f8f39c8f7f..7d1190361a 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -38,7 +38,7 @@ null.data.table = function() { ans = list() setattr(ans,"class",c("data.table","data.frame")) setattr(ans,"row.names",.set_row_names(0L)) - setalloccol(ans) + setalloccol(ans, duplicateShared=FALSE) } data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE) @@ -73,7 +73,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str for (j in which(vapply_1b(ans, is.character))) set(ans, NULL, j, as_factor(.subset2(ans, j))) # as_factor is internal function in fread.R currently } - setalloccol(ans) # returns a NAMED==0 object, unlike data.frame() + setalloccol(ans, duplicateShared=FALSE) # returns a NAMED==0 object, unlike data.frame() } replace_dot_alias = function(e) { @@ -523,7 +523,7 @@ replace_dot_alias = function(e) { xo = ans$xo ## to make it available for further use. # temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this # 'setorder', as there's another 'setorder' in generating 'irows' below... - if (length(ans$indices)) setorder(setDT(ans[1L:3L]), indices) + if (length(ans$indices)) setorder(setDT(ans[1L:3L], duplicateShared=FALSE), indices) allLen1 = ans$allLen1 f__ = ans$starts len__ = ans$lens @@ -575,7 +575,7 @@ replace_dot_alias = function(e) { } if (nqbyjoin) { irows = if (length(xo)) xo[irows] else irows - xo = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]] + xo = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows), duplicateShared=FALSE))[["irows"]] ans = .Call(CnqRecreateIndices, xo, len__, indices__, max(indices__), nomatch) # issue#4388 fix f__ = ans[[1L]]; len__ = ans[[2L]] allLen1 = FALSE # TODO; should this always be FALSE? @@ -594,7 +594,7 @@ replace_dot_alias = function(e) { irows = xo[irows] # TO DO: fsort here? if (mult=="all" && !allGrp1) { # following #1991 fix, !allGrp1 will always be TRUE. TODO: revisit. if (verbose) {last.started.at=proc.time();catf("Reorder irows for 'mult==\"all\" && !allGrp1' ... ");flush.console()} - irows = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]] + irows = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows), duplicateShared=FALSE))[["irows"]] if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} } } @@ -1245,7 +1245,7 @@ replace_dot_alias = function(e) { # Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we # don't mind. } - setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope + setalloccol(x, n, verbose=verbose, duplicateShared=FALSE) # always assigns to calling scope; i.e. this scope if (is.name(name)) { assign(as.character(name),x,parent.frame(),inherits=TRUE) } else if (.is_simple_extraction(name)) { @@ -1352,7 +1352,7 @@ replace_dot_alias = function(e) { setattr(ans, "sorted", .join_result_key(x, i, ans, if (!missing(on)) names(on), ansvars, leftcols, rightcols, names_i, irows, roll)) setattr(ans, "class", class(x)) # retain class that inherits from data.table, #64 setattr(ans, "row.names", .set_row_names(length(ans[[1L]]))) - setalloccol(ans) + setalloccol(ans, duplicateShared=FALSE) } if (!with || missing(j)) return(ans) if (!is.data.table(ans)) setattr(ans, "class", c("data.table","data.frame")) # DF |> DT(,.SD[...]) .SD should be data.table, test 2212.013 @@ -2018,7 +2018,7 @@ replace_dot_alias = function(e) { } else if (.by_result_is_keyable(x, keyby, bysameorder, byjoin, allbyvars, bysub)) { setattr(ans, "sorted", names(ans)[seq_along(grpcols)]) } - setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line + setalloccol(ans, duplicateShared=FALSE) # TODO: overallocate in dogroups in the first place and remove this line } # can the specified merge of x and i be marked as sorted? return the columns for which this is true, otherwise NULL @@ -2242,7 +2242,7 @@ tail.data.table = function(x, n=6L, ...) { if (!cedta()) { x = if (nargs()<4L) `[<-.data.frame`(x, i, value=value) else `[<-.data.frame`(x, i, j, value) - return(setalloccol(x)) # over-allocate (again). Avoid all this by using :=. + return(setalloccol(x, duplicateShared=FALSE)) # over-allocate (again). Avoid all this by using :=. } # TO DO: warningf("Please use DT[i,j:=value] syntax instead of DT[i,j]<-value, for efficiency. See ?':='") if (!missing(i)) { @@ -2251,7 +2251,7 @@ tail.data.table = function(x, n=6L, ...) { if (is.matrix(i)) { if (!missing(j)) stopf("When i is a matrix in DT[i]<-value syntax, it doesn't make sense to provide j") x = `[<-.data.frame`(x, i, value=value) - return(setalloccol(x)) + return(setalloccol(x, duplicateShared=FALSE)) } i = x[i, which=TRUE] # Tried adding ... after value above, and passing ... in here (e.g. for mult="first") but R CMD check @@ -2284,11 +2284,11 @@ tail.data.table = function(x, n=6L, ...) { reinstatekey=key(x) } if (!selfrefok(x) || truelength(x) < ncol(x)+length(newnames)) { - x = setalloccol(x, length(x)+length(newnames)) # because [<- copies via *tmp* and main/duplicate.c copies at length but copies truelength over too + x = setalloccol(x, length(x)+length(newnames), duplicateShared=FALSE) # because [<- copies via *tmp* and main/duplicate.c copies at length but copies truelength over too # search for one other .Call to assign in [.data.table to see how it differs } x = .Call(Cassign,copy(x),i,cols,newnames,value) # From 3.1.0, DF[2,"b"] = 7 no longer copies DF$a (so in this [<-.data.table method we need to copy) - setalloccol(x) # can maybe avoid this realloc, but this is (slow) [<- anyway, so just be safe. + setalloccol(x, duplicateShared=FALSE) # can maybe avoid this realloc, but this is (slow) [<- anyway, so just be safe. if (length(reinstatekey)) setkeyv(x,reinstatekey) invisible(x) # no copy at all if user calls directly; i.e. `[<-.data.table`(x,i,j,value) @@ -2302,7 +2302,7 @@ tail.data.table = function(x, n=6L, ...) { "$<-.data.table" = function(x, name, value) { if (!cedta()) { ans = `$<-.data.frame`(x, name, value) # nocov - return(setalloccol(ans)) # nocov. over-allocate (again) + return(setalloccol(ans, duplicateShared=FALSE)) # nocov. over-allocate (again) } x = copy(x) set(x,j=name,value=value) # important i is missing here @@ -2359,7 +2359,7 @@ dimnames.data.table = function(x) { setattr(x,"names",NULL) # e.g. plyr::melt() calls base::unname() else { setnames(x,value) - setalloccol(x) + setalloccol(x, duplicateShared=FALSE) } x # this returned value is now shallow copied by R 3.1.0 via *tmp*. A very welcome change. } @@ -2601,7 +2601,7 @@ copy = function(x) { reallocate = function(y) { if (is.data.table(y)) { .Call(C_unlock, y) - setalloccol(y) + setalloccol(y, duplicateShared=FALSE) } else if (is.list(y)) { oldClass = class(y) setattr(y, 'class', NULL) # otherwise [[.person method (which returns itself) results in infinite recursion, #4620 @@ -2666,11 +2666,11 @@ shallow = function(x, cols=NULL) { ans } -setalloccol = alloc.col = function(DT, n=getOption("datatable.alloccol"), verbose=getOption("datatable.verbose")) +setalloccol = alloc.col = function(DT, n=getOption("datatable.alloccol"), verbose=getOption("datatable.verbose"), duplicateShared=TRUE) { name = substitute(DT) if (identical(name, quote(`*tmp*`))) stopf("setalloccol attempting to modify `*tmp*`") - ans = .Call(Calloccolwrapper, DT, eval(n), verbose) + ans = .Call(Calloccolwrapper, DT, eval(n), verbose, duplicateShared) if (is.name(name)) { name = as.character(name) assign(name,ans,parent.frame(),inherits=TRUE) @@ -2875,7 +2875,7 @@ rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL, ignore.attr=F } ans = .Call(Crbindlist, l, use.names, fill, idcol, ignore.attr) if (!length(ans)) return(null.data.table()) - setDT(ans)[] + setDT(ans, duplicateShared=FALSE)[] } vecseq = function(x,y,clamp) .Call(Cvecseq,x,y,clamp) @@ -2953,7 +2953,7 @@ setDF = function(x, rownames=NULL) { invisible(x) } -setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { +setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE, duplicateShared=TRUE) { name = substitute(x) if (is.name(name)) { home = function(x, env) { @@ -2968,12 +2968,14 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { stopf("Cannot convert '%1$s' to data.table by reference because binding is locked. It is very likely that '%1$s' resides within a package (or an environment) that is locked to prevent modifying its variable bindings. Try copying the object to your current environment, ex: var <- copy(var) and then using setDT again.", cname) } } + if (!isTRUEorFALSE(duplicateShared)) + stopf("'%s' must be TRUE or FALSE", "duplicateShared") if (is.data.table(x)) { # fix for #1078 and #1128, see .resetclass() for explanation. setattr(x, 'class', .resetclass(x, 'data.table')) if (!missing(key)) setkeyv(x, key) # fix for #1169 if (check.names) setattr(x, "names", make.names(names(x), unique=TRUE)) - if (selfrefok(x) > 0L) return(invisible(x)) else setalloccol(x) + if (selfrefok(x) > 0L) return(invisible(x)) else setalloccol(x, duplicateShared=duplicateShared) } else if (is.data.frame(x)) { # check no matrix-like columns, #3760. Allow a single list(matrix) is unambiguous and depended on by some revdeps, #3581 # for performance, only warn on the first such column, #5426 @@ -2988,8 +2990,9 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { setattr(x, "row.names", .set_row_names(nrow(x))) if (check.names) setattr(x, "names", make.names(names(x), unique=TRUE)) # fix for #1078 and #1128, see .resetclass() for explanation. - setattr(x, "class", .resetclass(x, 'data.frame')) - setalloccol(x) + setalloccol(x, duplicateShared=duplicateShared) + setattr(x, "class", .resetclass(x, 'data.frame')) # selfrek not ok after setattr class + setalloccol(x, duplicateShared=duplicateShared) if (!is.null(rn)) { nm = c(if (is.character(keep.rownames)) keep.rownames[1L] else "rn", names(x)) x[, (nm[1L]) := rn] @@ -3017,7 +3020,7 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { } setattr(x, "row.names", .set_row_names(nrow)) setattr(x, "class", c("data.table", "data.frame")) - setalloccol(x) + setalloccol(x, duplicateShared = duplicateShared) } else { stopf("Argument 'x' to 'setDT' should be a 'list', 'data.frame' or 'data.table'") } diff --git a/R/fcast.R b/R/fcast.R index f059812dd0..e8b4d0bdd4 100644 --- a/R/fcast.R +++ b/R/fcast.R @@ -162,7 +162,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., if (any(vapply_1b(dat[varnames], is.list))) { stopf("Columns specified in formula can not be of type list") } - setDT(dat) + setDT(dat, duplicateShared=FALSE) m = as.list(match.call()[-1L]) subset = m[["subset"]][[2L]] @@ -214,7 +214,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., lhs = shallow(dat, lhsnames); rhs = shallow(dat, rhsnames); val = shallow(dat, valnames) # handle drop=TRUE/FALSE - Update: Logic moved to R, AND faster than previous version. Take that... old me :-). if (all(drop)) { - map = setDT(lapply(list(lhsnames, rhsnames), function(cols) frankv(dat, cols=cols, ties.method="dense", na.last=FALSE))) # #2202 fix + map = setDT(lapply(list(lhsnames, rhsnames), function(cols) frankv(dat, cols=cols, ties.method="dense", na.last=FALSE)), duplicateShared=FALSE) # #2202 fix maporder = lapply(map, order_) mapunique = lapply(seq_along(map), function(i) .Call(CsubsetVector, map[[i]], maporder[[i]])) lhs = .Call(CsubsetDT, lhs, maporder[[1L]], seq_along(lhs)) @@ -225,7 +225,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., map = vector("list", 2L) .Call(Csetlistelt, map, 1L, lhs_[lhs, which=TRUE]) .Call(Csetlistelt, map, 2L, rhs_[rhs, which=TRUE]) - setDT(map) + setDT(map, duplicateShared=FALSE) mapunique = vector("list", 2L) .Call(Csetlistelt, mapunique, 1L, seq_len(nrow(lhs_))) .Call(Csetlistelt, mapunique, 2L, seq_len(nrow(rhs_))) @@ -245,7 +245,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ..., else c(CJ(valnames, allcols, sorted=FALSE), sep=sep)) # removed 'setcolorder()' here, #1153 setattr(ans, 'names', c(lhsnames, allcols)) - setDT(ans) + setDT(ans, duplicateShared=FALSE) setattr(ans, 'sorted', lhsnames) ans } diff --git a/R/fmelt.R b/R/fmelt.R index c6f435578b..e17071b3b1 100644 --- a/R/fmelt.R +++ b/R/fmelt.R @@ -216,7 +216,7 @@ melt.data.table = function(data, id.vars, measure.vars, variable.name = "variabl as.logical(variable.factor), as.logical(value.factor), variable.name, value.name, as.logical(na.rm), as.logical(verbose)) - setDT(ans) + setDT(ans, duplicateShared=FALSE) if (anyDuplicated(names(ans))) { catf("Duplicate column names found in molten data.table. Setting unique names using 'make.names'\n") setnames(ans, make.unique(names(ans))) diff --git a/R/foverlaps.R b/R/foverlaps.R index 7bbaf0dc13..e684d5c2e9 100644 --- a/R/foverlaps.R +++ b/R/foverlaps.R @@ -172,7 +172,7 @@ foverlaps = function(x, y, by.x=key(x) %||% key(y), by.y=key(y), maxgap=0L, mino } # nocov end - setDT(olaps) + setDT(olaps, duplicateShared=FALSE) setnames(olaps, c("xid", "yid")) yid = NULL # for 'no visible binding for global variable' from R CMD check on i clauses below diff --git a/R/frank.R b/R/frank.R index 419f5ea414..a532eb27a0 100644 --- a/R/frank.R +++ b/R/frank.R @@ -5,6 +5,7 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a warningf("length(na.last) > 1, only the first element will be used") na.last = na.last[1L] } + input_names = NULL keep = (na.last == "keep") na.last = as.logical(na.last) as_list = function(x) { @@ -16,6 +17,7 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a if (!missing(cols) && !is.null(cols)) stopf("x is a single vector, non-NULL 'cols' doesn't make sense") cols = 1L + input_names = names(x) x = as_list(x) } else { cols = colnamesInt(x, cols, check_dups=TRUE) @@ -24,7 +26,7 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a } # need to unlock for #4429 x = .shallow(x, cols, unlock = TRUE) # shallow copy even if list.. - setDT(x) + setDT(x, duplicateShared=TRUE) cols = seq_along(cols) if (is.na(na.last)) { if ("..na_prefix.." %chin% names(x)) @@ -66,10 +68,15 @@ frankv = function(x, cols=seq_along(x), order=1L, na.last=TRUE, ties.method=c("a # take care of na.last="keep" V1 = NULL # for R CMD CHECK warning if (isTRUE(keep)) { - ans = (setDT(as_list(ans))[which_(nas, TRUE), V1 := NA])[[1L]] + ans = (setDT(as_list(ans), duplicateShared=TRUE)[which_(nas, TRUE), V1 := NA])[[1L]] } else if (is.na(na.last)) { - ans = ans[which_(nas, FALSE)] + idx = which_(nas, FALSE) + if (!is.null(input_names)) + input_names = input_names[idx] + ans = ans[idx] } + if (!is.null(input_names)) + names(ans) = input_names ans } diff --git a/R/fread.R b/R/fread.R index 16a72ed24d..1c4722d697 100644 --- a/R/fread.R +++ b/R/fread.R @@ -301,7 +301,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC") if (isTRUE(data.table)) { setattr(ans, "class", c("data.table", "data.frame")) - setalloccol(ans) + setalloccol(ans, duplicateShared=FALSE) } else { setattr(ans, "class", "data.frame") } diff --git a/R/frollapply.R b/R/frollapply.R index ee9d785bcd..abfd901bb5 100644 --- a/R/frollapply.R +++ b/R/frollapply.R @@ -265,7 +265,7 @@ frollapply = function(X, N, FUN, ..., by.column=TRUE, fill=NA, align=c("right"," } else { rev.d = function(d) { l = lapply(d, rev) - if (is.data.table(d)) setDT(l) else if (is.data.frame(d)) setDF(l) else l + if (is.data.table(d)) setDT(l, duplicateShared=FALSE) else if (is.data.frame(d)) setDF(l) else l } X = lapply(X, rev.d) } diff --git a/R/mergelist.R b/R/mergelist.R index 7ff0e87bcf..b2042bcf64 100644 --- a/R/mergelist.R +++ b/R/mergelist.R @@ -4,7 +4,7 @@ cbindlist_impl_ = function(l, copy) { setattr(ans, "sorted", NULL) setattr(ans, "index", NULL) } - setDT(ans) + setDT(ans, duplicateShared=FALSE) ans } @@ -254,7 +254,7 @@ mergepair = function(lhs, rhs, on, how, mult, lhs.cols=names(lhs), rhs.cols=name out = rbindlist(list(out.l, out.r), use.names=TRUE, fill=TRUE) } } - setDT(out) + setDT(out, duplicateShared=FALSE) } mergelist_impl_ = function(l, on, cols, how, mult, join.many, copy) { diff --git a/R/setkey.R b/R/setkey.R index 4ba5be4d71..63a7230561 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -331,8 +331,8 @@ CJ = function(..., sorted = TRUE, unique = FALSE) nrow = prod(lengths(l)) if (nrow > .Machine$integer.max) stopf("Cross product of elements provided to CJ() would result in %.0f rows which exceeds .Machine$integer.max == %d", nrow, .Machine$integer.max) l = .Call(Ccj, l) - setDT(l) - l = setalloccol(l) # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency since + setDT(l, duplicateShared=FALSE) + l = setalloccol(l, duplicateShared=FALSE) # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency since # it's possible a user may wish to use SJ directly outside a join and would expect consistent over-allocation setnames(l, vnames) if (sorted) { diff --git a/R/test.data.table.R b/R/test.data.table.R index 6e264c871f..5b6a7deacb 100644 --- a/R/test.data.table.R +++ b/R/test.data.table.R @@ -154,7 +154,7 @@ test.data.table = function(script="tests.Rraw", verbose=FALSE, pkg=".", silent=F # runtime test number (i.e. 'numStr') since we're just doing a static check here, though we _are_ careful to match the # full test expression string, i.e., not just limited to numeric literal test numbers. arg_line = call_id = col1 = col2 = i.line1 = id = line1 = parent = preceding_line = test_start_line = text = token = x.line1 = x.parent = NULL # R CMD check - pd = setDT(utils::getParseData(parse(fn, keep.source=TRUE))) + pd = setDT(utils::getParseData(parse(fn, keep.source=TRUE)), duplicateShared=FALSE) file_lines = readLines(fn) # NB: a call looks like (with id/parent tracking) # diff --git a/R/transpose.R b/R/transpose.R index 0da0123b94..3423548c38 100644 --- a/R/transpose.R +++ b/R/transpose.R @@ -18,7 +18,7 @@ transpose = function(l, fill=NA, ignore.empty=FALSE, keep.names=NULL, make.names if (!is.null(make.names)) setattr(ans, "names", c(keep.names, colnames)) else if (is.data.frame(l)) # including data.table but not plain list setattr(ans, "names", c(keep.names, paste0("V", seq_len(length(ans)-length(keep.names))))) - if (is.data.table(l)) setDT(ans) + if (is.data.table(l)) setDT(ans, duplicateShared=FALSE) else if (is.data.frame(l)) setDF(ans) ans[] } diff --git a/R/utils.R b/R/utils.R index 39a623b36d..ff5671d64a 100644 --- a/R/utils.R +++ b/R/utils.R @@ -207,7 +207,7 @@ is_utc = function(tz) { # nocov start #593 always return a data.table edit.data.table = function(name, ...) { - setDT(NextMethod('edit', name))[] + setDT(NextMethod('edit', name), duplicateShared=FALSE)[] } # nocov end diff --git a/R/xts.R b/R/xts.R index 582cfa5b0a..58fcb33606 100644 --- a/R/xts.R +++ b/R/xts.R @@ -5,7 +5,7 @@ as.data.table.xts = function(x, keep.rownames = TRUE, key=NULL, ...) { if (is.na(keep.rownames)) stopf("keep.rownames must not be NA") # as.data.frame.xts will handle copying, and # the error check above ensures as.data.frame.xts is applied - r = setDT(as.data.frame(x, row.names=NULL)) + r = setDT(as.data.frame(x, row.names=NULL), duplicateShared=FALSE) if (identical(keep.rownames, FALSE)) return(r[]) index_nm = if (is.character(keep.rownames)) keep.rownames else "index" if (index_nm %chin% names(x)) stopf("Input xts object should not have '%s' column because it would result in duplicate column names. Rename '%s' column in xts or use `keep.rownames` to change the index column name.", index_nm, index_nm) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 11a02fabad..541149286a 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -16224,13 +16224,13 @@ test(2087.01, x[a == 1L, .(b, b2=b)][ , address(b)!=address(b2)]) # setkey detects and copies shared address columns, #3496 x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE) x$b = x$a -setDT(x) +setDT(x, duplicateShared=FALSE) test(2087.02, setkey(x, a, verbose=TRUE), data.table(a=paste0(1:2), b=paste0(1:2), key="a"), output='Found and copied 1 column with a shared memory address') x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE) x$b = x$a x$c = x$a -setDT(x) +setDT(x, duplicateShared=FALSE) test(2087.03, setkey(x, a, verbose=TRUE), data.table(a=paste0(1:2), b=paste0(1:2), c=paste0(1:2), key="a"), output='Found and copied 2 columns with a shared memory address') # follow-up from #3890; function body and variable in calling scope @@ -21837,3 +21837,45 @@ DT[, V1000 := 20:1] test(2343.1, forderv(DT, by=names(DT), sort=FALSE, retGrp=TRUE), forderv(DT, by=c("V1", "V1000"), sort=FALSE, retGrp=TRUE)) x = c(rep(0, 7e5), 1e6) test(2343.2, forderv(list(x)), integer(0)) + +# setDT should not modify shared vectors #2683 +sDT = function(x) setalloccol(setattr(x, "class", c("data.table", "data.frame")), duplicateShared=TRUE) +y = c(a=4, b=5) +x = list(c=y) +test(2344.01, {sDT(x); y}, c(a=4, b=5)) +y = c(a=4, b=5) +x = list(c=y) +test(2344.02, {setDT(x, duplicateShared=TRUE); y}, c(a=4, b=5)) +y = c(a=4, b=5) +x = list(c=y) +setattr(x, "class", c("data.table", "data.frame")) +test(2344.03, {sDT(x)[1, c := 0L]; y}, c(a=4, b=5)) +y = c(a=4, b=5) +x = list(c=y) +test(2344.04, {setDT(x, duplicateShared=TRUE)[1, c := 0L]; y}, c(a=4, b=5)) +# frank should not modify shared vectors #4240 +x = setNames(c(3,1,4,1,5,9,2,6,5,3,5), letters[1:11]) +test(2344.11, {frank(y<-copy(x)); names(y)}, names(x)) +test(2344.12, frank(copy(x)), rank(copy(x))) +# consistent semantics after setDT #4783 +df = data.frame(a=c(1,2,3,4,5), b=c(2,3,4,5,6)) +d1 = copy(df) +d2 = d1 +setDT(d2) +test(2344.21, {setDT(d2); class(d1)}, "data.frame") +d1 = copy(df) +d2 = d1 +setDT(d2) +test(2344.22, {d2[, b:=3:7]; d1}, df) +d1 = copy(df) +d2 = d1 +setDT(d2) +test(2344.23, {d2[, c:=4:8]; d1}, df) +d1 = copy(df) +d2 = d1 +setDT(d2) +test(2344.24, {d2[!is.na(a), b:=5:9]; d1}, df) +d1 = copy(df) +d2 = d1 +setDT(d2) +test(2344.25, {d2[, b:=30]; d1}, df) diff --git a/man/setDT.Rd b/man/setDT.Rd index 6dfb7ac701..0d852047de 100644 --- a/man/setDT.Rd +++ b/man/setDT.Rd @@ -8,19 +8,22 @@ } \usage{ -setDT(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) +setDT(x, keep.rownames=FALSE, key=NULL, check.names=FALSE, duplicateShared=TRUE) } \arguments{ \item{x}{ A named or unnamed \code{list}, \code{data.frame} or \code{data.table}. } - \item{keep.rownames}{ For \code{data.frame}s, \code{TRUE} retains the \code{data.frame}'s row names under a new column \code{rn}. \code{keep.rownames = "id"} names the column \code{"id"} instead. } + \item{keep.rownames}{ For \code{data.frame}s, \code{TRUE} retains the \code{data.frame}'s row names under a new column \code{rn}. \code{keep.rownames = "id"} names the column \code{"id"} instead. } \item{key}{ Character vector of one or more column names which is passed to \code{\link{setkeyv}}. } \item{check.names}{ Just as \code{check.names} in \code{\link{data.frame}}. } + \item{duplicateShared}{ Logical. When \code{TRUE}, columns that are shared with other objects (i.e., \code{MAYBE_SHARED} is true) are duplicated to avoid unintended modification of the original data. Default is \code{TRUE}. } } \details{ When working on large \code{list}s or \code{data.frame}s, it might be both time- and memory-consuming to convert them to a \code{data.table} using \code{as.data.table(.)}, which will make a complete copy of the input object before converting it to a \code{data.table}. \code{setDT} takes care of this issue by converting any \code{list} (named or unnamed, data.frame or not) \emph{by reference} instead. That is, the input object is modified in place with no copy. This should come with low overhead, but note that \code{setDT} does check that the input is valid by looking for inconsistent input lengths and inadmissible column types (e.g. matrix). + + The \code{duplicateShared} parameter controls whether columns that might be shared with other R objects are duplicated. When \code{TRUE} (the default), \code{setDT} checks each column to see if it is shared (using R's internal \code{MAYBE_SHARED} macro) and duplicates any shared columns. This prevents unintended side effects where modifying the data.table would also modify the original objects. For example, if a list contains a vector \code{y} that is also referenced elsewhere, operations on the resulting data.table would not affect \code{y} when \code{duplicateShared=TRUE}. Set to \code{FALSE} only if you are certain no columns are shared or if you explicitly want shared column behavior for performance reasons. } \value{ diff --git a/man/truelength.Rd b/man/truelength.Rd index a85f78b1b6..02455b82af 100644 --- a/man/truelength.Rd +++ b/man/truelength.Rd @@ -10,16 +10,19 @@ truelength(x) setalloccol(DT, n = getOption("datatable.alloccol"), # default: 1024L - verbose = getOption("datatable.verbose")) # default: FALSE + verbose = getOption("datatable.verbose"), # default: FALSE + duplicateShared = TRUE) alloc.col(DT, n = getOption("datatable.alloccol"), # default: 1024L - verbose = getOption("datatable.verbose")) # default: FALSE + verbose = getOption("datatable.verbose"), # default: FALSE + duplicateShared = TRUE) } \arguments{ \item{x}{ Any type of vector, including \code{data.table} which is a \code{list} vector of column pointers. } \item{DT}{ A \code{data.table}. } \item{n}{ The number of spare column pointer slots to ensure are available. If \code{DT} is a 1,000 column \code{data.table} with 24 spare slots remaining, \code{n=1024L} means grow the 24 spare slots to be 1024. \code{truelength(DT)} will then be 2024 in this example. } \item{verbose}{ Output status and information. } +\item{duplicateShared}{ Logical. When \code{TRUE} (the default), columns that are shared with other objects (i.e., \code{MAYBE_SHARED} is true) are duplicated to avoid unintended modification of the original data. Set to \code{FALSE} only if you are certain no columns are shared or if you explicitly want shared column behavior for performance reasons. } } \details{ When adding columns by reference using \code{:=}, we \emph{could} simply create a new column list vector (one longer) and memcpy over the old vector, diff --git a/src/assign.c b/src/assign.c index eee00cc0f5..5e5e1b0215 100644 --- a/src/assign.c +++ b/src/assign.c @@ -285,15 +285,27 @@ int checkOverAlloc(SEXP x) return ans; } -SEXP alloccolwrapper(SEXP dt, SEXP overAllocArg, SEXP verbose) { +SEXP alloccolwrapper(SEXP dt, SEXP overAllocArg, SEXP verbose, SEXP duplicateSharedArg) { if (!IS_TRUE_OR_FALSE(verbose)) error(_("%s must be TRUE or FALSE"), "verbose"); - int overAlloc = checkOverAlloc(overAllocArg); + bool duplicateShared = false; + if (duplicateSharedArg != R_NilValue) { + if (!IS_TRUE_OR_FALSE(duplicateSharedArg)) + error(_("%s must be TRUE or FALSE"), "duplicateShared"); + duplicateShared = LOGICAL(duplicateSharedArg)[0]; + } + const int overAlloc = checkOverAlloc(overAllocArg); SEXP ans = PROTECT(alloccol(dt, length(dt)+overAlloc, LOGICAL(verbose)[0])); for(R_len_t i = 0; i < LENGTH(ans); i++) { + SEXP col = VECTOR_ELT(ans, i); + // Check if column is shared with another object and duplicate on request (#2683) + if (duplicateShared && MAYBE_SHARED(col)) { + col = duplicate(col); + SET_VECTOR_ELT(ans, i, col); + } // clear names; also excluded by copyMostAttrib(). Primarily for data.table and as.data.table, but added here centrally (see #103). - setAttrib(VECTOR_ELT(ans, i), R_NamesSymbol, R_NilValue); + setAttrib(col, R_NamesSymbol, R_NilValue); // But don't clear dim and dimnames. Because as from 1.12.4 we keep the matrix column as-is and ask user to use as.data.table to // unpack matrix columns when they really need to; test 2089.2 diff --git a/src/data.table.h b/src/data.table.h index 663f0adb46..8e8fbac87b 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -354,7 +354,7 @@ SEXP setattrib(SEXP, SEXP, SEXP); SEXP assign(SEXP, SEXP, SEXP, SEXP, SEXP); SEXP copy(SEXP); SEXP setdt_nrows(SEXP); -SEXP alloccolwrapper(SEXP, SEXP, SEXP); +SEXP alloccolwrapper(SEXP, SEXP, SEXP, SEXP); SEXP selfrefokwrapper(SEXP, SEXP); SEXP truelength(SEXP); SEXP setcharvec(SEXP, SEXP, SEXP); diff --git a/src/utils.c b/src/utils.c index b159fa0e60..c81756b1b0 100644 --- a/src/utils.c +++ b/src/utils.c @@ -500,8 +500,8 @@ int n_columns(SEXP x) { error("internal error: C setDT should be called only on a list"); // # nocov setAttrib(x, R_ClassSymbol, char2_dtdf()); setAttrib(x, sym_rownames, set_row_names(n_rows(x))); - return alloccolwrapper(x, GetOption(sym_alloccol, R_NilValue), GetOption(sym_verbose, R_NilValue)); - }*/ + return alloccolwrapper(x, GetOption(sym_alloccol, R_NilValue), GetOption(sym_verbose, R_NilValue), R_NilValue); +}*/ // inherits(x, "data.table") bool isDataTable(SEXP x) {