Rdatatable · ben-schwen · Oct 26, 2025 · jangorecki · Oct 26, 2025
@@ -34,6 +34,8 @@
 
 5. Negative and missing values of `n` argument of adaptive rolling functions trigger an error.
 
+6. `setDT()` and `setalloccol()` gain `duplicateShared` argument (default `TRUE`). When `TRUE`, columns that are shared with other objects are duplicated to avoid unintended modification of the original data, [#2683](https://github.com/Rdatatable/data.table/issues/2683). Previously, shared columns were not duplicated, which could lead to unexpected side effects. `frank()` now uses this internally to preserve names from the input vector and avoid modifying shared vectors, [#4240](https://github.com/Rdatatable/data.table/issues/4240). Thanks to @jangorecki, @BenoitLondon, and @MichaelChirico for the report and @ben-schwen for the fix.
+
 ### NOTICE OF INTENDED FUTURE POTENTIAL BREAKING CHANGES 
 
 1. `data.table(x=1, <expr>)`, where `<expr>` is an expression resulting in a 1-column matrix without column names, will eventually have names `x` and `V2`, not `x` and `V1`, consistent with `data.table(x=1, <expr>)` where `<expr>` results in an atomic vector, for example `data.table(x=1, cbind(1))` and `data.table(x=1, 1)` will both have columns named `x` and `V2`. In this release, the matrix case continues to be named `V1`, but the new behavior can be activated by setting `options(datatable.old.matrix.autoname)` to `FALSE`. See point 5 under Bug Fixes for more context; this change will provide more internal consistency as well as more consistency with `data.frame()`.

@@ -69,7 +69,7 @@ as.data.table.matrix = function(x, keep.rownames=FALSE, key=NULL, ...) {
     for (i in ic) value[[i]] = as.vector(x[, i])       # to drop any row.names that would otherwise be retained inside every column of the data.table
   }
   col_labels = dimnames(x)[[2L]]
-  setDT(value)
+  setDT(value, duplicateShared=FALSE)
   if (length(col_labels) == ncols) {
     if (any(empty <- !nzchar(col_labels)))
       col_labels[empty] = paste0("V", ic[empty])
@@ -233,7 +233,7 @@ as.data.table.list = function(x,
     }
   }
   setattr(ans, "names", vnames)
-  setDT(ans, key=key) # copy ensured above; also, setDT handles naming
+  setDT(ans, key=key, duplicateShared=FALSE) # copy ensured above; also, setDT handles naming
   if (length(origListNames)==length(ans)) setattr(ans, "names", origListNames)  # PR 3854 and tests 2058.15-17
   ans
 }
@@ -282,7 +282,7 @@ as.data.table.data.frame = function(x, keep.rownames=FALSE, key=NULL, ...) {
 
   # fix for #1078 and #1128, see .resetclass() for explanation.
   setattr(ans, "class", .resetclass(x, "data.frame"))
-  setalloccol(ans)
+  setalloccol(ans, duplicateShared=FALSE)
   setkeyv(ans, key)
   ans
 }

@@ -66,8 +66,8 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE,
 # issue FR #707
 # is x[i] found anywhere within [lower, upper] range?
 inrange = function(x,lower,upper,incbounds=TRUE) {
-  query = setDT(list(x=x))
-  subject = setDT(list(l=lower, u=upper))
+  query = setDT(list(x=x), duplicateShared=FALSE)
+  subject = setDT(list(l=lower, u=upper), duplicateShared=FALSE)
   ops = if (incbounds) c(4L, 2L) else c(5L, 3L) # >=,<= and >,<
   verbose = isTRUE(getOption("datatable.verbose"))
   if (verbose) {last.started.at=proc.time();catf("forderv(query) took ... ");flush.console()}
@@ -81,7 +81,7 @@ inrange = function(x,lower,upper,incbounds=TRUE) {
   )
   xo = ans$xo
   options(datatable.verbose=FALSE)
-  setDT(ans[c("starts", "lens")], key=c("starts", "lens"))
+  setDT(ans[c("starts", "lens")], key=c("starts", "lens"), duplicateShared=FALSE)
   options(datatable.verbose=verbose)
   if (verbose) {last.started.at=proc.time();catf("Generating final logical vector ... ");flush.console()}
   .Call(Cinrange, idx <- vector("logical", length(x)), xo, ans[["starts"]], ans[["lens"]])

@@ -38,7 +38,7 @@ null.data.table = function() {
   ans = list()
   setattr(ans,"class",c("data.table","data.frame"))
   setattr(ans,"row.names",.set_row_names(0L))
-  setalloccol(ans)
+  setalloccol(ans, duplicateShared=FALSE)
 }
 
 data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE)
@@ -73,7 +73,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str
     for (j in which(vapply_1b(ans, is.character))) set(ans, NULL, j, as_factor(.subset2(ans, j)))
     # as_factor is internal function in fread.R currently
   }
-  setalloccol(ans)  # returns a NAMED==0 object, unlike data.frame()
+  setalloccol(ans, duplicateShared=FALSE)  # returns a NAMED==0 object, unlike data.frame()
 }
 
 replace_dot_alias = function(e) {
@@ -523,7 +523,7 @@ replace_dot_alias = function(e) {
       xo = ans$xo ## to make it available for further use.
       # temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this
       # 'setorder', as there's another 'setorder' in generating 'irows' below...
-      if (length(ans$indices)) setorder(setDT(ans[1L:3L]), indices)
+      if (length(ans$indices)) setorder(setDT(ans[1L:3L], duplicateShared=FALSE), indices)
       allLen1 = ans$allLen1
       f__ = ans$starts
       len__ = ans$lens
@@ -575,7 +575,7 @@ replace_dot_alias = function(e) {
         }
         if (nqbyjoin) {
           irows = if (length(xo)) xo[irows] else irows
-          xo = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]]
+          xo = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows), duplicateShared=FALSE))[["irows"]]
           ans = .Call(CnqRecreateIndices, xo, len__, indices__, max(indices__), nomatch) # issue#4388 fix
           f__ = ans[[1L]]; len__ = ans[[2L]]
           allLen1 = FALSE # TODO; should this always be FALSE?
@@ -594,7 +594,7 @@ replace_dot_alias = function(e) {
         irows = xo[irows]   # TO DO: fsort here?
         if (mult=="all" && !allGrp1) { # following #1991 fix, !allGrp1 will always be TRUE. TODO: revisit.
           if (verbose) {last.started.at=proc.time();catf("Reorder irows for 'mult==\"all\" && !allGrp1' ... ");flush.console()}
-          irows = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]]
+          irows = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows), duplicateShared=FALSE))[["irows"]]
           if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
         }
       }
@@ -1245,7 +1245,7 @@ replace_dot_alias = function(e) {
               # Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we
               # don't mind.
             }
-            setalloccol(x, n, verbose=verbose)   # always assigns to calling scope; i.e. this scope
+            setalloccol(x, n, verbose=verbose, duplicateShared=FALSE)   # always assigns to calling scope; i.e. this scope
             if (is.name(name)) {
               assign(as.character(name),x,parent.frame(),inherits=TRUE)
             } else if (.is_simple_extraction(name)) {
@@ -1352,7 +1352,7 @@ replace_dot_alias = function(e) {
         setattr(ans, "sorted", .join_result_key(x, i, ans, if (!missing(on)) names(on), ansvars, leftcols, rightcols, names_i, irows, roll))
         setattr(ans, "class", class(x))  # retain class that inherits from data.table, #64
         setattr(ans, "row.names", .set_row_names(length(ans[[1L]])))
-        setalloccol(ans)
+        setalloccol(ans, duplicateShared=FALSE)
       }
       if (!with || missing(j)) return(ans)
       if (!is.data.table(ans)) setattr(ans, "class", c("data.table","data.frame"))  # DF |> DT(,.SD[...]) .SD should be data.table, test 2212.013
@@ -2018,7 +2018,7 @@ replace_dot_alias = function(e) {
   } else if (.by_result_is_keyable(x, keyby, bysameorder, byjoin, allbyvars, bysub)) {
     setattr(ans, "sorted", names(ans)[seq_along(grpcols)])
   }
-  setalloccol(ans)   # TODO: overallocate in dogroups in the first place and remove this line
+  setalloccol(ans, duplicateShared=FALSE)   # TODO: overallocate in dogroups in the first place and remove this line
 }
 
 # can the specified merge of x and i be marked as sorted? return the columns for which this is true, otherwise NULL
@@ -2242,7 +2242,7 @@ tail.data.table = function(x, n=6L, ...) {
   if (!cedta()) {
     x = if (nargs()<4L) `[<-.data.frame`(x, i, value=value)
         else `[<-.data.frame`(x, i, j, value)
-    return(setalloccol(x))    # over-allocate (again).   Avoid all this by using :=.
+    return(setalloccol(x, duplicateShared=FALSE))    # over-allocate (again).   Avoid all this by using :=.
   }
   # TO DO: warningf("Please use DT[i,j:=value] syntax instead of DT[i,j]<-value, for efficiency. See ?':='")
   if (!missing(i)) {
@@ -2251,7 +2251,7 @@ tail.data.table = function(x, n=6L, ...) {
     if (is.matrix(i)) {
       if (!missing(j)) stopf("When i is a matrix in DT[i]<-value syntax, it doesn't make sense to provide j")
       x = `[<-.data.frame`(x, i, value=value)
-      return(setalloccol(x))
+      return(setalloccol(x, duplicateShared=FALSE))
     }
     i = x[i, which=TRUE]
     # Tried adding ... after value above, and passing ... in here (e.g. for mult="first") but R CMD check
@@ -2284,11 +2284,11 @@ tail.data.table = function(x, n=6L, ...) {
     reinstatekey=key(x)
   }
   if (!selfrefok(x) || truelength(x) < ncol(x)+length(newnames)) {
-    x = setalloccol(x, length(x)+length(newnames)) # because [<- copies via *tmp* and main/duplicate.c copies at length but copies truelength over too
+    x = setalloccol(x, length(x)+length(newnames), duplicateShared=FALSE) # because [<- copies via *tmp* and main/duplicate.c copies at length but copies truelength over too
     # search for one other .Call to assign in [.data.table to see how it differs
   }
   x = .Call(Cassign,copy(x),i,cols,newnames,value) # From 3.1.0, DF[2,"b"] = 7 no longer copies DF$a (so in this [<-.data.table method we need to copy)
-  setalloccol(x)  #  can maybe avoid this realloc, but this is (slow) [<- anyway, so just be safe.
+  setalloccol(x, duplicateShared=FALSE)  #  can maybe avoid this realloc, but this is (slow) [<- anyway, so just be safe.
   if (length(reinstatekey)) setkeyv(x,reinstatekey)
   invisible(x)
   # no copy at all if user calls directly; i.e. `[<-.data.table`(x,i,j,value)
@@ -2302,7 +2302,7 @@ tail.data.table = function(x, n=6L, ...) {
 "$<-.data.table" = function(x, name, value) {
   if (!cedta()) {
     ans = `$<-.data.frame`(x, name, value) # nocov
-    return(setalloccol(ans))           # nocov. over-allocate (again)
+    return(setalloccol(ans, duplicateShared=FALSE))           # nocov. over-allocate (again)
   }
   x = copy(x)
   set(x,j=name,value=value)  # important i is missing here
@@ -2359,7 +2359,7 @@ dimnames.data.table = function(x) {
     setattr(x,"names",NULL)   # e.g. plyr::melt() calls base::unname()
   else {
     setnames(x,value)
-    setalloccol(x)
+    setalloccol(x, duplicateShared=FALSE)
   }
   x   # this returned value is now shallow copied by R 3.1.0 via *tmp*. A very welcome change.
 }
@@ -2601,7 +2601,7 @@ copy = function(x) {
   reallocate = function(y) {
     if (is.data.table(y)) {
       .Call(C_unlock, y)
-      setalloccol(y)
+      setalloccol(y, duplicateShared=FALSE)
     } else if (is.list(y)) {
       oldClass = class(y)
       setattr(y, 'class', NULL)  # otherwise [[.person method (which returns itself) results in infinite recursion, #4620
@@ -2666,11 +2666,11 @@ shallow = function(x, cols=NULL) {
   ans
 }
 
-setalloccol = alloc.col = function(DT, n=getOption("datatable.alloccol"), verbose=getOption("datatable.verbose"))
+setalloccol = alloc.col = function(DT, n=getOption("datatable.alloccol"), verbose=getOption("datatable.verbose"), duplicateShared=TRUE)
 {
   name = substitute(DT)
   if (identical(name, quote(`*tmp*`))) stopf("setalloccol attempting to modify `*tmp*`")
-  ans = .Call(Calloccolwrapper, DT, eval(n), verbose)
+  ans = .Call(Calloccolwrapper, DT, eval(n), verbose, duplicateShared)
   if (is.name(name)) {
     name = as.character(name)
     assign(name,ans,parent.frame(),inherits=TRUE)
@@ -2875,7 +2875,7 @@ rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL, ignore.attr=F
   }
   ans = .Call(Crbindlist, l, use.names, fill, idcol, ignore.attr)
   if (!length(ans)) return(null.data.table())
-  setDT(ans)[]
+  setDT(ans, duplicateShared=FALSE)[]
 }
 
 vecseq = function(x,y,clamp) .Call(Cvecseq,x,y,clamp)
@@ -2953,7 +2953,7 @@ setDF = function(x, rownames=NULL) {
   invisible(x)
 }
 
-setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
+setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE, duplicateShared=TRUE) {
   name = substitute(x)
   if (is.name(name)) {
     home = function(x, env) {
@@ -2968,12 +2968,14 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
       stopf("Cannot convert '%1$s' to data.table by reference because binding is locked. It is very likely that '%1$s' resides within a package (or an environment) that is locked to prevent modifying its variable bindings. Try copying the object to your current environment, ex: var <- copy(var) and then using setDT again.", cname)
     }
   }
+  if (!isTRUEorFALSE(duplicateShared))
+    stopf("'%s' must be TRUE or FALSE", "duplicateShared")
   if (is.data.table(x)) {
     # fix for #1078 and #1128, see .resetclass() for explanation.
     setattr(x, 'class', .resetclass(x, 'data.table'))
     if (!missing(key)) setkeyv(x, key) # fix for #1169
     if (check.names) setattr(x, "names", make.names(names(x), unique=TRUE))
-    if (selfrefok(x) > 0L) return(invisible(x)) else setalloccol(x)
+    if (selfrefok(x) > 0L) return(invisible(x)) else setalloccol(x, duplicateShared=duplicateShared)
   } else if (is.data.frame(x)) {
     # check no matrix-like columns, #3760. Allow a single list(matrix) is unambiguous and depended on by some revdeps, #3581
     # for performance, only warn on the first such column, #5426
@@ -2988,8 +2990,9 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
     setattr(x, "row.names", .set_row_names(nrow(x)))
     if (check.names) setattr(x, "names", make.names(names(x), unique=TRUE))
     # fix for #1078 and #1128, see .resetclass() for explanation.
-    setattr(x, "class", .resetclass(x, 'data.frame'))
-    setalloccol(x)
+    setalloccol(x, duplicateShared=duplicateShared)
+    setattr(x, "class", .resetclass(x, 'data.frame')) # selfrek not ok after setattr class
+    setalloccol(x, duplicateShared=duplicateShared)
     if (!is.null(rn)) {
       nm = c(if (is.character(keep.rownames)) keep.rownames[1L] else "rn", names(x))
       x[, (nm[1L]) := rn]
@@ -3017,7 +3020,7 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
     }
     setattr(x, "row.names", .set_row_names(nrow))
     setattr(x, "class", c("data.table", "data.frame"))
-    setalloccol(x)
+    setalloccol(x, duplicateShared = duplicateShared)
   } else {
     stopf("Argument 'x' to 'setDT' should be a 'list', 'data.frame' or 'data.table'")
   }

@@ -162,7 +162,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
   if (any(vapply_1b(dat[varnames], is.list))) {
     stopf("Columns specified in formula can not be of type list")
   }
-  setDT(dat)
+  setDT(dat, duplicateShared=FALSE)
 
   m = as.list(match.call()[-1L])
   subset = m[["subset"]][[2L]]
@@ -214,7 +214,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
   lhs = shallow(dat, lhsnames); rhs = shallow(dat, rhsnames); val = shallow(dat, valnames)
   # handle drop=TRUE/FALSE - Update: Logic moved to R, AND faster than previous version. Take that... old me :-).
   if (all(drop)) {
-    map = setDT(lapply(list(lhsnames, rhsnames), function(cols) frankv(dat, cols=cols, ties.method="dense", na.last=FALSE))) # #2202 fix
+    map = setDT(lapply(list(lhsnames, rhsnames), function(cols) frankv(dat, cols=cols, ties.method="dense", na.last=FALSE)), duplicateShared=FALSE) # #2202 fix
     maporder = lapply(map, order_)
     mapunique = lapply(seq_along(map), function(i) .Call(CsubsetVector, map[[i]], maporder[[i]]))
     lhs = .Call(CsubsetDT, lhs, maporder[[1L]], seq_along(lhs))
@@ -225,7 +225,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
     map = vector("list", 2L)
     .Call(Csetlistelt, map, 1L, lhs_[lhs, which=TRUE])
     .Call(Csetlistelt, map, 2L, rhs_[rhs, which=TRUE])
-    setDT(map)
+    setDT(map, duplicateShared=FALSE)
     mapunique = vector("list", 2L)
     .Call(Csetlistelt, mapunique, 1L, seq_len(nrow(lhs_)))
     .Call(Csetlistelt, mapunique, 2L, seq_len(nrow(rhs_)))
@@ -245,7 +245,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
           else c(CJ(valnames, allcols, sorted=FALSE), sep=sep))
     # removed 'setcolorder()' here, #1153
   setattr(ans, 'names', c(lhsnames, allcols))
-  setDT(ans)
+  setDT(ans, duplicateShared=FALSE)
   setattr(ans, 'sorted', lhsnames)
   ans
 }
@@ -216,7 +216,7 @@ melt.data.table = function(data, id.vars, measure.vars, variable.name = "variabl
       as.logical(variable.factor), as.logical(value.factor),
       variable.name, value.name, as.logical(na.rm),
       as.logical(verbose))
-  setDT(ans)
+  setDT(ans, duplicateShared=FALSE)
   if (anyDuplicated(names(ans))) {
     catf("Duplicate column names found in molten data.table. Setting unique names using 'make.names'\n")
     setnames(ans, make.unique(names(ans)))

@@ -172,7 +172,7 @@ foverlaps = function(x, y, by.x=key(x) %||% key(y), by.y=key(y), maxgap=0L, mino
   }
   # nocov end
 
-  setDT(olaps)
+  setDT(olaps, duplicateShared=FALSE)
   setnames(olaps, c("xid", "yid"))
   yid = NULL  # for 'no visible binding for global variable' from R CMD check on i clauses below
 

@@ -5,6 +5,7 @@
     warningf("length(na.last) > 1, only the first element will be used")
     na.last = na.last[1L]
   }
+  input_names = NULL
   keep = (na.last == "keep")
   na.last = as.logical(na.last)
   as_list = function(x) {
@@ -16,6 +17,7 @@
     if (!missing(cols) && !is.null(cols))
       stopf("x is a single vector, non-NULL 'cols' doesn't make sense")
     cols = 1L
+    input_names = names(x) 
     x = as_list(x)
   } else {
     cols = colnamesInt(x, cols, check_dups=TRUE)
@@ -24,7 +26,7 @@
   }
   # need to unlock for #4429
   x = .shallow(x, cols, unlock = TRUE) # shallow copy even if list..
-  setDT(x)
+  setDT(x, duplicateShared=TRUE)
   cols = seq_along(cols)
   if (is.na(na.last)) {
     if ("..na_prefix.." %chin% names(x))
@@ -66,10 +68,15 @@
   # take care of na.last="keep"
   V1 = NULL # for R CMD CHECK warning
   if (isTRUE(keep)) {
-    ans = (setDT(as_list(ans))[which_(nas, TRUE), V1 := NA])[[1L]]
+    ans = (setDT(as_list(ans), duplicateShared=TRUE)[which_(nas, TRUE), V1 := NA])[[1L]]
   } else if (is.na(na.last)) {
-    ans = ans[which_(nas, FALSE)]
+    idx = which_(nas, FALSE)
+    if (!is.null(input_names))
+      input_names = input_names[idx]
+    ans = ans[idx]
   }
+  if (!is.null(input_names))
+    names(ans) = input_names
   ans
 }
 

@@ -301,7 +301,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")
 
   if (isTRUE(data.table)) {
     setattr(ans, "class", c("data.table", "data.frame"))
-    setalloccol(ans)
+    setalloccol(ans, duplicateShared=FALSE)
   } else {
     setattr(ans, "class", "data.frame")
   }

@@ -265,7 +265,7 @@ frollapply = function(X, N, FUN, ..., by.column=TRUE, fill=NA, align=c("right","
     } else {
       rev.d = function(d) {
         l = lapply(d, rev)
-        if (is.data.table(d)) setDT(l) else if (is.data.frame(d)) setDF(l) else l
+        if (is.data.table(d)) setDT(l, duplicateShared=FALSE) else if (is.data.frame(d)) setDF(l) else l
       }
       X = lapply(X, rev.d)
     }