Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@

5. Negative and missing values of `n` argument of adaptive rolling functions trigger an error.

6. `setDT()` and `setalloccol()` gain `duplicateShared` argument (default `TRUE`). When `TRUE`, columns that are shared with other objects are duplicated to avoid unintended modification of the original data, [#2683](https://github.com/Rdatatable/data.table/issues/2683). Previously, shared columns were not duplicated, which could lead to unexpected side effects. `frank()` now uses this internally to preserve names from the input vector and avoid modifying shared vectors, [#4240](https://github.com/Rdatatable/data.table/issues/4240). Thanks to @jangorecki, @BenoitLondon, and @MichaelChirico for the report and @ben-schwen for the fix.

### NOTICE OF INTENDED FUTURE POTENTIAL BREAKING CHANGES

1. `data.table(x=1, <expr>)`, where `<expr>` is an expression resulting in a 1-column matrix without column names, will eventually have names `x` and `V2`, not `x` and `V1`, consistent with `data.table(x=1, <expr>)` where `<expr>` results in an atomic vector, for example `data.table(x=1, cbind(1))` and `data.table(x=1, 1)` will both have columns named `x` and `V2`. In this release, the matrix case continues to be named `V1`, but the new behavior can be activated by setting `options(datatable.old.matrix.autoname)` to `FALSE`. See point 5 under Bug Fixes for more context; this change will provide more internal consistency as well as more consistency with `data.frame()`.
Expand Down
6 changes: 3 additions & 3 deletions R/as.data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ as.data.table.matrix = function(x, keep.rownames=FALSE, key=NULL, ...) {
for (i in ic) value[[i]] = as.vector(x[, i]) # to drop any row.names that would otherwise be retained inside every column of the data.table
}
col_labels = dimnames(x)[[2L]]
setDT(value)
setDT(value, duplicateShared=FALSE)
if (length(col_labels) == ncols) {
if (any(empty <- !nzchar(col_labels)))
col_labels[empty] = paste0("V", ic[empty])
Expand Down Expand Up @@ -233,7 +233,7 @@ as.data.table.list = function(x,
}
}
setattr(ans, "names", vnames)
setDT(ans, key=key) # copy ensured above; also, setDT handles naming
setDT(ans, key=key, duplicateShared=FALSE) # copy ensured above; also, setDT handles naming
if (length(origListNames)==length(ans)) setattr(ans, "names", origListNames) # PR 3854 and tests 2058.15-17
ans
}
Expand Down Expand Up @@ -282,7 +282,7 @@ as.data.table.data.frame = function(x, keep.rownames=FALSE, key=NULL, ...) {

# fix for #1078 and #1128, see .resetclass() for explanation.
setattr(ans, "class", .resetclass(x, "data.frame"))
setalloccol(ans)
setalloccol(ans, duplicateShared=FALSE)
setkeyv(ans, key)
ans
}
Expand Down
6 changes: 3 additions & 3 deletions R/between.R
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ between = function(x, lower, upper, incbounds=TRUE, NAbounds=TRUE, check=FALSE,
# issue FR #707
# is x[i] found anywhere within [lower, upper] range?
inrange = function(x,lower,upper,incbounds=TRUE) {
query = setDT(list(x=x))
subject = setDT(list(l=lower, u=upper))
query = setDT(list(x=x), duplicateShared=FALSE)
subject = setDT(list(l=lower, u=upper), duplicateShared=FALSE)
ops = if (incbounds) c(4L, 2L) else c(5L, 3L) # >=,<= and >,<
verbose = isTRUE(getOption("datatable.verbose"))
if (verbose) {last.started.at=proc.time();catf("forderv(query) took ... ");flush.console()}
Expand All @@ -81,7 +81,7 @@ inrange = function(x,lower,upper,incbounds=TRUE) {
)
xo = ans$xo
options(datatable.verbose=FALSE)
setDT(ans[c("starts", "lens")], key=c("starts", "lens"))
setDT(ans[c("starts", "lens")], key=c("starts", "lens"), duplicateShared=FALSE)
options(datatable.verbose=verbose)
if (verbose) {last.started.at=proc.time();catf("Generating final logical vector ... ");flush.console()}
.Call(Cinrange, idx <- vector("logical", length(x)), xo, ans[["starts"]], ans[["lens"]])
Expand Down
49 changes: 26 additions & 23 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ null.data.table = function() {
ans = list()
setattr(ans,"class",c("data.table","data.frame"))
setattr(ans,"row.names",.set_row_names(0L))
setalloccol(ans)
setalloccol(ans, duplicateShared=FALSE)
}

data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE)
Expand Down Expand Up @@ -73,7 +73,7 @@ data.table = function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str
for (j in which(vapply_1b(ans, is.character))) set(ans, NULL, j, as_factor(.subset2(ans, j)))
# as_factor is internal function in fread.R currently
}
setalloccol(ans) # returns a NAMED==0 object, unlike data.frame()
setalloccol(ans, duplicateShared=FALSE) # returns a NAMED==0 object, unlike data.frame()
}

replace_dot_alias = function(e) {
Expand Down Expand Up @@ -523,7 +523,7 @@ replace_dot_alias = function(e) {
xo = ans$xo ## to make it available for further use.
# temp fix for issue spotted by Jan, test #1653.1. TODO: avoid this
# 'setorder', as there's another 'setorder' in generating 'irows' below...
if (length(ans$indices)) setorder(setDT(ans[1L:3L]), indices)
if (length(ans$indices)) setorder(setDT(ans[1L:3L], duplicateShared=FALSE), indices)
allLen1 = ans$allLen1
f__ = ans$starts
len__ = ans$lens
Expand Down Expand Up @@ -575,7 +575,7 @@ replace_dot_alias = function(e) {
}
if (nqbyjoin) {
irows = if (length(xo)) xo[irows] else irows
xo = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]]
xo = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows), duplicateShared=FALSE))[["irows"]]
ans = .Call(CnqRecreateIndices, xo, len__, indices__, max(indices__), nomatch) # issue#4388 fix
f__ = ans[[1L]]; len__ = ans[[2L]]
allLen1 = FALSE # TODO; should this always be FALSE?
Expand All @@ -594,7 +594,7 @@ replace_dot_alias = function(e) {
irows = xo[irows] # TO DO: fsort here?
if (mult=="all" && !allGrp1) { # following #1991 fix, !allGrp1 will always be TRUE. TODO: revisit.
if (verbose) {last.started.at=proc.time();catf("Reorder irows for 'mult==\"all\" && !allGrp1' ... ");flush.console()}
irows = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows)))[["irows"]]
irows = setorder(setDT(list(indices=rep.int(indices__, len__), irows=irows), duplicateShared=FALSE))[["irows"]]
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
}
}
Expand Down Expand Up @@ -1245,7 +1245,7 @@ replace_dot_alias = function(e) {
# Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we
# don't mind.
}
setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope
setalloccol(x, n, verbose=verbose, duplicateShared=FALSE) # always assigns to calling scope; i.e. this scope
if (is.name(name)) {
assign(as.character(name),x,parent.frame(),inherits=TRUE)
} else if (.is_simple_extraction(name)) {
Expand Down Expand Up @@ -1352,7 +1352,7 @@ replace_dot_alias = function(e) {
setattr(ans, "sorted", .join_result_key(x, i, ans, if (!missing(on)) names(on), ansvars, leftcols, rightcols, names_i, irows, roll))
setattr(ans, "class", class(x)) # retain class that inherits from data.table, #64
setattr(ans, "row.names", .set_row_names(length(ans[[1L]])))
setalloccol(ans)
setalloccol(ans, duplicateShared=FALSE)
}
if (!with || missing(j)) return(ans)
if (!is.data.table(ans)) setattr(ans, "class", c("data.table","data.frame")) # DF |> DT(,.SD[...]) .SD should be data.table, test 2212.013
Expand Down Expand Up @@ -2018,7 +2018,7 @@ replace_dot_alias = function(e) {
} else if (.by_result_is_keyable(x, keyby, bysameorder, byjoin, allbyvars, bysub)) {
setattr(ans, "sorted", names(ans)[seq_along(grpcols)])
}
setalloccol(ans) # TODO: overallocate in dogroups in the first place and remove this line
setalloccol(ans, duplicateShared=FALSE) # TODO: overallocate in dogroups in the first place and remove this line
}

# can the specified merge of x and i be marked as sorted? return the columns for which this is true, otherwise NULL
Expand Down Expand Up @@ -2242,7 +2242,7 @@ tail.data.table = function(x, n=6L, ...) {
if (!cedta()) {
x = if (nargs()<4L) `[<-.data.frame`(x, i, value=value)
else `[<-.data.frame`(x, i, j, value)
return(setalloccol(x)) # over-allocate (again). Avoid all this by using :=.
return(setalloccol(x, duplicateShared=FALSE)) # over-allocate (again). Avoid all this by using :=.
}
# TO DO: warningf("Please use DT[i,j:=value] syntax instead of DT[i,j]<-value, for efficiency. See ?':='")
if (!missing(i)) {
Expand All @@ -2251,7 +2251,7 @@ tail.data.table = function(x, n=6L, ...) {
if (is.matrix(i)) {
if (!missing(j)) stopf("When i is a matrix in DT[i]<-value syntax, it doesn't make sense to provide j")
x = `[<-.data.frame`(x, i, value=value)
return(setalloccol(x))
return(setalloccol(x, duplicateShared=FALSE))
}
i = x[i, which=TRUE]
# Tried adding ... after value above, and passing ... in here (e.g. for mult="first") but R CMD check
Expand Down Expand Up @@ -2284,11 +2284,11 @@ tail.data.table = function(x, n=6L, ...) {
reinstatekey=key(x)
}
if (!selfrefok(x) || truelength(x) < ncol(x)+length(newnames)) {
x = setalloccol(x, length(x)+length(newnames)) # because [<- copies via *tmp* and main/duplicate.c copies at length but copies truelength over too
x = setalloccol(x, length(x)+length(newnames), duplicateShared=FALSE) # because [<- copies via *tmp* and main/duplicate.c copies at length but copies truelength over too
# search for one other .Call to assign in [.data.table to see how it differs
}
x = .Call(Cassign,copy(x),i,cols,newnames,value) # From 3.1.0, DF[2,"b"] = 7 no longer copies DF$a (so in this [<-.data.table method we need to copy)
setalloccol(x) # can maybe avoid this realloc, but this is (slow) [<- anyway, so just be safe.
setalloccol(x, duplicateShared=FALSE) # can maybe avoid this realloc, but this is (slow) [<- anyway, so just be safe.
if (length(reinstatekey)) setkeyv(x,reinstatekey)
invisible(x)
# no copy at all if user calls directly; i.e. `[<-.data.table`(x,i,j,value)
Expand All @@ -2302,7 +2302,7 @@ tail.data.table = function(x, n=6L, ...) {
"$<-.data.table" = function(x, name, value) {
if (!cedta()) {
ans = `$<-.data.frame`(x, name, value) # nocov
return(setalloccol(ans)) # nocov. over-allocate (again)
return(setalloccol(ans, duplicateShared=FALSE)) # nocov. over-allocate (again)
}
x = copy(x)
set(x,j=name,value=value) # important i is missing here
Expand Down Expand Up @@ -2359,7 +2359,7 @@ dimnames.data.table = function(x) {
setattr(x,"names",NULL) # e.g. plyr::melt() calls base::unname()
else {
setnames(x,value)
setalloccol(x)
setalloccol(x, duplicateShared=FALSE)
}
x # this returned value is now shallow copied by R 3.1.0 via *tmp*. A very welcome change.
}
Expand Down Expand Up @@ -2601,7 +2601,7 @@ copy = function(x) {
reallocate = function(y) {
if (is.data.table(y)) {
.Call(C_unlock, y)
setalloccol(y)
setalloccol(y, duplicateShared=FALSE)
} else if (is.list(y)) {
oldClass = class(y)
setattr(y, 'class', NULL) # otherwise [[.person method (which returns itself) results in infinite recursion, #4620
Expand Down Expand Up @@ -2666,11 +2666,11 @@ shallow = function(x, cols=NULL) {
ans
}

setalloccol = alloc.col = function(DT, n=getOption("datatable.alloccol"), verbose=getOption("datatable.verbose"))
setalloccol = alloc.col = function(DT, n=getOption("datatable.alloccol"), verbose=getOption("datatable.verbose"), duplicateShared=TRUE)
{
name = substitute(DT)
if (identical(name, quote(`*tmp*`))) stopf("setalloccol attempting to modify `*tmp*`")
ans = .Call(Calloccolwrapper, DT, eval(n), verbose)
ans = .Call(Calloccolwrapper, DT, eval(n), verbose, duplicateShared)
if (is.name(name)) {
name = as.character(name)
assign(name,ans,parent.frame(),inherits=TRUE)
Expand Down Expand Up @@ -2875,7 +2875,7 @@ rbindlist = function(l, use.names="check", fill=FALSE, idcol=NULL, ignore.attr=F
}
ans = .Call(Crbindlist, l, use.names, fill, idcol, ignore.attr)
if (!length(ans)) return(null.data.table())
setDT(ans)[]
setDT(ans, duplicateShared=FALSE)[]
}

vecseq = function(x,y,clamp) .Call(Cvecseq,x,y,clamp)
Expand Down Expand Up @@ -2953,7 +2953,7 @@ setDF = function(x, rownames=NULL) {
invisible(x)
}

setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE, duplicateShared=TRUE) {
Copy link
Member

@jangorecki jangorecki Oct 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would keep naming style consistent here by naming duplicate.shared, or maybe just shared which would be the opposite of what current arg means, so then logic needs !

name = substitute(x)
if (is.name(name)) {
home = function(x, env) {
Expand All @@ -2968,12 +2968,14 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
stopf("Cannot convert '%1$s' to data.table by reference because binding is locked. It is very likely that '%1$s' resides within a package (or an environment) that is locked to prevent modifying its variable bindings. Try copying the object to your current environment, ex: var <- copy(var) and then using setDT again.", cname)
}
}
if (!isTRUEorFALSE(duplicateShared))
stopf("'%s' must be TRUE or FALSE", "duplicateShared")
if (is.data.table(x)) {
# fix for #1078 and #1128, see .resetclass() for explanation.
setattr(x, 'class', .resetclass(x, 'data.table'))
if (!missing(key)) setkeyv(x, key) # fix for #1169
if (check.names) setattr(x, "names", make.names(names(x), unique=TRUE))
if (selfrefok(x) > 0L) return(invisible(x)) else setalloccol(x)
if (selfrefok(x) > 0L) return(invisible(x)) else setalloccol(x, duplicateShared=duplicateShared)
} else if (is.data.frame(x)) {
# check no matrix-like columns, #3760. Allow a single list(matrix) is unambiguous and depended on by some revdeps, #3581
# for performance, only warn on the first such column, #5426
Expand All @@ -2988,8 +2990,9 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
setattr(x, "row.names", .set_row_names(nrow(x)))
if (check.names) setattr(x, "names", make.names(names(x), unique=TRUE))
# fix for #1078 and #1128, see .resetclass() for explanation.
setattr(x, "class", .resetclass(x, 'data.frame'))
setalloccol(x)
setalloccol(x, duplicateShared=duplicateShared)
setattr(x, "class", .resetclass(x, 'data.frame')) # selfrek not ok after setattr class
setalloccol(x, duplicateShared=duplicateShared)
if (!is.null(rn)) {
nm = c(if (is.character(keep.rownames)) keep.rownames[1L] else "rn", names(x))
x[, (nm[1L]) := rn]
Expand Down Expand Up @@ -3017,7 +3020,7 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
}
setattr(x, "row.names", .set_row_names(nrow))
setattr(x, "class", c("data.table", "data.frame"))
setalloccol(x)
setalloccol(x, duplicateShared = duplicateShared)
} else {
stopf("Argument 'x' to 'setDT' should be a 'list', 'data.frame' or 'data.table'")
}
Expand Down
8 changes: 4 additions & 4 deletions R/fcast.R
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
if (any(vapply_1b(dat[varnames], is.list))) {
stopf("Columns specified in formula can not be of type list")
}
setDT(dat)
setDT(dat, duplicateShared=FALSE)

m = as.list(match.call()[-1L])
subset = m[["subset"]][[2L]]
Expand Down Expand Up @@ -214,7 +214,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
lhs = shallow(dat, lhsnames); rhs = shallow(dat, rhsnames); val = shallow(dat, valnames)
# handle drop=TRUE/FALSE - Update: Logic moved to R, AND faster than previous version. Take that... old me :-).
if (all(drop)) {
map = setDT(lapply(list(lhsnames, rhsnames), function(cols) frankv(dat, cols=cols, ties.method="dense", na.last=FALSE))) # #2202 fix
map = setDT(lapply(list(lhsnames, rhsnames), function(cols) frankv(dat, cols=cols, ties.method="dense", na.last=FALSE)), duplicateShared=FALSE) # #2202 fix
maporder = lapply(map, order_)
mapunique = lapply(seq_along(map), function(i) .Call(CsubsetVector, map[[i]], maporder[[i]]))
lhs = .Call(CsubsetDT, lhs, maporder[[1L]], seq_along(lhs))
Expand All @@ -225,7 +225,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
map = vector("list", 2L)
.Call(Csetlistelt, map, 1L, lhs_[lhs, which=TRUE])
.Call(Csetlistelt, map, 2L, rhs_[rhs, which=TRUE])
setDT(map)
setDT(map, duplicateShared=FALSE)
mapunique = vector("list", 2L)
.Call(Csetlistelt, mapunique, 1L, seq_len(nrow(lhs_)))
.Call(Csetlistelt, mapunique, 2L, seq_len(nrow(rhs_)))
Expand All @@ -245,7 +245,7 @@ dcast.data.table = function(data, formula, fun.aggregate = NULL, sep = "_", ...,
else c(CJ(valnames, allcols, sorted=FALSE), sep=sep))
# removed 'setcolorder()' here, #1153
setattr(ans, 'names', c(lhsnames, allcols))
setDT(ans)
setDT(ans, duplicateShared=FALSE)
setattr(ans, 'sorted', lhsnames)
ans
}
2 changes: 1 addition & 1 deletion R/fmelt.R
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ melt.data.table = function(data, id.vars, measure.vars, variable.name = "variabl
as.logical(variable.factor), as.logical(value.factor),
variable.name, value.name, as.logical(na.rm),
as.logical(verbose))
setDT(ans)
setDT(ans, duplicateShared=FALSE)
if (anyDuplicated(names(ans))) {
catf("Duplicate column names found in molten data.table. Setting unique names using 'make.names'\n")
setnames(ans, make.unique(names(ans)))
Expand Down
2 changes: 1 addition & 1 deletion R/foverlaps.R
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ foverlaps = function(x, y, by.x=key(x) %||% key(y), by.y=key(y), maxgap=0L, mino
}
# nocov end

setDT(olaps)
setDT(olaps, duplicateShared=FALSE)
setnames(olaps, c("xid", "yid"))
yid = NULL # for 'no visible binding for global variable' from R CMD check on i clauses below

Expand Down
13 changes: 10 additions & 3 deletions R/frank.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
warningf("length(na.last) > 1, only the first element will be used")
na.last = na.last[1L]
}
input_names = NULL
keep = (na.last == "keep")
na.last = as.logical(na.last)
as_list = function(x) {
Expand All @@ -16,6 +17,7 @@
if (!missing(cols) && !is.null(cols))
stopf("x is a single vector, non-NULL 'cols' doesn't make sense")
cols = 1L
input_names = names(x)

Check warning on line 20 in R/frank.R

View workflow job for this annotation

GitHub Actions / lint-r

file=R/frank.R,line=20,col=27,[trailing_whitespace_linter] Remove trailing whitespace.
x = as_list(x)
} else {
cols = colnamesInt(x, cols, check_dups=TRUE)
Expand All @@ -24,7 +26,7 @@
}
# need to unlock for #4429
x = .shallow(x, cols, unlock = TRUE) # shallow copy even if list..
setDT(x)
setDT(x, duplicateShared=TRUE)
cols = seq_along(cols)
if (is.na(na.last)) {
if ("..na_prefix.." %chin% names(x))
Expand Down Expand Up @@ -66,10 +68,15 @@
# take care of na.last="keep"
V1 = NULL # for R CMD CHECK warning
if (isTRUE(keep)) {
ans = (setDT(as_list(ans))[which_(nas, TRUE), V1 := NA])[[1L]]
ans = (setDT(as_list(ans), duplicateShared=TRUE)[which_(nas, TRUE), V1 := NA])[[1L]]
} else if (is.na(na.last)) {
ans = ans[which_(nas, FALSE)]
idx = which_(nas, FALSE)
if (!is.null(input_names))
input_names = input_names[idx]
ans = ans[idx]
}
if (!is.null(input_names))
names(ans) = input_names
ans
}

Expand Down
2 changes: 1 addition & 1 deletion R/fread.R
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ yaml=FALSE, tmpdir=tempdir(), tz="UTC")

if (isTRUE(data.table)) {
setattr(ans, "class", c("data.table", "data.frame"))
setalloccol(ans)
setalloccol(ans, duplicateShared=FALSE)
} else {
setattr(ans, "class", "data.frame")
}
Expand Down
2 changes: 1 addition & 1 deletion R/frollapply.R
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ frollapply = function(X, N, FUN, ..., by.column=TRUE, fill=NA, align=c("right","
} else {
rev.d = function(d) {
l = lapply(d, rev)
if (is.data.table(d)) setDT(l) else if (is.data.frame(d)) setDF(l) else l
if (is.data.table(d)) setDT(l, duplicateShared=FALSE) else if (is.data.frame(d)) setDF(l) else l
}
X = lapply(X, rev.d)
}
Expand Down
Loading
Loading