Skip to content

Commit e7d1412

Browse files
committed
Fixed nested ':=' reference assignment fails (#6768)
*Followed TODO: by mattdowle from resolution to '2-space indentation #2420' *Added tests for jsub that modify DT by-reference *Added test case for interger vector indexing
1 parent b089b74 commit e7d1412

File tree

3 files changed

+94
-48
lines changed

3 files changed

+94
-48
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,8 @@ rowwiseDT(
133133
134134
19. An integer overflow in `fread()` with lines longer than `2^(31/2)` bytes is prevented, [#6729](https://github.com/Rdatatable/data.table/issues/6729). The typical impact was no worse than a wrong initial allocation size, corrected later. Thanks to @TaikiSan21 for the report and @aitap for the fix.
135135
136+
20. By reference assignments (':=') with functions that modify the data.table by reference e.g. (`foo=function(DT){DT[,b:=1L];return(2L)}`, `DT[,a:=foo(DT)]`) returned a mallformed data.table due to the the modification of the targeted named column ("a") index before and after the j expression evaluation. Thanks @AntonNM for the the report and fix.
137+
136138
## NOTES
137139
138140
1. There is a new vignette on joins! See `vignette("datatable-joins")`. Thanks to Angel Feliz for authoring it! Feedback welcome. This vignette has been highly requested since 2017: [#2181](https://github.com/Rdatatable/data.table/issues/2181).

R/data.table.R

Lines changed: 83 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1154,7 +1154,6 @@ replace_dot_alias = function(e) {
11541154
} else if (is.numeric(lhs)) {
11551155
m = as.integer(lhs)
11561156
if (any(m<1L | ncol(x)<m)) stopf("LHS of := appears to be column positions but are outside [1,ncol] range. New columns can only be added by name.")
1157-
lhs = names_x[m]
11581157
} else
11591158
stopf("LHS of := isn't column names ('character') or positions ('integer' or 'numeric')")
11601159
if (!anyNA(m)) {
@@ -1179,57 +1178,16 @@ replace_dot_alias = function(e) {
11791178
return(invisible(x))
11801179
}
11811180
} else {
1182-
# Adding new column(s). TO DO: move after the first eval in case the jsub has an error.
1181+
# Adding new column(s). Allocation for columns and recalculation of target cols moved after the jval = eval(jsub)
1182+
# in case of error or by-reference modifications to the DT
11831183
newnames=setdiff(lhs, names_x)
11841184
m[is.na(m)] = ncol(x)+seq_len(length(newnames))
11851185
cols = as.integer(m)
11861186
# don't pass verbose to selfrefok here -- only activated when
1187-
# ok=-1 which will trigger setalloccol with verbose in the next
1188-
# branch, which again calls _selfrefok and returns the message then
1187+
# ok=-1 which will trigger setalloccol with verbose after
1188+
# the jval = eval(jsub, ...)
11891189
if ((ok<-selfrefok(x, verbose=FALSE))==0L) # ok==0 so no warning when loaded from disk (-1) [-1 considered TRUE by R]
11901190
if (is.data.table(x)) warningf("A shallow copy of this data.table was taken so that := can add or remove %d columns by reference. At an earlier point, this data.table was copied by R (or was created manually using structure() or similar). Avoid names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and ?setattr. It's also not unusual for data.table-agnostic packages to produce tables affected by this issue. If this message doesn't help, please report your use case to the data.table issue tracker so the root cause can be fixed or this message improved.", length(newnames))
1191-
# !is.data.table for DF |> DT(,:=) tests 2212.16-19 (#5113) where a shallow copy is routine for data.frame
1192-
if ((ok<1L) || (truelength(x) < ncol(x)+length(newnames))) {
1193-
DT = x # in case getOption contains "ncol(DT)" as it used to. TODO: warn and then remove
1194-
n = length(newnames) + eval(getOption("datatable.alloccol")) # TODO: warn about expressions and then drop the eval()
1195-
# i.e. reallocate at the size as if the new columns were added followed by setalloccol().
1196-
name = substitute(x)
1197-
if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO) # ok here includes -1 (loaded from disk)
1198-
catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n)
1199-
# #1729 -- copying to the wrong environment here can cause some confusion
1200-
if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n")
1201-
1202-
# Verbosity should not issue warnings, so cat rather than warning.
1203-
# TO DO: Add option 'datatable.pedantic' to turn on warnings like this.
1204-
1205-
# TO DO ... comments moved up from C ...
1206-
# Note that the NAMED(dt)>1 doesn't work because .Call
1207-
# always sets to 2 (see R-ints), it seems. Work around
1208-
# may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too
1209-
# because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2.
1210-
# Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we
1211-
# don't mind.
1212-
}
1213-
setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope
1214-
if (is.name(name)) {
1215-
assign(as.character(name),x,parent.frame(),inherits=TRUE)
1216-
} else if (.is_simple_extraction(name)) { # TODO(#6702): use a helper here as the code is very similar to setDT().
1217-
k = eval(name[[2L]], parent.frame(), parent.frame())
1218-
if (is.list(k)) {
1219-
origj = j = if (name[[1L]] == "$") as.character(name[[3L]]) else eval(name[[3L]], parent.frame(), parent.frame())
1220-
if (is.character(j)) {
1221-
if (length(j)!=1L) stopf("Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but its length is %d", length(j))
1222-
j = match(j, names(k))
1223-
if (is.na(j)) internal_error("item '%s' not found in names of list", origj) # nocov
1224-
}
1225-
.Call(Csetlistelt,k,as.integer(j), x)
1226-
} else if (is.environment(k) && exists(as.character(name[[3L]]), k)) {
1227-
assign(as.character(name[[3L]]), x, k, inherits=FALSE)
1228-
} else if (isS4(k)) {
1229-
.Call(CsetS4elt, k, as.character(name[[3L]]), x)
1230-
}
1231-
} # TO DO: else if env$<- or list$<-
1232-
}
12331191
}
12341192
}
12351193
}
@@ -1402,6 +1360,85 @@ replace_dot_alias = function(e) {
14021360
}
14031361

14041362
if (!is.null(lhs)) {
1363+
# Re-matches characters names in the lhs after jval to account for jsub's that modify the columns of the data.table (#6768)
1364+
# Replaces numerical lhs with respective names_x
1365+
if(is.character(lhs)){
1366+
m = chmatch(lhs, names_x)
1367+
if(!anyNA(m)){
1368+
# updates by reference to existing columns
1369+
cols = as.integer(m)
1370+
newnames=NULL
1371+
if (identical(irows, integer())) {
1372+
# Empty integer() means no rows e.g. logical i with only FALSE and NA
1373+
# got converted to empty integer() by the which() above
1374+
# Short circuit and do-nothing since columns already exist. If some don't
1375+
# exist then for consistency with cases where irows is non-empty, we need to create
1376+
# them of the right type and populate with NA. Which will happen via the regular
1377+
# alternative branches below, to cover #759.
1378+
# We need this short circuit at all just for convenience. Otherwise users may need to
1379+
# fix errors in their RHS when called on empty edge cases, even when the result won't be
1380+
# used anyway (so it would be annoying to have to fix it.)
1381+
if (verbose) {
1382+
catf("No rows match i. No new columns to add so not evaluating RHS of :=\nAssigning to 0 row subset of %d rows\n", nrow(x))
1383+
}
1384+
.Call(Cassign, x, irows, NULL, NULL, NULL) # only purpose is to write 0 to .Last.updated
1385+
.global$print = address(x)
1386+
return(invisible(x))
1387+
}
1388+
}else{
1389+
# Adding new column(s).
1390+
newnames=setdiff(lhs, names_x)
1391+
m[is.na(m)] = ncol(x)+seq_len(length(newnames))
1392+
cols = as.integer(m)
1393+
# ok <- selfrefok above called without verbose -- only activated when
1394+
# ok=-1 which will trigger setalloccol with verbose in the next
1395+
# branch, which again calls _selfrefok and returns the message then
1396+
# !is.data.table for DF |> DT(,:=) tests 2212.16-19 (#5113) where a shallow copy is routine for data.frame
1397+
if ((ok<1L) || (truelength(x) < ncol(x)+length(newnames))) {
1398+
DT = x # in case getOption contains "ncol(DT)" as it used to. TODO: warn and then remove
1399+
n = length(newnames) + eval(getOption("datatable.alloccol")) # TODO: warn about expressions and then drop the eval()
1400+
# i.e. reallocate at the size as if the new columns were added followed by setalloccol().
1401+
name = substitute(x)
1402+
if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO) # ok here includes -1 (loaded from disk)
1403+
catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n)
1404+
# #1729 -- copying to the wrong environment here can cause some confusion
1405+
if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n")
1406+
1407+
# Verbosity should not issue warnings, so cat rather than warning.
1408+
# TO DO: Add option 'datatable.pedantic' to turn on warnings like this.
1409+
1410+
# TO DO ... comments moved up from C ...
1411+
# Note that the NAMED(dt)>1 doesn't work because .Call
1412+
# always sets to 2 (see R-ints), it seems. Work around
1413+
# may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too
1414+
# because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2.
1415+
# Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we
1416+
# don't mind.
1417+
}
1418+
setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope
1419+
if (is.name(name)) {
1420+
assign(as.character(name),x,parent.frame(),inherits=TRUE)
1421+
} else if (.is_simple_extraction(name)) { # TODO(#6702): use a helper here as the code is very similar to setDT().
1422+
k = eval(name[[2L]], parent.frame(), parent.frame())
1423+
if (is.list(k)) {
1424+
origj = j = if (name[[1L]] == "$") as.character(name[[3L]]) else eval(name[[3L]], parent.frame(), parent.frame())
1425+
if (is.character(j)) {
1426+
if (length(j)!=1L) stopf("Cannot assign to an under-allocated recursively indexed list -- L[[i]][,:=] syntax is only valid when i is length 1, but its length is %d", length(j))
1427+
j = match(j, names(k))
1428+
if (is.na(j)) internal_error("item '%s' not found in names of list", origj) # nocov
1429+
}
1430+
.Call(Csetlistelt,k,as.integer(j), x)
1431+
} else if (is.environment(k) && exists(as.character(name[[3L]]), k)) {
1432+
assign(as.character(name[[3L]]), x, k, inherits=FALSE)
1433+
} else if (isS4(k)) {
1434+
.Call(CsetS4elt, k, as.character(name[[3L]]), x)
1435+
}
1436+
} # TO DO: else if env$<- or list$<-
1437+
}
1438+
}
1439+
} else if (is.numeric(lhs)) {
1440+
lhs = names_x[m]
1441+
}
14051442
# TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above.
14061443
.Call(Cassign,x,irows,cols,newnames,jval)
14071444
return(suppPrint(x))

inst/tests/tests.Rraw

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1979,8 +1979,9 @@ test(632, merge(DT1,DT2,all=TRUE), data.table(a=c(1,2,3,4,5),total.x=c(2,NA,1,3,
19791979
test(632.1, merge(DT1,DT2,all=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all=TRUE)),a))
19801980

19811981
# Test that with=FALSE by number isn't messed up by dup column names, #2025
1982-
DT = data.table(a=1:3,a=4:6)
1983-
test(634, DT[,2:=200L], data.table(a=1:3,a=200L))
1982+
DT = data.table(a=1:3, a=4:6, a=7:9)
1983+
test(634, DT[,2:=200L], data.table(a=1:3, a=200L, a=7:9))
1984+
test(634.1, DT[,c(2, 3):=200L], data.table(a=1:3, a=200L, a=200L))
19841985

19851986
# Test names when not all items are named, #2029
19861987
DT = data.table(x=1:3,y=1:3)
@@ -21063,3 +21064,9 @@ test(2304.100, set(copy(DT), i=2L, j=c("L1", "L2"), value=list(list(NULL), list(
2106321064

2106421065
# the integer overflow in #6729 is only noticeable with UBSan
2106521066
test(2305, { fread(testDir("issue_6729.txt.bz2")); TRUE })
21067+
21068+
# j expressions that modify a data.table by reference, (#6768)
21069+
inner=function(dt){dt[,b:=4:6]}
21070+
outer=function(dt){inner(dt); return(7:9)}
21071+
DT = data.table(a=1:3)
21072+
test(2306, DT[,c:=outer(DT)], data.table(a=1:3, b=4:6, c=7:9))

0 commit comments

Comments
 (0)