Skip to content

Commit ba7a1b3

Browse files
aitap and ben-schwen authored
DT[,foo:=bar]: earlier check for selfrefok (#7502)
* DT[,foo:=bar]: earlier check for selfrefok

  The check needs to be there, not below, to detect non-selfrefok tables in by-group operations.

  Use static analysis to detect common forms of column deletion: `foo := NULL` and `.(bar, baz) := .(NULL, NULL)`. Static analysis is doomed to miss things like `frob := if (runif(1) < .5) 42 else NULL`, but hopefully it covers the needs of our reverse dependencies.

* add second setalloccol check

---------

Co-authored-by: Benjamin Schwendinger <[email protected]>
Co-authored-by: Benjamin Schwendinger <[email protected]>
1 parent 64e9ec1 commit ba7a1b3

File tree

2 files changed

+52
-39
lines changed

2 files changed

+52
-39
lines changed

R/data.table.R

Lines changed: 47 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1220,11 +1220,51 @@ replace_dot_alias = function(e) {
12201220
m[is.na(m)] = ncol(x)+seq_along(newnames)
12211221
cols = as.integer(m)
12221222
# don't pass verbose to selfrefok here -- only activated when
1223-
# ok=-1 which will trigger setalloccol with verbose after
1224-
# the jval = eval(jsub, ...)
1223+
# ok=-1 which will trigger setalloccol with verbose in the next branch
1224+
# if a change in the number of columns is suspected
12251225
if (ok==0L) # ok==0 so no warning when loaded from disk (-1) [-1 considered TRUE by R]
12261226
if (is.data.table(x)) warningf("A shallow copy of this data.table was taken so that := can add or remove %d columns by reference. At an earlier point, this data.table was copied by R (or was created manually using structure() or similar). Avoid names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and ?setattr. It's also not unusual for data.table-agnostic packages to produce tables affected by this issue. If this message doesn't help, please report your use case to the data.table issue tracker so the root cause can be fixed or this message improved.", length(newnames))
12271227
}
1228+
# ok <- selfrefok above called without verbose -- only activated when
1229+
# ok=-1 which will trigger setalloccol with verbose in the next
1230+
# branch, which again calls _selfrefok and returns the message then
1231+
# !is.data.table for DF |> DT(,:=) tests 2212.16-19 (#5113) where a shallow copy is routine for data.frame
1232+
if (
1233+
(
1234+
!is.null(newnames) || # adding new columns
1235+
is.null(jsub) || (jsub %iscall% "list" && any(vapply_1b(jsub[-1], is.null))) # removing columns
1236+
) && (
1237+
(ok<1L) || # unsafe to resize
1238+
(truelength(x) < ncol(x)+length(newnames)) # not enough space for new columns
1239+
)
1240+
) {
1241+
DT = x # in case getOption contains "ncol(DT)" as it used to. TODO: warn and then remove
1242+
n = length(newnames) + eval(getOption("datatable.alloccol")) # TODO: warn about expressions and then drop the eval()
1243+
# i.e. reallocate at the size as if the new columns were added followed by setalloccol().
1244+
name = substitute(x)
1245+
if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO) # ok here includes -1 (loaded from disk)
1246+
catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n)
1247+
# #1729 -- copying to the wrong environment here can cause some confusion
1248+
if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n")
1249+
1250+
# Verbosity should not issue warnings, so cat rather than warning.
1251+
# TO DO: Add option 'datatable.pedantic' to turn on warnings like this.
1252+
1253+
# TO DO ... comments moved up from C ...
1254+
# Note that the NAMED(dt)>1 doesn't work because .Call
1255+
# always sets to 2 (see R-ints), it seems. Work around
1256+
# may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too
1257+
# because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2.
1258+
# Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we
1259+
# don't mind.
1260+
}
1261+
setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope
1262+
if (is.name(name)) {
1263+
assign(as.character(name),x,parent.frame(),inherits=TRUE)
1264+
} else if (.is_simple_extraction(name)) {
1265+
.reassign_extracted_table(name, x)
1266+
} # TO DO: else if env$<- or list$<-
1267+
}
12281268
}
12291269
}
12301270

@@ -1400,45 +1440,13 @@ replace_dot_alias = function(e) {
14001440
} else if (is.numeric(lhs)) {
14011441
lhs = names_x[m]
14021442
}
1403-
# ok <- selfrefok above called without verbose -- only activated when
1404-
# ok=-1 which will trigger setalloccol with verbose in the next
1405-
# branch, which again calls _selfrefok and returns the message then
1406-
# !is.data.table for DF |> DT(,:=) tests 2212.16-19 (#5113) where a shallow copy is routine for data.frame
1407-
if (
1408-
(
1409-
!is.null(newnames) || # adding new columns
1410-
is.null(jval) || (is.list(jval) && any(vapply_1b(jval, is.null))) # removing columns
1411-
) && (
1412-
(ok<1L) || # unsafe to resize
1413-
(truelength(x) < ncol(x)+length(newnames)) # not enough space for new columns
1414-
)
1415-
) {
1416-
DT = x # in case getOption contains "ncol(DT)" as it used to. TODO: warn and then remove
1417-
n = length(newnames) + eval(getOption("datatable.alloccol")) # TODO: warn about expressions and then drop the eval()
1418-
# i.e. reallocate at the size as if the new columns were added followed by setalloccol().
1443+
# cater for deleting columns by assigning NULL
1444+
if ((is.null(jval) || (is.list(jval) && any(vapply_1b(jval, is.null)))) && selfrefok(x, verbose=FALSE) < 1L) {
14191445
name = substitute(x)
1420-
if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO) # ok here includes -1 (loaded from disk)
1421-
catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n)
1422-
# #1729 -- copying to the wrong environment here can cause some confusion
1423-
if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n")
1424-
1425-
# Verbosity should not issue warnings, so cat rather than warning.
1426-
# TO DO: Add option 'datatable.pedantic' to turn on warnings like this.
1427-
1428-
# TO DO ... comments moved up from C ...
1429-
# Note that the NAMED(dt)>1 doesn't work because .Call
1430-
# always sets to 2 (see R-ints), it seems. Work around
1431-
# may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too
1432-
# because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2.
1433-
# Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we
1434-
# don't mind.
1435-
}
1436-
setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope
1446+
setalloccol(x, verbose=FALSE)
14371447
if (is.name(name)) {
1438-
assign(as.character(name),x,parent.frame(),inherits=TRUE)
1439-
} else if (.is_simple_extraction(name)) {
1440-
.reassign_extracted_table(name, x)
1441-
} # TO DO: else if env$<- or list$<-
1448+
assign(as.character(name), x, parent.frame(), inherits=TRUE)
1449+
}
14421450
}
14431451
# TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above.
14441452
.Call(Cassign,x,irows,cols,newnames,jval)

inst/tests/tests.Rraw

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21916,3 +21916,8 @@ DT = suppressMessages(rbindlist(DTl)) # used to crash
2191621916
test(2352.1, dim(DT), c(300002L, 1L))
2191721917
test(2352.2, DT[[1]], rep(42, nrow(DT)))
2191821918
rm(DTn, DTl, DT)
21919+
21920+
# insert columns by group in freshly unserialized data.tables, #7498
21921+
DT = unserialize(serialize(as.data.table(mtcars), NULL))
21922+
test(2353, DT[,foo:=mean(mpg),by=cyl], as.data.table(mtcars)[,foo:=mean(mpg),by=cyl])
21923+
rm(DT)

0 commit comments

Comments
 (0)