diff --git a/NEWS.md b/NEWS.md index 994b54be48..c14705fe30 100644 --- a/NEWS.md +++ b/NEWS.md @@ -550,6 +550,8 @@ rowwiseDT( 22. `fread()` could fail to read Mac CSV files (with `\r` line endings) if the file contained any `\n` character, such as a final `\r\n`. This was fixed by detecting the predominant line ending in a sample of the file, [#4186](https://github.com/Rdatatable/data.table/issues/4186). Thanks to @MPagel for the report and @ben-schwen for the fix. +23. By reference assignments (':=') with functions that modified the data.table by reference e.g. (`foo=function(DT){modify(DT);return(1L)}`, `DT[,a:=foo(DT)]`) returned a malformed data.table due to the modification of the targeted named column index ("a") during the j expression evaluation [#6768](https://github.com/Rdatatable/data.table/issues/6768). Thanks @AntonNM for the report and fix. + ### NOTES 1. There is a new vignette on joins! See `vignette("datatable-joins")`. Thanks to Angel Feliz for authoring it! Feedback welcome. This vignette has been highly requested since 2017: [#2181](https://github.com/Rdatatable/data.table/issues/2181). diff --git a/R/data.table.R b/R/data.table.R index 573399be44..439f8a18fc 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1189,7 +1189,6 @@ replace_dot_alias = function(e) { } else if (is.numeric(lhs)) { m = as.integer(lhs) if (any(m<1L | ncol(x) DT(,:=) tests 2212.16-19 (#5113) where a shallow copy is routine for data.frame - if ((ok<1L) || (truelength(x) < ncol(x)+length(newnames))) { - DT = x # in case getOption contains "ncol(DT)" as it used to. TODO: warn and then remove - n = length(newnames) + eval(getOption("datatable.alloccol")) # TODO: warn about expressions and then drop the eval() - # i.e. reallocate at the size as if the new columns were added followed by setalloccol(). - name = substitute(x) - if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO) # ok here includes -1 (loaded from disk) - catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n) - # #1729 -- copying to the wrong environment here can cause some confusion - if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n") - - # Verbosity should not issue warnings, so cat rather than warning. - # TO DO: Add option 'datatable.pedantic' to turn on warnings like this. - - # TO DO ... comments moved up from C ... - # Note that the NAMED(dt)>1 doesn't work because .Call - # always sets to 2 (see R-ints), it seems. Work around - # may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too - # because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2. - # Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we - # don't mind. - } - setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope - if (is.name(name)) { - assign(as.character(name),x,parent.frame(),inherits=TRUE) - } else if (.is_simple_extraction(name)) { - .reassign_extracted_table(name, x) - } # TO DO: else if env$<- or list$<- - } } } } @@ -1411,6 +1382,55 @@ replace_dot_alias = function(e) { } if (!is.null(lhs)) { + # Re-matches characters names in the lhs after jval to account for jsub's that modify the columns of the data.table (#6768) + # Replaces numerical lhs with respective names_x + if(is.character(lhs)){ + m = chmatch(lhs, names_x) + if(!anyNA(m)) { + # updates by reference to existing columns + cols = as.integer(m) + newnames = NULL + } else { + # Adding new column(s). + newnames = setdiff(lhs, names_x) + m[is.na(m)] = ncol(x) + seq_along(newnames) + cols = as.integer(m) + # ok <- selfrefok above called without verbose -- only activated when + # ok=-1 which will trigger setalloccol with verbose in the next + # branch, which again calls _selfrefok and returns the message then + # !is.data.table for DF |> DT(,:=) tests 2212.16-19 (#5113) where a shallow copy is routine for data.frame + if ((ok<1L) || (truelength(x) < ncol(x)+length(newnames))) { + DT = x # in case getOption contains "ncol(DT)" as it used to. TODO: warn and then remove + n = length(newnames) + eval(getOption("datatable.alloccol")) # TODO: warn about expressions and then drop the eval() + # i.e. reallocate at the size as if the new columns were added followed by setalloccol(). + name = substitute(x) + if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO) # ok here includes -1 (loaded from disk) + catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n) + # #1729 -- copying to the wrong environment here can cause some confusion + if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n") + + # Verbosity should not issue warnings, so cat rather than warning. + # TO DO: Add option 'datatable.pedantic' to turn on warnings like this. + + # TO DO ... comments moved up from C ... + # Note that the NAMED(dt)>1 doesn't work because .Call + # always sets to 2 (see R-ints), it seems. Work around + # may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too + # because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2. + # Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we + # don't mind. + } + setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope + if (is.name(name)) { + assign(as.character(name),x,parent.frame(),inherits=TRUE) + } else if (.is_simple_extraction(name)) { + .reassign_extracted_table(name, x) + } # TO DO: else if env$<- or list$<- + } + } + } else if (is.numeric(lhs)) { + lhs = names_x[m] + } # TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above. .Call(Cassign,x,irows,cols,newnames,jval) return(suppPrint(x)) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4e7165a84a..dffb351067 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -1960,8 +1960,9 @@ test(632, merge(DT1,DT2,all=TRUE), data.table(a=c(1,2,3,4,5),total.x=c(2,NA,1,3, test(632.1, merge(DT1,DT2,all=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all=TRUE)),a)) # Test that with=FALSE by number isn't messed up by dup column names, #2025 -DT = data.table(a=1:3,a=4:6) -test(634, DT[,2:=200L], data.table(a=1:3,a=200L)) +DT = data.table(a=1:3, a=4:6, a=7:9) +test(634, DT[,2:=200L], data.table(a=1:3, a=200L, a=7:9)) +test(634.1, DT[,c(2, 3):=200L], data.table(a=1:3, a=200L, a=200L)) # Test names when not all items are named, #2029 DT = data.table(x=1:3,y=1:3) @@ -21876,3 +21877,11 @@ test(2347, DT[i, .(result = all(is.na(grp) == is.na(a))), by = grp][,all(result) DT = data.table(a = as.Date("2010-01-01"), b = 1L) test(2348.1, tryCatch(DT[a == as.Date("20100101")], error=conditionCall)[[1L]], quote(charToDate)) test(2348.2, tryCatch(DT[a == as.Date("20100101") | b == 2L], error=conditionCall)[[1L]], quote(charToDate)) + +# j expressions that modify a data.table by reference, (#6768) +inner = function(dt) dt[,b:=4:6] +outer = function(dt) { inner(dt); return(7:9) } +foo = function(dt) { dt[,b:=4:6]; return(7:9) } +DT = data.table(a=1:3) +test(2349, DT[,c:=outer(DT)], data.table(a=1:3, b=4:6, c=7:9)) +test(2349.1, DT[,c:=foo(DT)], data.table(a=1:3, b=4:6, c=7:9))