Skip to content

Commit 1c025c0

Browse files
committed
Merge branch 'master' into fix-7452
2 parents c8d051e + 1ba8785 commit 1c025c0

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

55 files changed

+16722
-11092
lines changed

.ci/atime/tests.R

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,5 +286,19 @@ test.list <- atime::atime_test_list(
286286
Slow = "548410d23dd74b625e8ea9aeb1a5d2e9dddd2927", # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/548410d23dd74b625e8ea9aeb1a5d2e9dddd2927)
287287
Fast = "c0b32a60466bed0e63420ec105bc75c34590865e"), # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7144/commits) that uses a much faster implementation
288288

289-
tests=extra.test.list)
289+
# Regression introduced in #7404 (grouped by factor).
290+
"DT[by] max regression fixed in #7480" = atime::atime_test(
291+
N = as.integer(10^seq(3, 5, by=0.5)),
292+
setup = {
293+
dt = data.table(
294+
id = as.factor(rep(seq_len(N), each = 100L)),
295+
V1 = 1L
296+
)
297+
},
298+
expr = data.table:::`[.data.table`(dt, , base::max(V1, na.rm = TRUE), by = id),
299+
Before = "476de7e3",
300+
Regression = "6f49bf1",
301+
Fixed = "b6ad1a4",
302+
seconds.limit = 1),
303+
tests=extra.test.list)
290304
# nolint end: undesirable_operator_linter.

.ci/lint.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
args = commandArgs(TRUE)
44
if (identical(args, '--help')) {
55
writeLines(c(
6-
'Usage: Rscript .ci/lint.R .ci/linters/<KIND> <WHERE> <WHAT> [PREPROCESS]',
6+
'Usage: Rscript .ci/lint.R .ci/linters/<KIND> <WHERE> <WHAT>',
77
'KIND must name the directory containing the *.R files defining the linter functions.',
88
'WHERE must name the directory containing the files to lint, e.g. "po", or "src".',
99
"WHAT must contain the regular expression matching the files to lint, e.g., '[.]po$', or '[.][ch]$'.",
10+
NULL
1011
))
1112
q('no')
1213
}

.gitlab-ci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ test-lin-dev-gcc-strict-cran:
183183
- R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1)
184184
- (! grep "warning:" data.table.Rcheck/00install.out)
185185
- >-
186-
Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 2 NOTEs"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (non-API calls, V8 package) but ", shQuote(l)) else q("no")'
186+
Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 1 NOTE"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (V8 package) but ", shQuote(l)) else q("no")'
187187
188188
## R-devel on Linux clang
189189
# R compiled with clang, flags removed: -flto=auto -fopenmp
@@ -206,7 +206,7 @@ test-lin-dev-clang-cran:
206206
- R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1)
207207
- (! grep "warning:" data.table.Rcheck/00install.out)
208208
- >-
209-
Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 2 NOTEs"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (non-API calls, V8 package) but ", shQuote(l)) else q("no")'
209+
Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 1 NOTE"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (V8 package) but ", shQuote(l)) else q("no")'
210210
211211
# stated dependency on R
212212
test-lin-ancient-cran:

NEWS.md

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,7 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T
338338
339339
19. Ellipsis elements like `..1` are correctly excluded when searching for variables in "up-a-level" syntax inside `[`, [#5460](https://github.com/Rdatatable/data.table/issues/5460). Thanks @ggrothendieck for the report and @MichaelChirico for the fix.
340340
341-
20. `forderv` could segfault on keys with long runs of identical bytes (e.g., many duplicate columns) because the single-group branch tail-recursed radix-by-radix until the C stack ran out, [#4300](https://github.com/Rdatatable/data.table/issues/4300). This is a major problem since sorting is extensively used in `data.table`. Thanks @quantitative-technologies for the report and @ben-schwen for the fix.
341+
20. `forderv` could segfault on keys with long runs of identical bytes because the single-group branch tail-recursed radix-by-radix until the C stack ran out. This affected both integer/numeric sorting with many duplicate columns ([#4300](https://github.com/Rdatatable/data.table/issues/4300)) and character sorting with long common prefixes ([#7462](https://github.com/Rdatatable/data.table/issues/7462)). This is a major problem since sorting is extensively used in `data.table`. Thanks @quantitative-technologies and @DavisVaughan for the reports, and @ben-schwen for the fix.
342342
343343
21. `[` now preserves existing key(s) when new columns are added before them, instead of incorrectly setting a new column as key, [#7364](https://github.com/Rdatatable/data.table/issues/7364). Thanks @czeildi for the bug report and the fix.
344344
@@ -350,7 +350,11 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T
350350

351351
25. By-group operations on missing rows (e.g. `foo[c(i, NA), bar, by=grp]`) now avoid leaving in data from the previous groups, [#7442](https://github.com/Rdatatable/data.table/issues/7442). Thanks @aitap for the report and the fix.
352352

353-
26. `rbindlist()` now avoids the crash when working with many non-UTF-8 column names, [#7452](https://github.com/Rdatatable/data.table/issues/7452). Thanks @aitap for the report and the fix.
353+
26. Grouping by a factor with many groups is now fast again, fixing a timing regression introduced in [#6890](https://github.com/Rdatatable/data.table/pull/6890) where UTF-8 coercion and level remapping were performed unnecessarily, [#7404](https://github.com/Rdatatable/data.table/issues/7404). Thanks @ben-schwen for the report and fix.
354+
355+
27. `dogroups()` no longer reads beyond the resized end of over-allocated data.table list columns, [#7486](https://github.com/Rdatatable/data.table/issues/7486). While this didn't crash in practice, it is now explicitly checked for in recent R versions (r89198+). Thanks @TimTaylor and @aitap for the report and @aitap for the fix.
356+
357+
28. `rbindlist()` now avoids the crash when working with many non-UTF-8 column names, [#7452](https://github.com/Rdatatable/data.table/issues/7452). Thanks @aitap for the report and the fix.
354358

355359
### NOTES
356360

@@ -379,6 +383,8 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T
379383
380384
8. Retain important information in the error message about the source of the error when `i=` fails, e.g. pointing to `charToDate()` failing in `DT[date_col == "20250101"]`, [#7444](https://github.com/Rdatatable/data.table/issues/7444). Thanks @jan-swissre for the report and @MichaelChirico for the fix.
381385
386+
9. Internal use of declared non-API R functions `SETLENGTH`, `TRUELENGTH`, `SET_TRUELENGTH`, and `SET_GROWABLE_BIT` has been eliminated. Most usages have been migrated to R's experimental resizable vectors API (thanks to @ltierney, introduced in R 4.6.0, backported for older R versions), [#7451](https://github.com/Rdatatable/data.table/pull/7451). Uses of `TRUELENGTH` for marking seen items during grouping and binding operations (aka free hash table trick) have been replaced with proper hash tables, [#6694](https://github.com/Rdatatable/data.table/pull/6694). The new hash table implementation uses linear probing with power of 2 tables and automatic resizing. Additionally, `chmatch()` now hashes the needle (`x`) instead of the haystack (`table`) when `length(table) >> length(x)`, significantly improving performance for lookups into large tables. We've benchmarked the refactored code and find the performance satisfactory, but please do report any edge case performance regressions we may have missed. Thanks to @aitap, @ben-schwen, @jangorecki and @HughParsonage for implementation and reviews.
387+
382388
## data.table [v1.17.8](https://github.com/Rdatatable/data.table/milestone/41) (6 July 2025)
383389
384390
1. Internal functions used to signal errors are now marked as non-returning, silencing a compiler warning about potentially unchecked allocation failure. Thanks to Prof. Brian D. Ripley for the report and @aitap for the fix, [#7070](https://github.com/Rdatatable/data.table/pull/7070).
@@ -552,6 +558,8 @@ rowwiseDT(
552558

553559
22. `fread()` could fail to read Mac CSV files (with `\r` line endings) if the file contained any `\n` character, such as a final `\r\n`. This was fixed by detecting the predominant line ending in a sample of the file, [#4186](https://github.com/Rdatatable/data.table/issues/4186). Thanks to @MPagel for the report and @ben-schwen for the fix.
554560

561+
23. By reference assignments (':=') with functions that modified the data.table by reference e.g. (`foo=function(DT){modify(DT);return(1L)}`, `DT[,a:=foo(DT)]`) returned a malformed data.table due to the modification of the targeted named column index ("a") during the j expression evaluation [#6768](https://github.com/Rdatatable/data.table/issues/6768). Thanks @AntonNM for the report and fix.
562+
555563
### NOTES
556564

557565
1. There is a new vignette on joins! See `vignette("datatable-joins")`. Thanks to Angel Feliz for authoring it! Feedback welcome. This vignette has been highly requested since 2017: [#2181](https://github.com/Rdatatable/data.table/issues/2181).

R/data.table.R

Lines changed: 67 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -561,7 +561,7 @@ replace_dot_alias = function(e) {
561561
}
562562
irows = vecseq(f__, len__, limit)
563563
}
564-
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
564+
if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} # notranslate
565565
# Fix for #1092 and #1074
566566
# TODO: implement better version of "any"/"all"/"which" to avoid
567567
# unnecessary construction of logical vectors
@@ -1189,11 +1189,11 @@ replace_dot_alias = function(e) {
11891189
} else if (is.numeric(lhs)) {
11901190
m = as.integer(lhs)
11911191
if (any(m<1L | ncol(x)<m)) stopf("LHS of := appears to be column positions but are outside [1,ncol] range. New columns can only be added by name.")
1192-
lhs = names_x[m]
11931192
} else
11941193
stopf("LHS of := isn't column names ('character') or positions ('integer' or 'numeric')")
1194+
ok<-selfrefok(x, verbose=FALSE)
11951195
if (!anyNA(m)) {
1196-
# updates by reference to existing columns
1196+
# updates by reference to existing columns, or deletions
11971197
cols = as.integer(m)
11981198
newnames=NULL
11991199
if (identical(irows, integer())) {
@@ -1214,44 +1214,16 @@ replace_dot_alias = function(e) {
12141214
return(invisible(x))
12151215
}
12161216
} else {
1217-
# Adding new column(s). TO DO: move after the first eval in case the jsub has an error.
1217+
# Adding new column(s). Allocation for columns and recalculation of target cols moved after the jval = eval(jsub)
1218+
# in case of error or by-reference modifications to the DT
12181219
newnames=setdiff(lhs, names_x)
12191220
m[is.na(m)] = ncol(x)+seq_along(newnames)
12201221
cols = as.integer(m)
12211222
# don't pass verbose to selfrefok here -- only activated when
1222-
# ok=-1 which will trigger setalloccol with verbose in the next
1223-
# branch, which again calls _selfrefok and returns the message then
1224-
if ((ok<-selfrefok(x, verbose=FALSE))==0L) # ok==0 so no warning when loaded from disk (-1) [-1 considered TRUE by R]
1223+
# ok=-1 which will trigger setalloccol with verbose after
1224+
# the jval = eval(jsub, ...)
1225+
if (ok==0L) # ok==0 so no warning when loaded from disk (-1) [-1 considered TRUE by R]
12251226
if (is.data.table(x)) warningf("A shallow copy of this data.table was taken so that := can add or remove %d columns by reference. At an earlier point, this data.table was copied by R (or was created manually using structure() or similar). Avoid names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and ?setattr. It's also not unusual for data.table-agnostic packages to produce tables affected by this issue. If this message doesn't help, please report your use case to the data.table issue tracker so the root cause can be fixed or this message improved.", length(newnames))
1226-
# !is.data.table for DF |> DT(,:=) tests 2212.16-19 (#5113) where a shallow copy is routine for data.frame
1227-
if ((ok<1L) || (truelength(x) < ncol(x)+length(newnames))) {
1228-
DT = x # in case getOption contains "ncol(DT)" as it used to. TODO: warn and then remove
1229-
n = length(newnames) + eval(getOption("datatable.alloccol")) # TODO: warn about expressions and then drop the eval()
1230-
# i.e. reallocate at the size as if the new columns were added followed by setalloccol().
1231-
name = substitute(x)
1232-
if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO) # ok here includes -1 (loaded from disk)
1233-
catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n)
1234-
# #1729 -- copying to the wrong environment here can cause some confusion
1235-
if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n")
1236-
1237-
# Verbosity should not issue warnings, so cat rather than warning.
1238-
# TO DO: Add option 'datatable.pedantic' to turn on warnings like this.
1239-
1240-
# TO DO ... comments moved up from C ...
1241-
# Note that the NAMED(dt)>1 doesn't work because .Call
1242-
# always sets to 2 (see R-ints), it seems. Work around
1243-
# may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too
1244-
# because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2.
1245-
# Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we
1246-
# don't mind.
1247-
}
1248-
setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope
1249-
if (is.name(name)) {
1250-
assign(as.character(name),x,parent.frame(),inherits=TRUE)
1251-
} else if (.is_simple_extraction(name)) {
1252-
.reassign_extracted_table(name, x)
1253-
} # TO DO: else if env$<- or list$<-
1254-
}
12551227
}
12561228
}
12571229
}
@@ -1411,6 +1383,63 @@ replace_dot_alias = function(e) {
14111383
}
14121384

14131385
if (!is.null(lhs)) {
1386+
# Re-matches characters names in the lhs after jval to account for jsub's that modify the columns of the data.table (#6768)
1387+
# Replaces numerical lhs with respective names_x
1388+
if(is.character(lhs)){
1389+
m = chmatch(lhs, names_x)
1390+
if(!anyNA(m)) {
1391+
# updates by reference to existing columns
1392+
cols = as.integer(m)
1393+
newnames = NULL
1394+
} else {
1395+
# Adding new column(s).
1396+
newnames = setdiff(lhs, names_x)
1397+
m[is.na(m)] = ncol(x) + seq_along(newnames)
1398+
cols = as.integer(m)
1399+
}
1400+
} else if (is.numeric(lhs)) {
1401+
lhs = names_x[m]
1402+
}
1403+
# ok <- selfrefok above called without verbose -- only activated when
1404+
# ok=-1 which will trigger setalloccol with verbose in the next
1405+
# branch, which again calls _selfrefok and returns the message then
1406+
# !is.data.table for DF |> DT(,:=) tests 2212.16-19 (#5113) where a shallow copy is routine for data.frame
1407+
if (
1408+
(
1409+
!is.null(newnames) || # adding new columns
1410+
is.null(jval) || (is.list(jval) && any(vapply_1b(jval, is.null))) # removing columns
1411+
) && (
1412+
(ok<1L) || # unsafe to resize
1413+
(truelength(x) < ncol(x)+length(newnames)) # not enough space for new columns
1414+
)
1415+
) {
1416+
DT = x # in case getOption contains "ncol(DT)" as it used to. TODO: warn and then remove
1417+
n = length(newnames) + eval(getOption("datatable.alloccol")) # TODO: warn about expressions and then drop the eval()
1418+
# i.e. reallocate at the size as if the new columns were added followed by setalloccol().
1419+
name = substitute(x)
1420+
if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO) # ok here includes -1 (loaded from disk)
1421+
catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n)
1422+
# #1729 -- copying to the wrong environment here can cause some confusion
1423+
if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n")
1424+
1425+
# Verbosity should not issue warnings, so cat rather than warning.
1426+
# TO DO: Add option 'datatable.pedantic' to turn on warnings like this.
1427+
1428+
# TO DO ... comments moved up from C ...
1429+
# Note that the NAMED(dt)>1 doesn't work because .Call
1430+
# always sets to 2 (see R-ints), it seems. Work around
1431+
# may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too
1432+
# because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2.
1433+
# Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we
1434+
# don't mind.
1435+
}
1436+
setalloccol(x, n, verbose=verbose) # always assigns to calling scope; i.e. this scope
1437+
if (is.name(name)) {
1438+
assign(as.character(name),x,parent.frame(),inherits=TRUE)
1439+
} else if (.is_simple_extraction(name)) {
1440+
.reassign_extracted_table(name, x)
1441+
} # TO DO: else if env$<- or list$<-
1442+
}
14141443
# TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above.
14151444
.Call(Cassign,x,irows,cols,newnames,jval)
14161445
return(suppPrint(x))
@@ -1591,7 +1620,7 @@ replace_dot_alias = function(e) {
15911620
if (length(xcols)) {
15921621
# TODO add: if (max(len__)==nrow) stopf("There is no need to deep copy x in this case")
15931622
# TODO move down to dogroup.c, too.
1594-
SDenv$.SDall = .Call(CsubsetDT, x, if (length(len__)) seq_len(max(len__)) else 0L, xcols) # must be deep copy when largest group is a subset
1623+
SDenv$.SDall = .Call(CcopyAsGrowable, .Call(CsubsetDT, x, if (length(len__)) seq_len(max(len__)) else 0L, xcols)) # must be deep copy when largest group is a subset
15951624
if (!is.data.table(SDenv$.SDall)) setattr(SDenv$.SDall, "class", c("data.table","data.frame")) # DF |> DT(,.SD[...],by=grp) needs .SD to be data.table, test 2022.012
15961625
if (xdotcols) setattr(SDenv$.SDall, 'names', ansvars[xcolsAns]) # now that we allow 'x.' prefix in 'j', #2313 bug fix - [xcolsAns]
15971626
SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall
@@ -1864,7 +1893,7 @@ replace_dot_alias = function(e) {
18641893
grpcols = leftcols # 'leftcols' are the columns in i involved in the join (either head of key(i) or head along i)
18651894
jiscols = chmatch(jisvars, names_i) # integer() if there are no jisvars (usually there aren't, advanced feature)
18661895
xjiscols = chmatch(xjisvars, names_x)
1867-
SDenv$.xSD = x[min(nrow(i), 1L), xjisvars, with=FALSE]
1896+
SDenv$.xSD = .Call(CcopyAsGrowable, x[min(nrow(i), 1L), xjisvars, with=FALSE])
18681897
if (!missing(on)) o__ = xo else o__ = integer(0L)
18691898
} else {
18701899
groups = byval

0 commit comments

Comments
 (0)