Rdatatable
diff --git a/‎.ci/atime/tests.R‎
Lines changed: 15 additions & 1 deletion b/‎.ci/atime/tests.R‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎.ci/lint.R‎
Lines changed: 2 additions & 1 deletion b/‎.ci/lint.R‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎.gitlab-ci.yml‎
Lines changed: 2 additions & 2 deletions b/‎.gitlab-ci.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎NEWS.md‎
Lines changed: 10 additions & 2 deletions b/‎NEWS.md‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎R/data.table.R‎
Lines changed: 67 additions & 38 deletions b/‎R/data.table.R‎
Lines changed: 67 additions & 38 deletions
@@ -286,5 +286,19 @@ test.list <- atime::atime_test_list(
     Slow = "548410d23dd74b625e8ea9aeb1a5d2e9dddd2927",   # Parent of the first commit in the PR (https://github.com/Rdatatable/data.table/commit/548410d23dd74b625e8ea9aeb1a5d2e9dddd2927)
     Fast = "c0b32a60466bed0e63420ec105bc75c34590865e"),  # Commit in the PR (https://github.com/Rdatatable/data.table/pull/7144/commits) that uses a much faster implementation
 
-    tests=extra.test.list)
+  # Regression introduced in #6890 and reported in #7404 (grouping by a factor).
+  "DT[by] max regression fixed in #7480" = atime::atime_test(
+    N = as.integer(10^seq(3, 5, by=0.5)),
+    setup = {
+      dt = data.table(
+        id = as.factor(rep(seq_len(N), each = 100L)),
+        V1 = 1L
+      )
+    },
+    expr = data.table:::`[.data.table`(dt, , base::max(V1, na.rm = TRUE), by = id),
+    Before = "476de7e3",
+    Regression = "6f49bf1",
+    Fixed = "b6ad1a4",
+    seconds.limit = 1),
+  tests=extra.test.list)
 # nolint end: undesirable_operator_linter.
@@ -3,10 +3,11 @@
 args = commandArgs(TRUE)
 if (identical(args, '--help')) {
   writeLines(c(
-    'Usage: Rscript .ci/lint.R .ci/linters/<KIND> <WHERE> <WHAT> [PREPROCESS]',
+    'Usage: Rscript .ci/lint.R .ci/linters/<KIND> <WHERE> <WHAT>',
     'KIND must name the directory containing the *.R files defining the linter functions.',
     'WHERE must name the directory containing the files to lint, e.g. "po", or "src".',
     "WHAT must contain the regular expression matching the files to lint, e.g., '[.]po$', or '[.][ch]$'.",
+    NULL
   ))
   q('no')
 }
 
@@ -183,7 +183,7 @@ test-lin-dev-gcc-strict-cran:
     - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1)
     - (! grep "warning:" data.table.Rcheck/00install.out)
     - >-
-        Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 2 NOTEs"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (non-API calls, V8 package) but ", shQuote(l)) else q("no")'
+        Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 1 NOTE"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (V8 package) but ", shQuote(l)) else q("no")'
 
 ## R-devel on Linux clang
 # R compiled with clang, flags removed: -flto=auto -fopenmp
@@ -206,7 +206,7 @@ test-lin-dev-clang-cran:
     - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1)
     - (! grep "warning:" data.table.Rcheck/00install.out)
     - >-
-        Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 2 NOTEs"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (non-API calls, V8 package) but ", shQuote(l)) else q("no")'
+        Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 1 NOTE"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (V8 package) but ", shQuote(l)) else q("no")'
 
 # stated dependency on R
 test-lin-ancient-cran:
 
@@ -338,7 +338,7 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T
 
 19. Ellipsis elements like `..1` are correctly excluded when searching for variables in "up-a-level" syntax inside `[`, [#5460](https://github.com/Rdatatable/data.table/issues/5460). Thanks @ggrothendieck for the report and @MichaelChirico for the fix.
 
-20. `forderv` could segfault on keys with long runs of identical bytes (e.g., many duplicate columns) because the single-group branch tail-recursed radix-by-radix until the C stack ran out, [#4300](https://github.com/Rdatatable/data.table/issues/4300). This is a major problem since sorting is extensively used in `data.table`. Thanks @quantitative-technologies for the report and @ben-schwen for the fix.
+20. `forderv` could segfault on keys with long runs of identical bytes because the single-group branch tail-recursed radix-by-radix until the C stack ran out. This affected both integer/numeric sorting with many duplicate columns ([#4300](https://github.com/Rdatatable/data.table/issues/4300)) and character sorting with long common prefixes ([#7462](https://github.com/Rdatatable/data.table/issues/7462)). This is a major problem since sorting is extensively used in `data.table`. Thanks @quantitative-technologies and @DavisVaughan for the reports, and @ben-schwen for the fix.
 
 21. `[` now preserves existing key(s) when new columns are added before them, instead of incorrectly setting a new column as key, [#7364](https://github.com/Rdatatable/data.table/issues/7364). Thanks @czeildi for the bug report and the fix.
 
@@ -350,7 +350,11 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T
 
 25. By-group operations on missing rows (e.g. `foo[c(i, NA), bar, by=grp]`) now avoid leaving in data from the previous groups, [#7442](https://github.com/Rdatatable/data.table/issues/7442). Thanks @aitap for the report and the fix.
 
-26. `rbindlist()` now avoids the crash when working with many non-UTF-8 column names, [#7452](https://github.com/Rdatatable/data.table/issues/7452). Thanks @aitap for the report and the fix.
+26. Grouping by a factor with many groups is now fast again, fixing a timing regression introduced in [#6890](https://github.com/Rdatatable/data.table/pull/6890) where UTF-8 coercion and level remapping were performed unnecessarily, [#7404](https://github.com/Rdatatable/data.table/issues/7404). Thanks @ben-schwen for the report and fix.
+
+27. `dogroups()` no longer reads beyond the resized end of over-allocated data.table list columns, [#7486](https://github.com/Rdatatable/data.table/issues/7486). While this didn't crash in practice, it is now explicitly checked for in recent R versions (r89198+). Thanks @TimTaylor and @aitap for the report and @aitap for the fix.
+
+28. `rbindlist()` now avoids the crash when working with many non-UTF-8 column names, [#7452](https://github.com/Rdatatable/data.table/issues/7452). Thanks @aitap for the report and the fix.
 
 ### NOTES
 
@@ -379,6 +383,8 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T
 
 8. Retain important information in the error message about the source of the error when `i=` fails, e.g. pointing to `charToDate()` failing in `DT[date_col == "20250101"]`, [#7444](https://github.com/Rdatatable/data.table/issues/7444). Thanks @jan-swissre for the report and @MichaelChirico for the fix.
 
+9. Internal use of declared non-API R functions `SETLENGTH`, `TRUELENGTH`, `SET_TRUELENGTH`, and `SET_GROWABLE_BIT` has been eliminated. Most usages have been migrated to R's experimental resizable vectors API (thanks to @ltierney, introduced in R 4.6.0, backported for older R versions), [#7451](https://github.com/Rdatatable/data.table/pull/7451). Uses of `TRUELENGTH` for marking seen items during grouping and binding operations (aka free hash table trick) have been replaced with proper hash tables, [#6694](https://github.com/Rdatatable/data.table/pull/6694). The new hash table implementation uses linear probing with power of 2 tables and automatic resizing. Additionally, `chmatch()` now hashes the needle (`x`) instead of the haystack (`table`) when `length(table) >> length(x)`, significantly improving performance for lookups into large tables. We've benchmarked the refactored code and find the performance satisfactory, but please do report any edge case performance regressions we may have missed. Thanks to @aitap, @ben-schwen, @jangorecki and @HughParsonage for implementation and reviews.
+
 ## data.table [v1.17.8](https://github.com/Rdatatable/data.table/milestone/41) (6 July 2025)
 
 1. Internal functions used to signal errors are now marked as non-returning, silencing a compiler warning about potentially unchecked allocation failure. Thanks to Prof. Brian D. Ripley for the report and @aitap for the fix, [#7070](https://github.com/Rdatatable/data.table/pull/7070).
@@ -552,6 +558,8 @@ rowwiseDT(
 
 22. `fread()` could fail to read Mac CSV files (with `\r` line endings) if the file contained any `\n` character, such as a final `\r\n`. This was fixed by detecting the predominant line ending in a sample of the file, [#4186](https://github.com/Rdatatable/data.table/issues/4186). Thanks to @MPagel for the report and @ben-schwen for the fix.
 
+23. By-reference assignments (`:=`) whose `j` expression calls a function that itself modifies the data.table by reference (e.g. `foo=function(DT){modify(DT);return(1L)}` followed by `DT[,a:=foo(DT)]`) returned a malformed data.table, due to the modification of the targeted named column index (`"a"`) during the evaluation of the `j` expression, [#6768](https://github.com/Rdatatable/data.table/issues/6768). Thanks @AntonNM for the report and fix.
+
 ### NOTES
 
 1. There is a new vignette on joins! See `vignette("datatable-joins")`. Thanks to Angel Feliz for authoring it! Feedback welcome. This vignette has been highly requested since 2017: [#2181](https://github.com/Rdatatable/data.table/issues/2181).
 
@@ -561,7 +561,7 @@ replace_dot_alias = function(e) {
             }
             irows = vecseq(f__, len__, limit)
           }
-          if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()}
+          if (verbose) {cat(timetaken(last.started.at),"\n"); flush.console()} # notranslate
           # Fix for #1092 and #1074
           # TODO: implement better version of "any"/"all"/"which" to avoid
           # unnecessary construction of logical vectors
@@ -1189,11 +1189,11 @@ replace_dot_alias = function(e) {
         } else if (is.numeric(lhs)) {
           m = as.integer(lhs)
           if (any(m<1L | ncol(x)<m)) stopf("LHS of := appears to be column positions but are outside [1,ncol] range. New columns can only be added by name.")
-          lhs = names_x[m]
         } else
           stopf("LHS of := isn't column names ('character') or positions ('integer' or 'numeric')")
+        ok<-selfrefok(x, verbose=FALSE)
         if (!anyNA(m)) {
-          # updates by reference to existing columns
+          # updates by reference to existing columns, or deletions
           cols = as.integer(m)
           newnames=NULL
           if (identical(irows, integer())) {
@@ -1214,44 +1214,16 @@ replace_dot_alias = function(e) {
             return(invisible(x))
           }
         } else {
-          # Adding new column(s). TO DO: move after the first eval in case the jsub has an error.
+          # Adding new column(s). Allocation for columns and recalculation of target cols moved after the jval = eval(jsub)
+          # in case of error or by-reference modifications to the DT
           newnames=setdiff(lhs, names_x)
           m[is.na(m)] = ncol(x)+seq_along(newnames)
           cols = as.integer(m)
           # don't pass verbose to selfrefok here -- only activated when
-          #   ok=-1 which will trigger setalloccol with verbose in the next
-          #   branch, which again calls _selfrefok and returns the message then
-          if ((ok<-selfrefok(x, verbose=FALSE))==0L)   # ok==0 so no warning when loaded from disk (-1) [-1 considered TRUE by R]
+          #   ok=-1 which will trigger setalloccol with verbose after
+          #   the jval = eval(jsub, ...)
+          if (ok==0L)   # ok==0 so no warning when loaded from disk (-1) [-1 considered TRUE by R]
             if (is.data.table(x)) warningf("A shallow copy of this data.table was taken so that := can add or remove %d columns by reference. At an earlier point, this data.table was copied by R (or was created manually using structure() or similar). Avoid names<- and attr<- which in R currently (and oddly) may copy the whole data.table. Use set* syntax instead to avoid copying: ?set, ?setnames and ?setattr. It's also not unusual for data.table-agnostic packages to produce tables affected by this issue. If this message doesn't help, please report your use case to the data.table issue tracker so the root cause can be fixed or this message improved.", length(newnames))
-            # !is.data.table for DF |> DT(,:=) tests 2212.16-19 (#5113) where a shallow copy is routine for data.frame
-          if ((ok<1L) || (truelength(x) < ncol(x)+length(newnames))) {
-            DT = x  # in case getOption contains "ncol(DT)" as it used to.  TODO: warn and then remove
-            n = length(newnames) + eval(getOption("datatable.alloccol"))  # TODO: warn about expressions and then drop the eval()
-            # i.e. reallocate at the size as if the new columns were added followed by setalloccol().
-            name = substitute(x)
-            if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO)    # ok here includes -1 (loaded from disk)
-              catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n)
-              # #1729 -- copying to the wrong environment here can cause some confusion
-              if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n")
-
-              # Verbosity should not issue warnings, so cat rather than warning.
-              # TO DO: Add option 'datatable.pedantic' to turn on warnings like this.
-
-              # TO DO ... comments moved up from C ...
-              # Note that the NAMED(dt)>1 doesn't work because .Call
-              # always sets to 2 (see R-ints), it seems. Work around
-              # may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too
-              # because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2.
-              # Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we
-              # don't mind.
-            }
-            setalloccol(x, n, verbose=verbose)   # always assigns to calling scope; i.e. this scope
-            if (is.name(name)) {
-              assign(as.character(name),x,parent.frame(),inherits=TRUE)
-            } else if (.is_simple_extraction(name)) {
-              .reassign_extracted_table(name, x)
-            } # TO DO: else if env$<- or list$<-
-          }
         }
       }
     }
@@ -1411,6 +1383,63 @@ replace_dot_alias = function(e) {
     }
 
     if (!is.null(lhs)) {
+      # Re-match character names in the lhs after jval to account for jsubs that modify the columns of the data.table (#6768)
+      # Replaces numerical lhs with respective names_x
+      if(is.character(lhs)){
+        m = chmatch(lhs, names_x)
+        if(!anyNA(m)) {
+          # updates by reference to existing columns
+          cols = as.integer(m)
+          newnames = NULL
+        } else {
+          # Adding new column(s).
+          newnames = setdiff(lhs, names_x)
+          m[is.na(m)] = ncol(x) + seq_along(newnames)
+          cols = as.integer(m)
+        }
+      } else if (is.numeric(lhs)) {
+        lhs = names_x[m]
+      }
+      # ok <- selfrefok above called without verbose -- only activated when
+      #   ok=-1 which will trigger setalloccol with verbose in the next
+      #   branch, which again calls _selfrefok and returns the message then
+      # !is.data.table for DF |> DT(,:=) tests 2212.16-19 (#5113) where a shallow copy is routine for data.frame
+      if (
+        (
+          !is.null(newnames) || # adding new columns
+          is.null(jval) || (is.list(jval) && any(vapply_1b(jval, is.null))) # removing columns
+        ) && (
+          (ok<1L) || # unsafe to resize
+          (truelength(x) < ncol(x)+length(newnames)) # not enough space for new columns
+        )
+      ) {
+        DT = x  # in case getOption contains "ncol(DT)" as it used to.  TODO: warn and then remove
+        n = length(newnames) + eval(getOption("datatable.alloccol"))  # TODO: warn about expressions and then drop the eval()
+        # i.e. reallocate at the size as if the new columns were added followed by setalloccol().
+        name = substitute(x)
+        if (is.name(name) && ok && verbose) { # && NAMED(x)>0 (TO DO)    # ok here includes -1 (loaded from disk)
+          catf("Growing vector of column pointers from truelength %d to %d. A shallow copy has been taken, see ?setalloccol. Only a potential issue if two variables point to the same data (we can't yet detect that well) and if not you can safely ignore this. To avoid this message you could setalloccol() first, deep copy first using copy(), wrap with suppressWarnings() or increase the 'datatable.alloccol' option.\n", truelength(x), n)
+          # #1729 -- copying to the wrong environment here can cause some confusion
+          if (ok == -1L) catf("Note that the shallow copy will assign to the environment from which := was called. That means for example that if := was called within a function, the original table may be unaffected.\n")
+
+          # Verbosity should not issue warnings, so cat rather than warning.
+          # TO DO: Add option 'datatable.pedantic' to turn on warnings like this.
+
+          # TO DO ... comments moved up from C ...
+          # Note that the NAMED(dt)>1 doesn't work because .Call
+          # always sets to 2 (see R-ints), it seems. Work around
+          # may be possible but not yet working. When the NAMED test works, we can drop allocwarn argument too
+          # because that's just passed in as FALSE from [<- where we know `*tmp*` isn't really NAMED=2.
+          # Note also that this growing will happen for missing columns assigned NULL, too. But so rare, we
+          # don't mind.
+        }
+        setalloccol(x, n, verbose=verbose)   # always assigns to calling scope; i.e. this scope
+        if (is.name(name)) {
+          assign(as.character(name),x,parent.frame(),inherits=TRUE)
+        } else if (.is_simple_extraction(name)) {
+          .reassign_extracted_table(name, x)
+        } # TO DO: else if env$<- or list$<-
+      }
       # TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above.
       .Call(Cassign,x,irows,cols,newnames,jval)
       return(suppPrint(x))
@@ -1591,7 +1620,7 @@ replace_dot_alias = function(e) {
   if (length(xcols)) {
     #  TODO add: if (max(len__)==nrow) stopf("There is no need to deep copy x in this case")
     #  TODO move down to dogroup.c, too.
-    SDenv$.SDall = .Call(CsubsetDT, x, if (length(len__)) seq_len(max(len__)) else 0L, xcols)  # must be deep copy when largest group is a subset
+    SDenv$.SDall = .Call(CcopyAsGrowable, .Call(CsubsetDT, x, if (length(len__)) seq_len(max(len__)) else 0L, xcols))  # must be deep copy when largest group is a subset
     if (!is.data.table(SDenv$.SDall)) setattr(SDenv$.SDall, "class", c("data.table","data.frame"))  # DF |> DT(,.SD[...],by=grp) needs .SD to be data.table, test 2022.012
     if (xdotcols) setattr(SDenv$.SDall, 'names', ansvars[xcolsAns]) # now that we allow 'x.' prefix in 'j', #2313 bug fix - [xcolsAns]
     SDenv$.SD = if (length(non_sdvars)) shallow(SDenv$.SDall, sdvars) else SDenv$.SDall
@@ -1864,7 +1893,7 @@ replace_dot_alias = function(e) {
     grpcols = leftcols # 'leftcols' are the columns in i involved in the join (either head of key(i) or head along i)
     jiscols = chmatch(jisvars, names_i)  # integer() if there are no jisvars (usually there aren't, advanced feature)
     xjiscols = chmatch(xjisvars, names_x)
-    SDenv$.xSD = x[min(nrow(i), 1L), xjisvars, with=FALSE]
+    SDenv$.xSD = .Call(CcopyAsGrowable, x[min(nrow(i), 1L), xjisvars, with=FALSE])
     if (!missing(on)) o__ = xo else o__ = integer(0L)
   } else {
     groups = byval