* Implement the hash table
* memrecycle(): replace TRUELENGTH marks with a hash
* rbindlist(): replace 1/2 TRUELENGTH with hashing
Also avoid crashing when creating a 0-size hash.
* rbindlist(): replace 2/2 TRUELENGTH with hashing
This is likely to require a dynamically growing hash of TRUELENGTHs
instead of the current pre-allocation approach with its very conservative
over-estimate.
* chmatchMain(): replace TRUELENGTH marks with hash
* copySharedColumns(): hash instead of TRUELENGTH
* combineFactorLevels(): hash instead of TRUELENGTH
* anySpecialStatic(): hash instead of TRUELENGTH
* forder(): hash instead of TRUELENGTH
The hash needs O(n) memory (actually 2*n/load_factor entries), which
isn't great.
* Remove savetl()
* Add codecov suppressions
* Dynamically grow the hash table with bound unknown
In forder() and rbindlist(), no good upper bound on the number of hash
elements is known ahead of time, so grow the hash table dynamically.
Since R/W locks are far too slow and OpenMP atomics are too limited,
rely on strategically placed flushes, which isn't really a solution.
* Minor hash improvements
Use only 28 bits of the pointer (the lower 32 bits, discarding the lowest 4).
Inline the linear search by advancing the slot pointer instead of
recomputing the hash value and taking the modulo on every probe.
Average improvement of 10%.
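As an aside, here is a minimal sketch of the pointer-hashing scheme just described (my own illustration with an assumed mixing constant, not data.table's exact code):

```c
#include <stddef.h>
#include <stdint.h>

/* Sketch only: hash a (CHARSXP) pointer into a power-of-two table with
 * mask+1 slots. The lowest 4 bits of the pointer are dropped because heap
 * alignment makes them zero; of the rest, only the low 28 bits are kept. */
static inline size_t ptr_hash(const void *p, size_t mask) {
  uint32_t bits = (uint32_t)((uintptr_t)p >> 4) & 0x0FFFFFFFu; /* 28 useful bits */
  bits *= 2654435761u;        /* multiplicative mix; constant is illustrative */
  return (size_t)bits & mask; /* mask instead of modulo: table size is 2^k */
}
```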
* dhash: no need to keep previous table
The hash can only be enlarged either from a single-threaded context or
under a critical section, so there is no need to worry about other
threads hitting a use-after-free due to a reallocation. This should
halve the memory used by the hash table.
* use double hashing instead of linear probing (#7418)
* add lookup or insert
* use lookup or insert
* use lookup_or_insert
* really use lookup or insert
* use cuckoo hashing
* add rehash
* use power of 2 and mask instead of modulo
* mix instead of multiplication
* use different mixes
* change multipliers
* use double hashing
* remove xor folding
* Fix allocation non-overflow precondition
* Set the default load factor
* Inline hash_rehash()
* update comments
* Leave overflow checking to R_alloc
* internal_error() is not covered
---------
Co-authored-by: Ivan K <[email protected]>
* replace dhashtab with hashtab in rbindlist
* Use hashtab in forder()
Since range_str() runs a parallel OpenMP loop that may update the hash
table in a critical section, use a special form of hash_set that returns
the newly reallocated hash table instead of overwriting it in place.
* Drop dhashtab
* rbindlist: initial hash size = upperBoundUniqueNames
* Don't bother cleaning the hash before an error
* Avoid setting same key&value twice
* chmatch: hash x instead of table (#7454)
* add hash x branch in chmatch
* adapt kick-in threshold
* make chin branch more explicit
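A hedged sketch of the "hash x instead of table" branch from #7454: when the needle vector x is much shorter than table, hash x once and stream over table in a single pass. The hash_set()/hash_lookup() names mirror those used elsewhere in this PR, but the signatures and the helper itself are illustrative assumptions:

```c
#include <Rinternals.h>

/* Assumed hashtab API, for illustration only. */
typedef struct hashtab hashtab;
void hash_set(hashtab *h, SEXP key, int value);
int  hash_lookup(const hashtab *h, SEXP key, int absent);

/* Sketch: hash x (the needles), then scan table once, recording the first
 * 1-based position at which each needle appears; table elements that are
 * not needles are skipped. */
static void chmatch_hash_x(hashtab *h, const SEXP *xd, int nx,
                           const SEXP *tabled, int ntable,
                           int *ansd, int nomatch) {
  for (int i = 0; i < nx; ++i)
    hash_set(h, xd[i], 0);                   /* needle present, no match yet */
  for (int j = 0; j < ntable; ++j)
    if (hash_lookup(h, tabled[j], -1) == 0)  /* a needle seen for the first time */
      hash_set(h, tabled[j], j + 1);
  for (int i = 0; i < nx; ++i) {
    const int m = hash_lookup(h, xd[i], 0);
    ansd[i] = m ? m : nomatch;
  }
}
```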
* Use linear probing instead of double hashing (#7455)
* use linear probing instead of double hashing
* remove mask from struct
* fix comment
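For reference, a minimal sketch of lookup-or-insert with linear probing over a power-of-two table, the design the bullets above converge on; the slot layout and mixing constant are assumptions, not the actual hashtab code:

```c
#include <stddef.h>
#include <stdint.h>

typedef struct { const void *key; int value; } hslot;  /* illustrative slot layout */

/* Sketch: the table has mask+1 slots (a power of two) and the caller keeps
 * the load factor below 1, so the probe loop always stops at an empty slot
 * or a matching key. */
static int lookup_or_insert(hslot *slots, size_t mask,
                            const void *key, int value_if_new) {
  size_t i = ((uint32_t)((uintptr_t)key >> 4) * 2654435761u) & mask;
  while (slots[i].key != NULL && slots[i].key != key)
    i = (i + 1) & mask;                 /* linear probe, wrap around */
  if (slots[i].key == NULL) {           /* not found: insert */
    slots[i].key = key;
    slots[i].value = value_if_new;
  }
  return slots[i].value;
}
```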
* rbindlist(): better initial allocation size
Also adjust the minimum hash table size to avoid a shift overflow when
size=0 and a table with no free slots when size=1.
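A small illustrative sketch of such a sizing rule, with an assumed minimum size and target load factor rather than data.table's actual constants:

```c
#include <stddef.h>

/* Sketch: round the requested capacity up to a power of two, aiming for a
 * load factor of at most 0.5. Starting from a minimum of 4 avoids the
 * degenerate sizes noted above: size 0 (shift by the full word width is
 * undefined) and size 1 (no free slot at which probing can stop). */
static size_t table_size_for(size_t n_items) {
  size_t size = 4;
  while (size < 2 * n_items)
    size <<= 1;
  return size;
}
```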
* dogroups: replace remaining uses of SETLENGTH
* hashtab: switch to C allocator to avoid longjumps
PROTECT() the corresponding EXTPTRSXP while it is in use.
Introduce a separate hash_set_shared() operation that avoids long jumps.
Deallocate the previous hash table when growing a non-shared hash table.
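A hedged sketch of that pattern using the standard R external-pointer API; the struct layout and helper names are illustrative, not data.table's actual code:

```c
#include <Rinternals.h>
#include <stdlib.h>

typedef struct { size_t size, used; void **slots; } hashtab_sketch;  /* illustrative */

static void hashtab_finalize(SEXP xp) {
  hashtab_sketch *h = (hashtab_sketch *) R_ExternalPtrAddr(xp);
  if (h) {
    free(h->slots);
    free(h);
    R_ClearExternalPtr(xp);  /* avoid a double free if the finalizer runs again */
  }
}

/* Wrap a C-heap hash table in an EXTPTRSXP whose finalizer frees it, so the
 * memory is reclaimed even if an R error (longjmp) unwinds past the caller.
 * The caller PROTECT()s the returned SEXP while the table is in use. */
static SEXP hashtab_wrap(hashtab_sketch *h) {
  SEXP xp = PROTECT(R_MakeExternalPtr(h, R_NilValue, R_NilValue));
  R_RegisterCFinalizerEx(xp, hashtab_finalize, TRUE);
  UNPROTECT(1);
  return xp;
}
```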
* range_str: use hash_set_shared in OpenMP region
Explicitly check the return value and update the shared pointer when
necessary. If a reallocation attempt fails, signal an error when
possible.
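A minimal sketch of the calling convention described above; the signature of hash_set_shared() and the helper around it are assumptions:

```c
#include <stdbool.h>
#include <Rinternals.h>

typedef struct hashtab hashtab;                            /* opaque, illustrative */
hashtab *hash_set_shared(hashtab *h, SEXP key, int value); /* assumed signature */

/* Sketch of one insertion from inside the parallel region: the setter never
 * long-jumps; it returns the table pointer (possibly a fresh allocation after
 * growth) or NULL when that allocation fails, and the caller updates the
 * shared pointer, or records the failure, while inside the critical section. */
static bool mark_shared(hashtab **shared, SEXP key, int value) {
  bool ok = true;
  #pragma omp critical(hash_update)
  {
    hashtab *h = hash_set_shared(*shared, key, value);
    if (h != NULL)
      *shared = h;   /* publish the (possibly reallocated) table */
    else
      ok = false;    /* allocation failed; signal an error outside the region */
  }
  return ok;
}
```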
* range_str: allocate temporaries on the R heap
Avoid memory leaks from potential long jumps in hash_set().
* memrecycle: allocate temporary on the R heap
This avoids a memory leak in case growVector or hash_set fails.
* chmatchdup: allocate temporaries on the R heap
Prevent memory leak in case hash_set() causes a long jump.
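A hedged sketch of the R-heap temporaries used by the three items above: buffers from R_alloc() are released automatically when an error long-jumps, unlike calloc()'d buffers, which would leak if hash_set() signalled an error before free() was reached. The helper and sizes here are illustrative:

```c
#include <string.h>
#include <R.h>

/* Sketch only: R_alloc() memory needs no explicit free and is reclaimed when
 * the enclosing .Call returns or an error unwinds; vmaxget()/vmaxset() can
 * bracket the allocation so it is also released early on the success path. */
static void with_r_heap_temporaries(int nuniq, int mapsize) {
  const void *vmax = vmaxget();
  int *counts = (int *) R_alloc(nuniq, sizeof(int));
  int *map    = (int *) R_alloc(mapsize, sizeof(int));
  memset(counts, 0, (size_t)nuniq * sizeof(int));    /* R_alloc does not zero */
  memset(map,    0, (size_t)mapsize * sizeof(int));
  /* ... work that may call hash_set() and hence error() ... */
  vmaxset(vmax);  /* release the temporaries on the non-error path */
}
```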
* rbindlist: drop 'uniq'
Turns out it wasn't used.
* rbindlist: use hash_set_shared
Where C-heap allocations also exist, catch and handle potential
allocation failures from the hash table.
* range_str: propagate rehashed marks to caller
* fix stale comment
* NEWS item
Co-authored-by: Benjamin Schwendinger <[email protected]>
Co-authored-by: HughParsonage <[email protected]>
Co-authored-by: Jan Gorecki <[email protected]>
* glci: update status expectations for R-devel
* copyAsGrowable: use the correct number of arguments
---------
Co-authored-by: Benjamin Schwendinger <[email protected]>
Co-authored-by: Benjamin Schwendinger <[email protected]>
Co-authored-by: HughParsonage <[email protected]>
Co-authored-by: Jan Gorecki <[email protected]>
.gitlab-ci.yml:

-Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 2 NOTEs"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (non-API calls, V8 package) but ", shQuote(l)) else q("no")'
+Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 1 NOTE"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (V8 package) but ", shQuote(l)) else q("no")'
 
 ## R-devel on Linux clang
 # R compiled with clang, flags removed: -flto=auto -fopenmp
@@ -206,7 +206,7 @@ test-lin-dev-clang-cran:
 - R CMD check --as-cran $(ls -1t data.table_*.tar.gz | head -n 1)
-Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 2 NOTEs"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (non-API calls, V8 package) but ", shQuote(l)) else q("no")'
+Rscript -e 'l=tail(readLines("data.table.Rcheck/00check.log"), 1L); notes<-"Status: 1 NOTE"; if (!identical(l, notes)) stop("Last line of ", shQuote("00check.log"), " is not ", shQuote(notes), " (V8 package) but ", shQuote(l)) else q("no")'
NEWS.md (2 additions, 0 deletions):
@@ -379,6 +379,8 @@ See [#2611](https://github.com/Rdatatable/data.table/issues/2611) for details. T
 
 8. Retain important information in the error message about the source of the error when `i=` fails, e.g. pointing to `charToDate()` failing in `DT[date_col == "20250101"]`, [#7444](https://github.com/Rdatatable/data.table/issues/7444). Thanks @jan-swissre for the report and @MichaelChirico for the fix.
 
+9. Internal use of declared non-API R functions `SETLENGTH`, `TRUELENGTH`, `SET_TRUELENGTH`, and `SET_GROWABLE_BIT` has been eliminated. Most usages have been migrated to R's experimental resizable vectors API (thanks to @ltierney, introduced in R 4.6.0, backported for older R versions), [#7451](https://github.com/Rdatatable/data.table/pull/7451). Uses of `TRUELENGTH` for marking seen items during grouping and binding operations (aka free hash table trick) have been replaced with proper hash tables, [#6694](https://github.com/Rdatatable/data.table/pull/6694). The new hash table implementation uses linear probing with power of 2 tables and automatic resizing. Additionally, `chmatch()` now hashes the needle (`x`) instead of the haystack (`table`) when `length(table) >> length(x)`, significantly improving performance for lookups into large tables. We've benchmarked the refactored code and find the performance satisfactory, but please do report any edge case performance regressions we may have missed. Thanks to @aitap, @ben-schwen, @jangorecki and @HughParsonage for implementation and reviews.
+
 ## data.table [v1.17.8](https://github.com/Rdatatable/data.table/milestone/41) (6 July 2025)
 
 1. Internal functions used to signal errors are now marked as non-returning, silencing a compiler warning about potentially unchecked allocation failure. Thanks to Prof. Brian D. Ripley for the report and @aitap for the fix, [#7070](https://github.com/Rdatatable/data.table/pull/7070).
R/data.table.R:

 # TODO add: if (max(len__)==nrow) stopf("There is no need to deep copy x in this case")
 # TODO move down to dogroup.c, too.
-SDenv$.SDall = .Call(CsubsetDT, x, if (length(len__)) seq_len(max(len__)) else 0L, xcols)  # must be deep copy when largest group is a subset
+SDenv$.SDall = .Call(CcopyAsGrowable, .Call(CsubsetDT, x, if (length(len__)) seq_len(max(len__)) else 0L, xcols))  # must be deep copy when largest group is a subset
 if (!is.data.table(SDenv$.SDall)) setattr(SDenv$.SDall, "class", c("data.table","data.frame"))  # DF |> DT(,.SD[...],by=grp) needs .SD to be data.table, test 2022.012
 if (xdotcols) setattr(SDenv$.SDall, 'names', ansvars[xcolsAns])  # now that we allow 'x.' prefix in 'j', #2313 bug fix - [xcolsAns]
src/assign.c (memrecycle):

-        for (int j=0; j<k; ++j) SET_TRUELENGTH(s, 0); // wipe our negative usage and restore 0
-        savetl_end(); // then restore R's own usage (if any)
-        internal_error(__func__, "levels of target are either not unique or have truelength<0"); // # nocov
-        // # nocov end
-      }
-      SET_TRUELENGTH(s, -k-1);
+      hash_set(marks, s, -k-1);
     }
     int nAdd=0;
     for (int k=0; k<nSourceLevels; ++k) {
       const SEXP s = sourceLevelsD[k];
-      const int tl = TRUELENGTH(s);
+      const int tl = hash_lookup(marks, s, 0);
       if (tl>=0) {
         if (!sourceIsFactor && s==NA_STRING) continue; // don't create NA factor level when assigning character to factor; test 2117
-        if (tl>0) savetl(s);
-        SET_TRUELENGTH(s, -nTargetLevels-(++nAdd));
+        hash_set(marks, s, -nTargetLevels-(++nAdd));
       } // else, when sourceIsString, it's normal for there to be duplicates here
     }
     const int nSource = length(source);
@@ -793,45 +783,36 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
     const int *sourceD = INTEGER(source);
     for (int i=0; i<nSource; ++i) { // convert source integers to refer to target levels
       const int val = sourceD[i];
-      newSourceD[i] = val==NA_INTEGER ? NA_INTEGER : -TRUELENGTH(sourceLevelsD[val-1]); // retains NA factor levels here via TL(NA_STRING); e.g. ordered factor
+      newSourceD[i] = val==NA_INTEGER ? NA_INTEGER : -hash_lookup(marks, sourceLevelsD[val-1], 0); // retains NA factor levels here via TL(NA_STRING); e.g. ordered factor
     }
   } else {
     const SEXP *sourceD = STRING_PTR_RO(source);
     for (int i=0; i<nSource; ++i) { // convert source integers to refer to target levels
src/chmatch.c (chmatchdup):

 unsigned int mapsize = tablelen+nuniq; // lto compilation warning #5760 // +nuniq to store a 0 at the end of each group
-int *counts = calloc(nuniq, sizeof(*counts));
-int *map = calloc(mapsize, sizeof(*map));
-if (!counts || !map) {
-  // # nocov start
-  free(counts); free(map);
-  for (int i=0; i<tablelen; i++) SET_TRUELENGTH(td[i], 0);
-  savetl_end();
-  error(_("Failed to allocate %"PRIu64" bytes working memory in chmatchdup: length(table)=%d length(unique(table))=%d"), ((uint64_t)tablelen*2+nuniq)*sizeof(int), tablelen, nuniq);
-  // # nocov end
-}
-for (int i=0; i<tablelen; ++i) counts[-TRUELENGTH(td[i])-1]++;