Skip to content

Commit d7a9a17

Browse files
committed
Dynamically grow the hash table with bound unknown
In forder() and rbindlist(), there is no good upper bound, known ahead of time, on the number of elements that will be inserted into the hash. Grow the hash table dynamically instead. Since the R/W locks are far too slow and OpenMP atomics are too limited, rely on strategically placed flushes, which isn't really a solution.
1 parent 24e8178 commit d7a9a17

File tree

4 files changed

+198
-68
lines changed

4 files changed

+198
-68
lines changed

src/data.table.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -283,13 +283,10 @@ SEXP notchin(SEXP x, SEXP table);
283283
typedef struct hash_tab hashtab;
284284
// Allocate, initialise, and return a pointer to the new hash table.
285285
// n is the maximal number of elements that will be inserted.
286-
// load_factor is a real in (0, 1) specifying the desired fraction of used table elements.
287286
// Lower load factors lead to fewer collisions and faster lookups, but waste memory.
288287
// May raise an R error if an allocation fails or a size is out of bounds.
289288
// The table is temporary (allocated via R_alloc()) and will be unprotected upon return from the .Call().
290289
// See vmaxget()/vmaxset() if you need to unprotect it manually.
291-
hashtab * hash_create_(size_t n, double load_factor);
292-
// Hard-coded "good enough" load_factor
293290
hashtab * hash_create(size_t n);
294291
// Inserts a new key-value pair into the hash, or overwrites an existing value.
295292
// Will raise an R error if inserting more than n elements.
@@ -298,6 +295,15 @@ void hash_set(hashtab *, SEXP key, R_xlen_t value);
298295
// Returns the value corresponding to the key present in the hash, otherwise returns ifnotfound.
299296
R_xlen_t hash_lookup(const hashtab *, SEXP key, R_xlen_t ifnotfound);
300297

298+
// The dynamically-allocated hash table has a public field for the R protection wrapper.
299+
// Keep it PROTECTed while the table is in use.
300+
typedef struct dhash_tab {
301+
SEXP prot;
302+
} dhashtab;
303+
dhashtab * dhash_create(size_t n);
304+
void dhash_set(dhashtab * h, SEXP key, R_xlen_t value);
305+
R_xlen_t dhash_lookup(dhashtab * h, SEXP key, R_xlen_t ifnotfound);
306+
301307
// functions called from R level .Call/.External and registered in init.c
302308
// these now live here to pass -Wstrict-prototypes, #5477
303309
// all arguments must be SEXP since they are called from R level

src/forder.c

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,7 @@ static void cradix(SEXP *x, int n)
287287
free(cradix_xtmp); cradix_xtmp=NULL;
288288
}
289289

290-
static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, bool *out_anynotascii, bool *out_anynotutf8, hashtab * marks)
290+
static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, bool *out_anynotascii, bool *out_anynotutf8, dhashtab * marks)
291291
// group numbers are left in truelength to be fetched by WRITE_KEY
292292
{
293293
int na_count=0;
@@ -302,13 +302,13 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max
302302
na_count++;
303303
continue;
304304
}
305-
// Why is it acceptable to call hash_lookup when marks can be shared between threads?
306-
// 1. There are no pointers to follow for hash_lookup() inside the hash table, so there's no danger of crashing by following a partially written pointer.
307-
// 2. If another thread writes s into the hash but hash_lookup() fails to see a non-zero value, we'll safely check it again in the critical section below.
305+
// Why is it acceptable to call dhash_lookup when marks can be shared between threads?
306+
// 1. We have rwlocks to avoid crashing on a pointer being invalidated by a different thread.
307+
// 2. We check again after entering the critical section.
308308
// 3. We only change the marks from zero to nonzero, so once a nonzero value is seen, it must be correct.
309-
if (hash_lookup(marks,s,0)<0) continue; // seen this group before
310-
#pragma omp critical
311-
if (hash_lookup(marks,s,0)>=0) { // another thread may have set it while I was waiting, so check it again
309+
if (dhash_lookup(marks,s,0)<0) continue; // seen this group before
310+
#pragma omp critical(range_str_write)
311+
if (dhash_lookup(marks,s,0)>=0) { // another thread may have set it while I was waiting, so check it again
312312
// now save unique SEXP in ustr so we can loop sort uniques when sorting too
313313
if (ustr_alloc<=ustr_n) {
314314
ustr_alloc = (ustr_alloc==0) ? 16384 : ustr_alloc*2; // small initial guess, negligible time to alloc 128KB (32 pages)
@@ -317,7 +317,7 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max
317317
if (ustr==NULL) STOP(_("Unable to realloc %d * %d bytes in range_str"), ustr_alloc, (int)sizeof(SEXP)); // # nocov
318318
}
319319
ustr[ustr_n++] = s;
320-
hash_set(marks, s, -ustr_n); // unique in any order is fine. first-appearance order is achieved later in count_group
320+
dhash_set(marks, s, -ustr_n); // unique in any order is fine. first-appearance order is achieved later in count_group
321321
if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s);
322322
if (!anynotutf8 && // even if anynotascii we still want to know if anynotutf8, and anynotutf8 implies anynotascii already
323323
!IS_ASCII(s)) { // anynotutf8 implies anynotascii and IS_ASCII will be cheaper than IS_UTF8, so start with this one
@@ -350,20 +350,20 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max
350350
if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s);
351351
}
352352
cradix(ustr3, ustr_n); // sort to detect possible duplicates after converting; e.g. two different non-utf8 map to the same utf8
353-
hash_set(marks, ustr3[0], -1);
353+
dhash_set(marks, ustr3[0], -1);
354354
int o = -1;
355355
for (int i=1; i<ustr_n; i++) {
356356
if (ustr3[i] == ustr3[i-1]) continue; // use the same o for duplicates
357-
hash_set(marks, ustr3[i], --o);
357+
dhash_set(marks, ustr3[i], --o);
358358
}
359359
// now use the 1-1 mapping from ustr to ustr2 to get the ordering back into original ustr, being careful to reset tl to 0
360360
int *tl = (int *)malloc(ustr_n * sizeof(int));
361361
if (!tl)
362362
STOP(_("Failed to alloc tl when converting strings to UTF8")); // # nocov
363363
const SEXP *tt = STRING_PTR_RO(ustr2);
364-
for (int i=0; i<ustr_n; i++) tl[i] = hash_lookup(marks, tt[i], 0); // fetches the o in ustr3 into tl which is ordered by ustr
365-
for (int i=0; i<ustr_n; i++) hash_set(marks, ustr3[i], 0); // reset to 0 tl of the UTF8 (and possibly non-UTF in ustr too)
366-
for (int i=0; i<ustr_n; i++) hash_set(marks, ustr[i], tl[i]); // put back the o into ustr's tl
364+
for (int i=0; i<ustr_n; i++) tl[i] = dhash_lookup(marks, tt[i], 0); // fetches the o in ustr3 into tl which is ordered by ustr
365+
for (int i=0; i<ustr_n; i++) dhash_set(marks, ustr3[i], 0); // reset to 0 tl of the UTF8 (and possibly non-UTF in ustr too)
366+
for (int i=0; i<ustr_n; i++) dhash_set(marks, ustr[i], tl[i]); // put back the o into ustr's tl
367367
free(tl);
368368
free(ustr3);
369369
UNPROTECT(1);
@@ -376,7 +376,7 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max
376376
// that this is always ascending; descending is done in WRITE_KEY using max-this
377377
cradix(ustr, ustr_n); // sorts ustr in-place by reference. assumes NA_STRING not present.
378378
for(int i=0; i<ustr_n; i++) // save ordering in the CHARSXP. negative so as to distinguish with R's own usage.
379-
hash_set(marks, ustr[i], -i-1);
379+
dhash_set(marks, ustr[i], -i-1);
380380
}
381381
// else group appearance order was already saved to tl in the first pass
382382
}
@@ -568,7 +568,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA
568568
STOP(_("Item %d of order (ascending/descending) is %d. Must be +1 or -1."), col+1, sortType);
569569
}
570570
//Rprintf(_("sortType = %d\n"), sortType);
571-
hashtab * marks = NULL; // only used for STRSXP below
571+
dhashtab * marks = NULL; // only used for STRSXP below
572572
switch(TYPEOF(x)) {
573573
case INTSXP : case LGLSXP : // TODO skip LGL and assume range [0,1]
574574
range_i32(INTEGER(x), nrow, &min, &max, &na_count);
@@ -605,7 +605,8 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA
605605
break;
606606
case STRSXP :
607607
// need2utf8 now happens inside range_str on the uniques
608-
marks = hash_create(nrow*2); // we mark both original and converted strings, hence the factor of 2
608+
marks = dhash_create(4096); // relatively small to allocate, can grow exponentially later
609+
PROTECT(marks->prot); n_protect++;
609610
range_str(STRING_PTR_RO(x), nrow, &min, &max, &na_count, &anynotascii, &anynotutf8, marks);
610611
break;
611612
default:
@@ -763,7 +764,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA
763764
if (nalast==-1) anso[i]=0;
764765
elem = naval;
765766
} else {
766-
elem = -hash_lookup(marks, xd[i], 0);
767+
elem = -dhash_lookup(marks, xd[i], 0);
767768
}
768769
WRITE_KEY
769770
}}

src/hash.c

Lines changed: 153 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#include <pthread.h>
2+
13
#include "data.table.h"
24

35
struct hash_pair {
@@ -10,44 +12,52 @@ struct hash_tab {
1012
struct hash_pair tb[];
1113
};
1214

13-
hashtab * hash_create(size_t n) { return hash_create_(n, .5); }
1415
// TAOCP vol. 3, section 6.4: for multiplication hashing, use A ~ 1/phi, the golden ratio.
1516
static const double hash_multiplier = 0.618033988749895;
1617

17-
hashtab * hash_create_(size_t n, double load_factor) {
18+
static R_INLINE size_t get_full_size(size_t n_elements, double load_factor) {
1819
if (load_factor <= 0 || load_factor >= 1)
19-
internal_error("hash_create", "load_factor=%g not in (0, 1)", load_factor); // # nocov
20+
internal_error(__func__, "load_factor=%g not in (0, 1)", load_factor); // # nocov
2021
// precondition: n / load_factor < SIZE_MAX
21-
// truncate to compare in exact integer arithmetic and preserve all bits of n
22-
if ((size_t)(SIZE_MAX * load_factor) <= n) internal_error(
23-
"hash_create", "n=%zu / load_factor=%g would overflow size_t",
24-
n, load_factor
22+
// this is implemented a bit stricter than needed and would fail some almost-too-high sizes
23+
// due to the size_t -> double conversion
24+
if ((size_t)((double)SIZE_MAX * load_factor) <= n_elements) internal_error(
25+
__func__, "n=%zu / load_factor=%g would overflow size_t",
26+
n_elements, load_factor
2527
);
26-
size_t n_full = ceil(n / load_factor);
28+
return ceil(n_elements / load_factor);
29+
}
30+
31+
static hashtab * hash_create_(size_t n, double load_factor) {
32+
size_t n_full = get_full_size(n, load_factor);
2733
// precondition: sizeof hashtab + hash_pair[n_full] < SIZE_MAX
2834
// n_full * sizeof hash_pair < SIZE_MAX - sizeof hashtab
2935
// sizeof hash_pair < (SIZE_MAX - sizeof hashtab) / n_full
3036
// (note that sometimes n is 0)
31-
if (n_full && sizeof(struct hash_pair) >= (SIZE_MAX - sizeof(hashtab)) / n_full) internal_error(
32-
"hash_create", "n=%zu with load_factor=%g would overflow total allocation size",
33-
n, load_factor
34-
);
37+
if (n_full && sizeof(struct hash_pair) >= (SIZE_MAX - sizeof(hashtab)) / n_full)
38+
internal_error(
39+
__func__, "n=%zu with load_factor=%g would overflow total allocation size",
40+
n, load_factor
41+
);
3542
hashtab * ret = (hashtab *)R_alloc(sizeof(hashtab) + sizeof(struct hash_pair[n_full]), 1);
3643
ret->size = n_full;
3744
ret->free = n;
3845
// To compute floor(size * (A * key % 1)) in integer arithmetic with A < 1, use ((size * A) * key) % size.
3946
ret->multiplier = n_full * hash_multiplier;
4047
// No valid SEXP is a null pointer, so it's a safe marker for empty cells.
4148
for (size_t i = 0; i < n_full; ++i)
42-
ret->tb[i] = (struct hash_pair){.key = NULL, .value = 0};
49+
ret->tb[i].key = NULL;
4350
return ret;
4451
}
4552

53+
hashtab * hash_create(size_t n) { return hash_create_(n, .5); }
54+
4655
// Hashing for an open addressing hash table. See Cormen et al., Introduction to Algorithms, 3rd ed., section 11.4.
4756
// This is far from perfect. Make size a prime or a power of two and you'll be able to use double hashing.
4857
static R_INLINE size_t hash_index(SEXP key, uintptr_t multiplier, size_t offset, size_t size) {
4958
// The 4 lowest bits of the pointer are probably zeroes because a typical SEXPREC exceeds 16 bytes in size.
50-
// Since SEXPRECs are heap-allocated, they are subject to malloc() alignment guarantees, which is at least 4 bytes on 32-bit platforms, most likely more than 8 bytes.
59+
// Since SEXPRECs are heap-allocated, they are subject to malloc() alignment guarantees,
60+
// which is at least 4 bytes on 32-bit platforms, most likely more than 8 bytes.
5161
return ((((uintptr_t)key) >> 4) * multiplier + offset) % size;
5262
}
5363

@@ -59,15 +69,15 @@ void hash_set(hashtab * h, SEXP key, R_xlen_t value) {
5969
return;
6070
} else if (!cell->key) {
6171
if (!h->free) internal_error(
62-
"hash_insert", "no free slots left (full size=%zu)", h->size
72+
__func__, "no free slots left (full size=%zu)", h->size
6373
);
6474
--h->free;
6575
*cell = (struct hash_pair){.key = key, .value = value};
6676
return;
6777
}
6878
}
6979
internal_error( // # nocov
70-
"hash_insert", "did not find a free slot for key %p; size=%zu, free=%zu",
80+
__func__, "did not find a free slot for key %p; size=%zu, free=%zu",
7181
(void*)key, h->size, h->free
7282
);
7383
}
@@ -84,3 +94,130 @@ R_xlen_t hash_lookup(const hashtab * h, SEXP key, R_xlen_t ifnotfound) {
8494
// Should be impossible with a load factor below 1, but just in case:
8595
return ifnotfound; // # nocov
8696
}
97+
98+
typedef struct dhashtab_ {
99+
dhashtab public; // must be at offset 0
100+
size_t size, used, limit;
101+
uintptr_t multiplier;
102+
struct hash_pair *table, *previous;
103+
} dhashtab_;
104+
105+
static void dhash_finalizer(SEXP dhash) {
106+
dhashtab_ * self = R_ExternalPtrAddr(dhash);
107+
if (!self) return;
108+
R_ClearExternalPtr(dhash);
109+
free(self->previous);
110+
free(self->table);
111+
free(self);
112+
}
113+
114+
static struct hash_pair * dhash_allocate(size_t n_full) {
115+
if (n_full > SIZE_MAX / sizeof(struct hash_pair))
116+
internal_error(__func__, "%zu hash table slots would overflow size_t", n_full); // # nocov
117+
struct hash_pair * new = malloc(sizeof(struct hash_pair[n_full]));
118+
if (!new) internal_error(__func__, "failed to malloc() %zu hash table slots", n_full); // # nocov
119+
for (size_t i = 0; i < n_full; ++i) new[i] = (struct hash_pair){.key = NULL};
120+
return new;
121+
}
122+
123+
static dhashtab * dhash_create_(size_t n, double load_factor) {
124+
size_t n_full = get_full_size(n, load_factor);
125+
126+
SEXP prot = PROTECT(R_MakeExternalPtr(NULL, R_NilValue, R_NilValue));
127+
R_RegisterCFinalizerEx(prot, dhash_finalizer, TRUE);
128+
dhashtab_ * self = malloc(sizeof(dhashtab_));
129+
if (!self) internal_error(__func__, "failed to malloc() the hash table header"); // # nocov
130+
*self = (dhashtab_){
131+
.public = { .prot = prot },
132+
};
133+
R_SetExternalPtrAddr(prot, self);
134+
135+
self->table = dhash_allocate(n_full);
136+
self->size = n_full;
137+
self->limit = n;
138+
self->multiplier = n_full * hash_multiplier;
139+
// this is the last time we're allowed to set the table parts piece by piece
140+
141+
UNPROTECT(1);
142+
return &self->public;
143+
}
144+
145+
dhashtab * dhash_create(size_t n) { return dhash_create_(n, .5); }
146+
147+
static void dhash_enlarge(dhashtab_ * self) {
148+
if (self->size > SIZE_MAX / 2)
149+
internal_error(__func__, "doubling %zu elements would overflow size_t", self->size); // # nocov
150+
size_t new_size = self->size * 2;
151+
struct hash_pair * new = dhash_allocate(new_size);
152+
uintptr_t new_multiplier = new_size * hash_multiplier;
153+
for (size_t i = 0; i < self->size; ++i) {
154+
for (size_t j = 0; j < new_size; ++j) {
155+
size_t ii = hash_index(self->table[i].key, new_multiplier, j, new_size);
156+
if (!new[ii].key) {
157+
new[ii] = (struct hash_pair){
158+
.key = self->table[i].key,
159+
.value = self->table[i].value
160+
};
161+
break;
162+
}
163+
}
164+
}
165+
// Not trying to protect from calls to _set -> _enlarge from other threads!
166+
// Writes only come from a critical section, so two threads will not attempt to enlarge at the same time.
167+
// What we have to prevent is yanking the self->table from under a different thread reading it right now.
168+
free(self->previous);
169+
struct hash_pair * previous = self->table;
170+
dhashtab public = self->public;
171+
size_t used = self->used, limit = self->limit*2;
172+
*self = (dhashtab_){
173+
.public = public,
174+
.size = new_size,
175+
.used = used,
176+
.limit = limit,
177+
.multiplier = new_multiplier,
178+
.table = new,
179+
.previous = previous,
180+
};
181+
#pragma omp flush // no locking or atomic access! this is bad
182+
}
183+
184+
void dhash_set(dhashtab * h, SEXP key, R_xlen_t value) {
185+
dhashtab_ * self = (dhashtab_ *)h;
186+
again:
187+
for (size_t i = 0; i < self->size; ++i) {
188+
struct hash_pair * cell = self->table + hash_index(key, self->multiplier, i, self->size);
189+
if (cell->key == key) {
190+
cell->value = value;
191+
return;
192+
} else if (!cell->key) {
193+
if (self->used < self->limit) {
194+
*cell = (struct hash_pair){ .key = key, .value = value };
195+
++self->used;
196+
return;
197+
}
198+
dhash_enlarge(self);
199+
goto again; // won't be needed next time with the limit doubled
200+
}
201+
}
202+
internal_error( // # nocov
203+
__func__, "did not find a free slot for key %p; size=%zu, used=%zu, limit=%zu",
204+
(void*)key, self->size, self->used, self->limit
205+
);
206+
}
207+
208+
R_xlen_t dhash_lookup(dhashtab * h, SEXP key, R_xlen_t ifnotfound) {
209+
#pragma omp flush // no locking or atomic access! this is bad
210+
dhashtab_ self = *(dhashtab_ *)h;
211+
R_xlen_t ret = ifnotfound;
212+
for (size_t i = 0; i < self.size; ++i) {
213+
const struct hash_pair * cell = self.table + hash_index(key, self.multiplier, i, self.size);
214+
if (cell->key == key) {
215+
ret = cell->value;
216+
goto done;
217+
} else if (!cell->key) {
218+
goto done;
219+
}
220+
}
221+
done:
222+
return ret;
223+
}

0 commit comments

Comments
 (0)