Skip to content

Commit d7a9a17

Browse files
committed
Dynamically grow the hash table with bound unknown
In forder() and rbindlist(), there is no good upper bound, known ahead of time, on the number of elements that will be inserted into the hash. Grow the hash table dynamically instead. Since the R/W locks are far too slow and OpenMP atomics are too limited, rely on strategically placed flushes, which isn't really a solution.
1 parent 24e8178 commit d7a9a17

File tree

4 files changed

+198
-68
lines changed

4 files changed

+198
-68
lines changed

src/data.table.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -283,13 +283,10 @@ SEXP notchin(SEXP x, SEXP table);
283283
typedef struct hash_tab hashtab;
284284
// Allocate, initialise, and return a pointer to the new hash table.
285285
// n is the maximal number of elements that will be inserted.
286-
// load_factor is a real in (0, 1) specifying the desired fraction of used table elements.
287286
// Lower load factors lead to fewer collisions and faster lookups, but waste memory.
288287
// May raise an R error if an allocation fails or a size is out of bounds.
289288
// The table is temporary (allocated via R_alloc()) and will be unprotected upon return from the .Call().
290289
// See vmaxget()/vmaxset() if you need to unprotect it manually.
291-
hashtab * hash_create_(size_t n, double load_factor);
292-
// Hard-coded "good enough" load_factor
293290
hashtab * hash_create(size_t n);
294291
// Inserts a new key-value pair into the hash, or overwrites an existing value.
295292
// Will raise an R error if inserting more than n elements.
@@ -298,6 +295,15 @@ void hash_set(hashtab *, SEXP key, R_xlen_t value);
298295
// Returns the value corresponding to the key present in the hash, otherwise returns ifnotfound.
299296
R_xlen_t hash_lookup(const hashtab *, SEXP key, R_xlen_t ifnotfound);
300297

298+
// The dynamically-allocated hash table has a public field for the R protection wrapper.
299+
// Keep it PROTECTed while the table is in use.
300+
typedef struct dhash_tab {
301+
SEXP prot;
302+
} dhashtab;
303+
dhashtab * dhash_create(size_t n);
304+
void dhash_set(dhashtab * h, SEXP key, R_xlen_t value);
305+
R_xlen_t dhash_lookup(dhashtab * h, SEXP key, R_xlen_t ifnotfound);
306+
301307
// functions called from R level .Call/.External and registered in init.c
302308
// these now live here to pass -Wstrict-prototypes, #5477
303309
// all arguments must be SEXP since they are called from R level

src/forder.c

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,7 @@ static void cradix(SEXP *x, int n)
287287
free(cradix_xtmp); cradix_xtmp=NULL;
288288
}
289289

290-
static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, bool *out_anynotascii, bool *out_anynotutf8, hashtab * marks)
290+
static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max, int *out_na_count, bool *out_anynotascii, bool *out_anynotutf8, dhashtab * marks)
291291
// group numbers are left in truelength to be fetched by WRITE_KEY
292292
{
293293
int na_count=0;
@@ -302,13 +302,13 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max
302302
na_count++;
303303
continue;
304304
}
305-
// Why is it acceptable to call hash_lookup when marks can be shared between threads?
306-
// 1. There are no pointers to follow for hash_lookup() inside the hash table, so there's no danger of crashing by following a partially written pointer.
307-
// 2. If another thread writes s into the hash but hash_lookup() fails to see a non-zero value, we'll safely check it again in the critical section below.
305+
// Why is it acceptable to call dhash_lookup when marks can be shared between threads?
306+
// 1. We have rwlocks to avoid crashing on a pointer being invalidated by a different thread.
307+
// 2. We check again after entering the critical section.
308308
// 3. We only change the marks from zero to nonzero, so once a nonzero value is seen, it must be correct.
309-
if (hash_lookup(marks,s,0)<0) continue; // seen this group before
310-
#pragma omp critical
311-
if (hash_lookup(marks,s,0)>=0) { // another thread may have set it while I was waiting, so check it again
309+
if (dhash_lookup(marks,s,0)<0) continue; // seen this group before
310+
#pragma omp critical(range_str_write)
311+
if (dhash_lookup(marks,s,0)>=0) { // another thread may have set it while I was waiting, so check it again
312312
// now save unique SEXP in ustr so we can loop sort uniques when sorting too
313313
if (ustr_alloc<=ustr_n) {
314314
ustr_alloc = (ustr_alloc==0) ? 16384 : ustr_alloc*2; // small initial guess, negligible time to alloc 128KB (32 pages)
@@ -317,7 +317,7 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max
317317
if (ustr==NULL) STOP(_("Unable to realloc %d * %d bytes in range_str"), ustr_alloc, (int)sizeof(SEXP)); // # nocov
318318
}
319319
ustr[ustr_n++] = s;
320-
hash_set(marks, s, -ustr_n); // unique in any order is fine. first-appearance order is achieved later in count_group
320+
dhash_set(marks, s, -ustr_n); // unique in any order is fine. first-appearance order is achieved later in count_group
321321
if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s);
322322
if (!anynotutf8 && // even if anynotascii we still want to know if anynotutf8, and anynotutf8 implies anynotascii already
323323
!IS_ASCII(s)) { // anynotutf8 implies anynotascii and IS_ASCII will be cheaper than IS_UTF8, so start with this one
@@ -350,20 +350,20 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max
350350
if (LENGTH(s)>ustr_maxlen) ustr_maxlen=LENGTH(s);
351351
}
352352
cradix(ustr3, ustr_n); // sort to detect possible duplicates after converting; e.g. two different non-utf8 map to the same utf8
353-
hash_set(marks, ustr3[0], -1);
353+
dhash_set(marks, ustr3[0], -1);
354354
int o = -1;
355355
for (int i=1; i<ustr_n; i++) {
356356
if (ustr3[i] == ustr3[i-1]) continue; // use the same o for duplicates
357-
hash_set(marks, ustr3[i], --o);
357+
dhash_set(marks, ustr3[i], --o);
358358
}
359359
// now use the 1-1 mapping from ustr to ustr2 to get the ordering back into original ustr, being careful to reset tl to 0
360360
int *tl = (int *)malloc(ustr_n * sizeof(int));
361361
if (!tl)
362362
STOP(_("Failed to alloc tl when converting strings to UTF8")); // # nocov
363363
const SEXP *tt = STRING_PTR_RO(ustr2);
364-
for (int i=0; i<ustr_n; i++) tl[i] = hash_lookup(marks, tt[i], 0); // fetches the o in ustr3 into tl which is ordered by ustr
365-
for (int i=0; i<ustr_n; i++) hash_set(marks, ustr3[i], 0); // reset to 0 tl of the UTF8 (and possibly non-UTF in ustr too)
366-
for (int i=0; i<ustr_n; i++) hash_set(marks, ustr[i], tl[i]); // put back the o into ustr's tl
364+
for (int i=0; i<ustr_n; i++) tl[i] = dhash_lookup(marks, tt[i], 0); // fetches the o in ustr3 into tl which is ordered by ustr
365+
for (int i=0; i<ustr_n; i++) dhash_set(marks, ustr3[i], 0); // reset to 0 tl of the UTF8 (and possibly non-UTF in ustr too)
366+
for (int i=0; i<ustr_n; i++) dhash_set(marks, ustr[i], tl[i]); // put back the o into ustr's tl
367367
free(tl);
368368
free(ustr3);
369369
UNPROTECT(1);
@@ -376,7 +376,7 @@ static void range_str(const SEXP *x, int n, uint64_t *out_min, uint64_t *out_max
376376
// that this is always ascending; descending is done in WRITE_KEY using max-this
377377
cradix(ustr, ustr_n); // sorts ustr in-place by reference. assumes NA_STRING not present.
378378
for(int i=0; i<ustr_n; i++) // save ordering in the CHARSXP. negative so as to distinguish with R's own usage.
379-
hash_set(marks, ustr[i], -i-1);
379+
dhash_set(marks, ustr[i], -i-1);
380380
}
381381
// else group appearance order was already saved to tl in the first pass
382382
}
@@ -568,7 +568,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA
568568
STOP(_("Item %d of order (ascending/descending) is %d. Must be +1 or -1."), col+1, sortType);
569569
}
570570
//Rprintf(_("sortType = %d\n"), sortType);
571-
hashtab * marks = NULL; // only used for STRSXP below
571+
dhashtab * marks = NULL; // only used for STRSXP below
572572
switch(TYPEOF(x)) {
573573
case INTSXP : case LGLSXP : // TODO skip LGL and assume range [0,1]
574574
range_i32(INTEGER(x), nrow, &min, &max, &na_count);
@@ -605,7 +605,8 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA
605605
break;
606606
case STRSXP :
607607
// need2utf8 now happens inside range_str on the uniques
608-
marks = hash_create(nrow*2); // we mark both original and converted strings, hence the factor of 2
608+
marks = dhash_create(4096); // relatively small to allocate, can grow exponentially later
609+
PROTECT(marks->prot); n_protect++;
609610
range_str(STRING_PTR_RO(x), nrow, &min, &max, &na_count, &anynotascii, &anynotutf8, marks);
610611
break;
611612
default:
@@ -763,7 +764,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA
763764
if (nalast==-1) anso[i]=0;
764765
elem = naval;
765766
} else {
766-
elem = -hash_lookup(marks, xd[i], 0);
767+
elem = -dhash_lookup(marks, xd[i], 0);
767768
}
768769
WRITE_KEY
769770
}}

src/hash.c

Lines changed: 153 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#include <pthread.h>
2+
13
#include "data.table.h"
24

35
struct hash_pair {
@@ -10,44 +12,52 @@ struct hash_tab {
1012
struct hash_pair tb[];
1113
};
1214

13-
hashtab * hash_create(size_t n) { return hash_create_(n, .5); }
1415
// TAOCP vol. 3, section 6.4: for multiplication hashing, use A ~ 1/phi, the golden ratio.
1516
static const double hash_multiplier = 0.618033988749895;
1617

17-
hashtab * hash_create_(size_t n, double load_factor) {
18+
static R_INLINE size_t get_full_size(size_t n_elements, double load_factor) {
1819
if (load_factor <= 0 || load_factor >= 1)
19-
internal_error("hash_create", "load_factor=%g not in (0, 1)", load_factor); // # nocov
20+
internal_error(__func__, "load_factor=%g not in (0, 1)", load_factor); // # nocov
2021
// precondition: n / load_factor < SIZE_MAX
21-
// truncate to compare in exact integer arithmetic and preserve all bits of n
22-
if ((size_t)(SIZE_MAX * load_factor) <= n) internal_error(
23-
"hash_create", "n=%zu / load_factor=%g would overflow size_t",
24-
n, load_factor
22+
// this is implemented a bit stricter than needed and would fail some almost-too-high sizes
23+
// due to the size_t -> double conversion
24+
if ((size_t)((double)SIZE_MAX * load_factor) <= n_elements) internal_error(
25+
__func__, "n=%zu / load_factor=%g would overflow size_t",
26+
n_elements, load_factor
2527
);
26-
size_t n_full = ceil(n / load_factor);
28+
return ceil(n_elements / load_factor);
29+
}
30+
31+
static hashtab * hash_create_(size_t n, double load_factor) {
32+
size_t n_full = get_full_size(n, load_factor);
2733
// precondition: sizeof hashtab + hash_pair[n_full] < SIZE_MAX
2834
// n_full * sizeof hash_pair < SIZE_MAX - sizeof hashtab
2935
// sizeof hash_pair < (SIZE_MAX - sizeof hashtab) / n_full
3036
// (note that sometimes n is 0)
31-
if (n_full && sizeof(struct hash_pair) >= (SIZE_MAX - sizeof(hashtab)) / n_full) internal_error(
32-
"hash_create", "n=%zu with load_factor=%g would overflow total allocation size",
33-
n, load_factor
34-
);
37+
if (n_full && sizeof(struct hash_pair) >= (SIZE_MAX - sizeof(hashtab)) / n_full)
38+
internal_error(
39+
__func__, "n=%zu with load_factor=%g would overflow total allocation size",
40+
n, load_factor
41+
);
3542
hashtab * ret = (hashtab *)R_alloc(sizeof(hashtab) + sizeof(struct hash_pair[n_full]), 1);
3643
ret->size = n_full;
3744
ret->free = n;
3845
// To compute floor(size * (A * key % 1)) in integer arithmetic with A < 1, use ((size * A) * key) % size.
3946
ret->multiplier = n_full * hash_multiplier;
4047
// No valid SEXP is a null pointer, so it's a safe marker for empty cells.
4148
for (size_t i = 0; i < n_full; ++i)
42-
ret->tb[i] = (struct hash_pair){.key = NULL, .value = 0};
49+
ret->tb[i].key = NULL;
4350
return ret;
4451
}
4552

53+
hashtab * hash_create(size_t n) { return hash_create_(n, .5); }
54+
4655
// Hashing for an open addressing hash table. See Cormen et al., Introduction to Algorithms, 3rd ed., section 11.4.
4756
// This is far from perfect. Make size a prime or a power of two and you'll be able to use double hashing.
4857
static R_INLINE size_t hash_index(SEXP key, uintptr_t multiplier, size_t offset, size_t size) {
4958
// The 4 lowest bits of the pointer are probably zeroes because a typical SEXPREC exceeds 16 bytes in size.
50-
// Since SEXPRECs are heap-allocated, they are subject to malloc() alignment guarantees, which is at least 4 bytes on 32-bit platforms, most likely more than 8 bytes.
59+
// Since SEXPRECs are heap-allocated, they are subject to malloc() alignment guarantees,
60+
// which is at least 4 bytes on 32-bit platforms, most likely more than 8 bytes.
5161
return ((((uintptr_t)key) >> 4) * multiplier + offset) % size;
5262
}
5363

@@ -59,15 +69,15 @@ void hash_set(hashtab * h, SEXP key, R_xlen_t value) {
5969
return;
6070
} else if (!cell->key) {
6171
if (!h->free) internal_error(
62-
"hash_insert", "no free slots left (full size=%zu)", h->size
72+
__func__, "no free slots left (full size=%zu)", h->size
6373
);
6474
--h->free;
6575
*cell = (struct hash_pair){.key = key, .value = value};
6676
return;
6777
}
6878
}
6979
internal_error( // # nocov
70-
"hash_insert", "did not find a free slot for key %p; size=%zu, free=%zu",
80+
__func__, "did not find a free slot for key %p; size=%zu, free=%zu",
7181
(void*)key, h->size, h->free
7282
);
7383
}
@@ -84,3 +94,130 @@ R_xlen_t hash_lookup(const hashtab * h, SEXP key, R_xlen_t ifnotfound) {
8494
// Should be impossible with a load factor below 1, but just in case:
8595
return ifnotfound; // # nocov
8696
}
97+
98+
typedef struct dhashtab_ {
99+
dhashtab public; // must be at offset 0
100+
size_t size, used, limit;
101+
uintptr_t multiplier;
102+
struct hash_pair *table, *previous;
103+
} dhashtab_;
104+
105+
static void dhash_finalizer(SEXP dhash) {
106+
dhashtab_ * self = R_ExternalPtrAddr(dhash);
107+
if (!self) return;
108+
R_ClearExternalPtr(dhash);
109+
free(self->previous);
110+
free(self->table);
111+
free(self);
112+
}
113+
114+
static struct hash_pair * dhash_allocate(size_t n_full) {
115+
if (n_full > SIZE_MAX / sizeof(struct hash_pair))
116+
internal_error(__func__, "%zu hash table slots would overflow size_t", n_full); // # nocov
117+
struct hash_pair * new = malloc(sizeof(struct hash_pair[n_full]));
118+
if (!new) internal_error(__func__, "failed to malloc() %zu hash table slots", n_full); // # nocov
119+
for (size_t i = 0; i < n_full; ++i) new[i] = (struct hash_pair){.key = NULL};
120+
return new;
121+
}
122+
123+
static dhashtab * dhash_create_(size_t n, double load_factor) {
124+
size_t n_full = get_full_size(n, load_factor);
125+
126+
SEXP prot = PROTECT(R_MakeExternalPtr(NULL, R_NilValue, R_NilValue));
127+
R_RegisterCFinalizerEx(prot, dhash_finalizer, TRUE);
128+
dhashtab_ * self = malloc(sizeof(dhashtab_));
129+
if (!self) internal_error(__func__, "failed to malloc() the hash table header"); // # nocov
130+
*self = (dhashtab_){
131+
.public = { .prot = prot },
132+
};
133+
R_SetExternalPtrAddr(prot, self);
134+
135+
self->table = dhash_allocate(n_full);
136+
self->size = n_full;
137+
self->limit = n;
138+
self->multiplier = n_full * hash_multiplier;
139+
// this is the last time we're allowed to set the table parts piece by piece
140+
141+
UNPROTECT(1);
142+
return &self->public;
143+
}
144+
145+
dhashtab * dhash_create(size_t n) { return dhash_create_(n, .5); }
146+
147+
static void dhash_enlarge(dhashtab_ * self) {
148+
if (self->size > SIZE_MAX / 2)
149+
internal_error(__func__, "doubling %zu elements would overflow size_t", self->size); // # nocov
150+
size_t new_size = self->size * 2;
151+
struct hash_pair * new = dhash_allocate(new_size);
152+
uintptr_t new_multiplier = new_size * hash_multiplier;
153+
for (size_t i = 0; i < self->size; ++i) {
154+
for (size_t j = 0; j < new_size; ++j) {
155+
size_t ii = hash_index(self->table[i].key, new_multiplier, j, new_size);
156+
if (!new[ii].key) {
157+
new[ii] = (struct hash_pair){
158+
.key = self->table[i].key,
159+
.value = self->table[i].value
160+
};
161+
break;
162+
}
163+
}
164+
}
165+
// Not trying to protect from calls to _set -> _enlarge from other threads!
166+
// Writes only come from a critical section, so two threads will not attempt to enlarge at the same time.
167+
// What we have to prevent is yanking the self->table from under a different thread reading it right now.
168+
free(self->previous);
169+
struct hash_pair * previous = self->table;
170+
dhashtab public = self->public;
171+
size_t used = self->used, limit = self->limit*2;
172+
*self = (dhashtab_){
173+
.public = public,
174+
.size = new_size,
175+
.used = used,
176+
.limit = limit,
177+
.multiplier = new_multiplier,
178+
.table = new,
179+
.previous = previous,
180+
};
181+
#pragma omp flush // no locking or atomic access! this is bad
182+
}
183+
184+
void dhash_set(dhashtab * h, SEXP key, R_xlen_t value) {
185+
dhashtab_ * self = (dhashtab_ *)h;
186+
again:
187+
for (size_t i = 0; i < self->size; ++i) {
188+
struct hash_pair * cell = self->table + hash_index(key, self->multiplier, i, self->size);
189+
if (cell->key == key) {
190+
cell->value = value;
191+
return;
192+
} else if (!cell->key) {
193+
if (self->used < self->limit) {
194+
*cell = (struct hash_pair){ .key = key, .value = value };
195+
++self->used;
196+
return;
197+
}
198+
dhash_enlarge(self);
199+
goto again; // won't be needed next time with the limit doubled
200+
}
201+
}
202+
internal_error( // # nocov
203+
__func__, "did not find a free slot for key %p; size=%zu, used=%zu, limit=%zu",
204+
(void*)key, self->size, self->used, self->limit
205+
);
206+
}
207+
208+
R_xlen_t dhash_lookup(dhashtab * h, SEXP key, R_xlen_t ifnotfound) {
209+
#pragma omp flush // no locking or atomic access! this is bad
210+
dhashtab_ self = *(dhashtab_ *)h;
211+
R_xlen_t ret = ifnotfound;
212+
for (size_t i = 0; i < self.size; ++i) {
213+
const struct hash_pair * cell = self.table + hash_index(key, self.multiplier, i, self.size);
214+
if (cell->key == key) {
215+
ret = cell->value;
216+
goto done;
217+
} else if (!cell->key) {
218+
goto done;
219+
}
220+
}
221+
done:
222+
return ret;
223+
}

0 commit comments

Comments
 (0)