Skip to content

Commit afd53b5

Browse files
committed
hashtab: switch to C allocator to avoid longjumps
PROTECT() the corresponding EXTPTRSXP while used. Introduce a separate hash_set_shared() operation that avoids long jumps. Deallocate the previous hash table when growing a non-shared hash table.
1 parent 5f17fab commit afd53b5

File tree

9 files changed

+100
-28
lines changed

9 files changed

+100
-28
lines changed

src/assign.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -762,6 +762,7 @@ const char *memrecycle(const SEXP target, const SEXP where, const int start, con
762762
const SEXP *targetLevelsD=STRING_PTR_RO(targetLevels), *sourceLevelsD=STRING_PTR_RO(sourceLevels);
763763
SEXP newSource = PROTECT(allocVector(INTSXP, length(source))); protecti++;
764764
hashtab * marks = hash_create((size_t)nTargetLevels + nSourceLevels);
765+
PROTECT(marks->prot); protecti++;
765766
for (int k=0; k<nTargetLevels; ++k) {
766767
const SEXP s = targetLevelsD[k];
767768
hash_set(marks, s, -k-1);

src/chmatch.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
5858
// When table >> x, hash x and scan table // ToDo tune the kick-in factor
5959
if (!chmatchdup && tablelen > 2 * xlen) {
6060
hashtab *marks = hash_create(xlen);
61+
PROTECT(marks->prot); nprotect++;
6162
int nuniq = 0;
6263
for (int i = 0; i < xlen; ++i) {
6364
// todo use lookup_insert?
@@ -93,6 +94,7 @@ static SEXP chmatchMain(SEXP x, SEXP table, int nomatch, bool chin, bool chmatch
9394
return ans;
9495
}
9596
hashtab * marks = hash_create(tablelen);
97+
PROTECT(marks->prot); nprotect++;
9698
int nuniq=0;
9799
for (int i=0; i<tablelen; ++i) {
98100
const SEXP s = td[i];

src/data.table.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -364,13 +364,12 @@ SEXP substitute_call_arg_namesR(SEXP expr, SEXP env);
364364
SEXP notchin(SEXP x, SEXP table);
365365

366366
// hash.c
367-
typedef struct hash_tab hashtab;
367+
typedef struct {
368+
SEXP prot; // make sure to PROTECT() while the table is in use
369+
} hashtab;
368370
// Allocate, initialise, and return a pointer to the new hash table.
369371
// n is the maximal number of elements that will be inserted.
370-
// Lower load factors lead to fewer collisions and faster lookups, but waste memory.
371372
// May raise an R error if an allocation fails or a size is out of bounds.
372-
// The table is temporary (allocated via R_alloc()) and will be unprotected upon return from the .Call().
373-
// See vmaxget()/vmaxset() if you need to unprotect it manually.
374373
hashtab * hash_create(size_t n);
375374
// Inserts a new key-value pair into the hash, or overwrites an existing value.
376375
// Will grow the table in a thread-unsafe manner if needed.

src/dogroups.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
9292
double nextTime = (showProgress) ? startTime+3 : 0; // wait 3 seconds before printing progress
9393

9494
hashtab * specials = hash_create(3 + ngrpcols + xlength(SDall)); // .I, .N, .GRP plus columns of .BY plus SDall
95+
PROTECT(specials->prot); nprotect++;
9596

9697
defineVar(sym_BY, BY = PROTECT(allocVector(VECSXP, ngrpcols)), env); nprotect++; // PROTECT for rchk
9798
SEXP bynames = PROTECT(allocVector(STRSXP, ngrpcols)); nprotect++; // TO DO: do we really need bynames, can we assign names afterwards in one step?

src/fmelt.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,7 @@ static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType
409409
for (R_xlen_t i = 0; i < nitem; ++i)
410410
hl += xlength(VECTOR_ELT(factorLevels, i));
411411
hashtab * marks = hash_create(hl);
412+
PROTECT(marks->prot);
412413
int nlevel=0;
413414
for (int i=0; i<nitem; ++i) {
414415
const SEXP this = VECTOR_ELT(factorLevels, i);
@@ -443,7 +444,7 @@ static SEXP combineFactorLevels(SEXP factorLevels, SEXP target, int * factorType
443444
} else {
444445
setAttrib(ans, R_ClassSymbol, ScalarString(char_factor));
445446
}
446-
UNPROTECT(1);
447+
UNPROTECT(2);
447448
return ans;
448449
}
449450

src/forder.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,7 @@ SEXP forder(SEXP DT, SEXP by, SEXP retGrpArg, SEXP retStatsArg, SEXP sortGroupsA
609609
case STRSXP :
610610
// need2utf8 now happens inside range_str on the uniques
611611
marks = hash_create(4096); // relatively small to allocate, can grow exponentially later
612+
PROTECT(marks->prot); n_protect++;
612613
range_str(STRING_PTR_RO(x), nrow, &min, &max, &na_count, &anynotascii, &anynotutf8, marks);
613614
break;
614615
default:

src/hash.c

Lines changed: 84 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,11 @@ struct hash_pair {
77
R_xlen_t value;
88
};
99
struct hash_tab {
10+
hashtab public; // must be the first member
1011
size_t size, free;
1112
int shift;
1213
struct hash_pair *table;
14+
struct hash_tab *next;
1315
};
1416

1517
static const double default_load_factor = .75;
@@ -45,38 +47,65 @@ static R_INLINE size_t get_full_size(size_t n_elements, double load_factor) {
4547
return pow2;
4648
}
4749

48-
static hashtab * hash_create_(size_t n, double load_factor) {
50+
static struct hash_tab * hash_create_(size_t n, double load_factor) {
4951
size_t n_full = get_full_size(n, load_factor);
50-
hashtab *ret = (hashtab *)R_alloc(sizeof(hashtab), 1);
52+
53+
struct hash_tab *ret = malloc(sizeof *ret);
54+
if (!ret)
55+
return NULL; // #nocov
56+
57+
// Not strictly portable but faster than an explicit loop setting keys to NULL
58+
struct hash_pair *table = calloc(n_full, sizeof(*table));
59+
if (!table) {
60+
// #nocov start
61+
free(ret);
62+
return NULL;
63+
// #nocov end
64+
}
65+
5166
ret->size = n_full;
5267
ret->free = (size_t)(n_full * load_factor);
5368

5469
int k = 0;
5570
while ((1ULL << k) < n_full) k++;
5671
ret->shift = HASH_BITS - k;
57-
ret->table = (struct hash_pair *)R_alloc(n_full, sizeof(*ret->table));
58-
// No valid SEXP is a null pointer, so it's a safe marker for empty cells.
59-
memset(ret->table, 0, n_full * sizeof(struct hash_pair));
72+
ret->table = table;
73+
ret->next = NULL;
74+
6075
return ret;
6176
}
6277

63-
hashtab * hash_create(size_t n) { return hash_create_(n, default_load_factor); }
64-
65-
static R_INLINE size_t hash_index(SEXP key, int shift) {
66-
return (size_t)((uintptr_t)key * HASH_MULTIPLIER) >> shift;
78+
static void hash_finalizer(SEXP prot) {
79+
struct hash_tab * h = R_ExternalPtrAddr(prot);
80+
R_ClearExternalPtr(prot);
81+
while (h) {
82+
struct hash_tab * next = h->next;
83+
free(h->table);
84+
free(h);
85+
h = next;
86+
}
6787
}
6888

69-
static R_INLINE hashtab *hash_rehash(const hashtab *h) {
70-
size_t new_size = h->size * 2;
71-
hashtab *new_h = hash_create_(new_size, default_load_factor);
89+
hashtab * hash_create(size_t n) {
90+
SEXP prot = R_MakeExternalPtr(NULL, R_NilValue, R_NilValue);
91+
R_RegisterCFinalizer(prot, hash_finalizer);
7292

73-
for (size_t i = 0; i < h->size; ++i) {
74-
if (h->table[i].key) hash_set(new_h, h->table[i].key, h->table[i].value);
75-
}
76-
return new_h;
93+
struct hash_tab *ret = hash_create_(n, default_load_factor);
94+
if (!ret) internal_error( // #nocov
95+
__func__, "failed to allocate a hash table for %zu entries", n
96+
);
97+
98+
ret->public.prot = prot;
99+
R_SetExternalPtrAddr(prot, ret);
100+
101+
return &ret->public;
77102
}
78103

79-
static bool hash_set_(hashtab *h, SEXP key, R_xlen_t value) {
104+
static R_INLINE size_t hash_index(SEXP key, int shift) {
105+
return (size_t)((uintptr_t)key * HASH_MULTIPLIER) >> shift;
106+
}
107+
108+
static bool hash_set_(struct hash_tab *h, SEXP key, R_xlen_t value) {
80109
size_t mask = h->size - 1;
81110
size_t idx = hash_index(key, h->shift);
82111
while (true) {
@@ -96,22 +125,54 @@ static bool hash_set_(hashtab *h, SEXP key, R_xlen_t value) {
96125
}
97126
}
98127

99-
void hash_set(hashtab *h, SEXP key, R_xlen_t value) {
128+
static R_INLINE struct hash_tab *hash_rehash(const struct hash_tab *h) {
129+
if (h->size > SIZE_MAX / 2) return NULL; // #nocov
130+
131+
size_t new_size = h->size * 2;
132+
struct hash_tab *new_h = hash_create_(new_size, default_load_factor);
133+
if (!new_h) return NULL;
134+
135+
new_h->public = h->public;
136+
137+
for (size_t i = 0; i < h->size; ++i) {
138+
if (h->table[i].key)
139+
(void)hash_set_(new_h, h->table[i].key, h->table[i].value);
140+
}
141+
142+
return new_h;
143+
}
144+
145+
void hash_set(hashtab *self, SEXP key, R_xlen_t value) {
146+
struct hash_tab *h = (struct hash_tab *)self;
100147
if (!hash_set_(h, key, value)) {
101-
*h = *hash_rehash(h);
148+
struct hash_tab *new_h = hash_rehash(h);
149+
if (!new_h) internal_error( // # nocov
150+
__func__, "hash table full at n_full=%zu and failed to rehash", h->size
151+
);
152+
// overwrite the existing table, keeping the external pointer
153+
free(h->table);
154+
*h = *new_h;
155+
free(new_h);
102156
(void)hash_set_(h, key, value); // must succeed on the second try
103157
}
104158
}
105159

106-
hashtab *hash_set_shared(hashtab *h, SEXP key, R_xlen_t value) {
160+
hashtab *hash_set_shared(hashtab *self, SEXP key, R_xlen_t value) {
161+
struct hash_tab *h = (struct hash_tab *)self;
107162
if (!hash_set_(h, key, value)) {
108-
h = hash_rehash(h);
163+
struct hash_tab * new_h = hash_rehash(h);
164+
if (!new_h) return NULL; // # nocov
165+
// existing 'h' may still be in use by another thread
166+
h->next = new_h;
167+
h = new_h;
109168
(void)hash_set_(h, key, value);
110169
}
111-
return h;
170+
return &h->public;
112171
}
113172

114-
R_xlen_t hash_lookup(const hashtab *h, SEXP key, R_xlen_t ifnotfound) {
173+
R_xlen_t hash_lookup(const hashtab *self, SEXP key, R_xlen_t ifnotfound) {
174+
const struct hash_tab *h = (const struct hash_tab *)self;
175+
115176
size_t mask = h->size - 1;
116177
size_t idx = hash_index(key, h->shift);
117178
while (true) {

src/rbindlist.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
7575
if (!uniq)
7676
error(_("Failed to allocate upper bound of %"PRId64" unique column names [sum(lapply(l,ncol))]"), (int64_t)upperBoundUniqueNames); // # nocov
7777
hashtab * marks = hash_create(upperBoundUniqueNames);
78+
PROTECT(marks->prot);
7879
int nuniq=0;
7980
// first pass - gather unique column names
8081
for (int i=0; i<LENGTH(l); i++) {
@@ -171,6 +172,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
171172
}
172173
free(uniq); free(counts); free(uniqMap); free(dupLink); // all local scope so no need to set to NULL
173174

175+
UNPROTECT(1); // marks
174176
// colMapRaw is still allocated. It was allocated with malloc because we needed to catch if the alloc failed.
175177
// move it to R's heap so it gets automatically free'd on exit, and on any error between now and the end of rbindlist.
176178
colMap = (int *)R_alloc(LENGTH(l)*ncol, sizeof(*colMap));
@@ -357,6 +359,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
357359
if (factor) {
358360
char warnStr[1000] = "";
359361
hashtab * marks = hash_create(xlength(longestLevels));
362+
PROTECT(marks->prot);
360363
int nLevel=0, allocLevel=0;
361364
SEXP *levelsRaw = NULL; // growing list of SEXP pointers. Raw since managed with raw realloc.
362365
if (orderedFactor) {
@@ -513,6 +516,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg, SEXP ignor
513516
} else {
514517
setAttrib(target, R_ClassSymbol, ScalarString(char_factor));
515518
}
519+
UNPROTECT(1); // marks
516520
} else { // factor==false
517521
for (int i=0; i<LENGTH(l); ++i) {
518522
const int thisnrow = eachMax[i];

src/utils.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,7 @@ void copySharedColumns(SEXP x) {
268268
if (!isNewList(x) || ncol==1) return;
269269
bool *shared = (bool *)R_alloc(ncol, sizeof(*shared)); // on R heap in case alloc fails
270270
hashtab * marks = hash_create(ncol);
271+
PROTECT(marks->prot);
271272
const SEXP *xp = SEXPPTR_RO(x);
272273
int nShared=0;
273274
for (int i=0; i<ncol; ++i) {
@@ -295,6 +296,7 @@ void copySharedColumns(SEXP x) {
295296
nShared);
296297
// GetVerbose() (slightly expensive call of all options) called here only when needed
297298
}
299+
UNPROTECT(1);
298300
}
299301

300302
// lock, unlock and islocked at C level :

0 commit comments

Comments
 (0)