
Commit c933247

Use linear probing instead of double hashing (#7455)
* use linear probing instead of double hashing
* remove mask from struct
1 parent e379405 commit c933247
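For context on the change described above, here is a minimal standalone sketch of the technique this commit switches to: a multiplicative (Fibonacci-style) hash of the key pointer picks a starting slot in a power-of-two table, and collisions are resolved by stepping to the next slot (linear probing). This is an illustration only, not the data.table code: the names toy_table, toy_put and toy_get are invented for this example, plain void * keys stand in for SEXP, 64-bit pointers are assumed, and the caller is responsible for keeping the table from filling up.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* 2^64 / golden ratio, as in Fibonacci hashing; 64-bit pointers assumed. */
#define TOY_MULTIPLIER 11400714819323198485ULL

struct toy_entry { void *key; long value; };

struct toy_table {
  size_t size;             /* power of two */
  int shift;               /* 64 - log2(size): keep only the top bits of the product */
  struct toy_entry *slots;
};

static size_t toy_index(const struct toy_table *t, void *key) {
  /* Multiply-shift hash: the high bits of key * TOY_MULTIPLIER are well mixed. */
  return (size_t)(((uintptr_t)key * TOY_MULTIPLIER) >> t->shift);
}

static struct toy_table *toy_create(size_t size_pow2) {
  struct toy_table *t = malloc(sizeof *t);
  t->size = size_pow2;
  int k = 0;
  while (((size_t)1 << k) < size_pow2) k++;
  t->shift = 64 - k;
  t->slots = calloc(size_pow2, sizeof *t->slots); /* NULL key marks an empty slot */
  return t;
}

static void toy_put(struct toy_table *t, void *key, long value) {
  size_t mask = t->size - 1;
  size_t idx = toy_index(t, key);
  while (t->slots[idx].key && t->slots[idx].key != key)
    idx = (idx + 1) & mask;  /* linear probing: try the next slot, wrapping around */
  t->slots[idx].key = key;
  t->slots[idx].value = value;
}

static long toy_get(const struct toy_table *t, void *key, long ifnotfound) {
  size_t mask = t->size - 1;
  size_t idx = toy_index(t, key);
  while (t->slots[idx].key) {
    if (t->slots[idx].key == key) return t->slots[idx].value;
    idx = (idx + 1) & mask;
  }
  return ifnotfound;  /* hit an empty slot: the key was never inserted */
}

int main(void) {
  struct toy_table *t = toy_create(16);  /* caller must keep the load factor below 1 */
  int a, b;
  toy_put(t, &a, 1);
  toy_put(t, &b, 2);
  printf("%ld %ld %ld\n", toy_get(t, &a, -1), toy_get(t, &b, -1), toy_get(t, NULL, -1));
  free(t->slots); free(t);
  return 0;
}

The power-of-two table size is what makes (idx + 1) & mask a free substitute for the modulo when the probe wraps around, which is also why the patch below rounds sizes up with get_next_pow2.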

File tree

1 file changed (+36, -57 lines)


src/hash.c

Lines changed: 36 additions & 57 deletions
@@ -1,19 +1,30 @@
 #include "data.table.h"

+#define HASH_BITS (8 * sizeof(void*))
+#define HASH_MULTIPLIER ((sizeof(void*) == 8) ? 11400714819323198485ULL : 2654435769U)
 struct hash_pair {
   SEXP key;
   R_xlen_t value;
 };
 struct hash_tab {
   size_t size, free;
-  uintptr_t multiplier1, multiplier2;
-  struct hash_pair *table; // Single table for double hashing
+  int shift;
+  struct hash_pair *table;
 };

-// 1/phi and sqrt(0.1)
-static const double hash_multiplier1 = 0.618033988749895;
-static const double hash_multiplier2 = 0.316227766016838;
-static const double default_load_factor = .5;
+static const double default_load_factor = .75;
+
+static R_INLINE size_t get_next_pow2(size_t n) {
+  if (n <= 1) return 1;
+  n--;
+  n |= n >> 1;
+  n |= n >> 2;
+  n |= n >> 4;
+  n |= n >> 8;
+  n |= n >> 16;
+  if (sizeof(size_t) > 4) n |= n >> 32;
+  return n + 1;
+}

 static R_INLINE size_t get_full_size(size_t n_elements, double load_factor) {
   if (load_factor <= 0 || load_factor >= 1)
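The get_next_pow2 helper added in this hunk rounds a requested size up to the next power of two by subtracting one, OR-ing the value with shifted copies of itself until every bit below the highest set bit is set, and adding one back. A small standalone trace of the same trick (not part of the patch) on the concrete value 37:

#include <stdio.h>
#include <stdint.h>

int main(void) {
  uint64_t n = 37;  /* want the smallest power of two >= 37 */
  n--;              /* 36 = 0b100100; the decrement handles n already being a power of two */
  n |= n >> 1;      /* 0b110110 */
  n |= n >> 2;      /* 0b111111 */
  n |= n >> 4;      /* still 0b111111: every bit below the top bit is now set */
  n |= n >> 8;
  n |= n >> 16;
  n |= n >> 32;
  printf("%llu\n", (unsigned long long)(n + 1));  /* prints 64 */
  return 0;
}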
@@ -25,13 +36,11 @@ static R_INLINE size_t get_full_size(size_t n_elements, double load_factor) {
       __func__, "n=%zu / load_factor=%g would overflow size_t",
       n_elements, load_factor
     );
-  size_t min_size = ceil(n_elements / load_factor);
+  size_t min_size = (size_t)(n_elements / load_factor) + 1;
   // Round up to next power of 2 for fast modulo using bitwise AND
-  size_t pow2 = 1;
-  while (pow2 < min_size) {
-    if (pow2 > SIZE_MAX / 2)
+  size_t pow2 = get_next_pow2(min_size);
+  if (pow2 == 0) {
     internal_error(__func__, "size %zu would overflow size_t", min_size); // # nocov
-    pow2 *= 2;
   }
   return pow2;
 }
@@ -40,35 +49,21 @@ static hashtab * hash_create_(size_t n, double load_factor) {
   size_t n_full = get_full_size(n, load_factor);
   hashtab *ret = (hashtab *)R_alloc(sizeof(hashtab), 1);
   ret->size = n_full;
-  ret->free = n;
-  // Multiply by size to get different hash functions when rehashing
-  ret->multiplier1 = n_full * hash_multiplier1;
-  ret->multiplier2 = n_full * hash_multiplier2;
+  ret->free = (size_t)(n_full * load_factor);
+
+  int k = 0;
+  while ((1ULL << k) < n_full) k++;
+  ret->shift = HASH_BITS - k;
   ret->table = (struct hash_pair *)R_alloc(n_full, sizeof(*ret->table));
   // No valid SEXP is a null pointer, so it's a safe marker for empty cells.
-  for (size_t i = 0; i < n_full; ++i) {
-    ret->table[i].key = NULL;
-  }
+  memset(ret->table, 0, n_full * sizeof(struct hash_pair));
   return ret;
 }

 hashtab * hash_create(size_t n) { return hash_create_(n, default_load_factor); }

-// double hashing
-static R_INLINE size_t hash_index1(SEXP key, uintptr_t multiplier) {
-  // The 4 lowest bits of the pointer are probably zeroes because a typical SEXPREC exceeds 16 bytes in size.
-  // Since SEXPRECs are heap-allocated, they are subject to malloc() alignment guarantees,
-  // which is at least 4 bytes on 32-bit platforms, most likely more than 8 bytes.
-  return ((((uintptr_t)key) >> 4) & 0x0fffffff) * multiplier;
-}
-
-static R_INLINE size_t hash_index2(SEXP key, uintptr_t multiplier) {
-  // For double hashing, we need a different hash that's coprime with table size.
-  // We use higher-order bits that hash_index1 mostly ignores, and ensure
-  // the result is always odd (coprime with power-of-2 table sizes).
-  uintptr_t ptr = (uintptr_t)key;
-  ptr = (ptr >> 12) | (ptr << (sizeof(uintptr_t) * 8 - 12));
-  return ((ptr & 0x0fffffff) * multiplier) | 1;
+static R_INLINE size_t hash_index(SEXP key, int shift) {
+  return (size_t)((uintptr_t)key * HASH_MULTIPLIER) >> shift;
 }

 static R_INLINE hashtab *hash_rehash(const hashtab *h) {
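The new hash_index is a multiply-shift hash: the pointer is multiplied by HASH_MULTIPLIER (2^64 divided by the golden ratio on 64-bit builds) and only the top HASH_BITS - log2(size) bits of the product are kept, so the result already lies in [0, size) without any masking. A minimal standalone check of that arithmetic, assuming 64-bit pointers and using a made-up pointer value, so the printed index is illustrative only:

#include <stdio.h>
#include <stdint.h>

int main(void) {
  const uint64_t mult = 11400714819323198485ULL; /* 2^64 / golden ratio */
  const int shift = 64 - 6;                      /* table of 2^6 = 64 slots */
  uintptr_t fake_ptr = 0x55d4a1b2c3d0;           /* arbitrary example "pointer" value */
  uint64_t idx = ((uint64_t)fake_ptr * mult) >> shift;
  printf("index = %llu (always < 64)\n", (unsigned long long)idx);
  return 0;
}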
@@ -83,17 +78,10 @@ static R_INLINE hashtab *hash_rehash(const hashtab *h) {

 static bool hash_set_(hashtab *h, SEXP key, R_xlen_t value) {
   size_t mask = h->size - 1;
-  size_t h1 = hash_index1(key, h->multiplier1) & mask;
-  size_t h2 = hash_index2(key, h->multiplier2) & mask;
-
-  if (h2 == 0) h2 = 1;
-  else if ((h2 & 1) == 0) h2 |= 1;
-
-  for (size_t i = 0; i < h->size; ++i) {
-    size_t idx = (h1 + i * h2) & mask;
-
+  size_t idx = hash_index(key, h->shift);
+  while (true) {
     if (!h->table[idx].key) {
-      // Empty slot found
+      if (h->free == 0) return false; // table full -> need rehash
       h->table[idx].key = key;
       h->table[idx].value = value;
       h->free--;
@@ -104,10 +92,8 @@ static bool hash_set_(hashtab *h, SEXP key, R_xlen_t value) {
       h->table[idx].value = value;
       return true;
     }
+    idx = (idx + 1) & mask;
   }
-
-  // need to rehash
-  return false;
 }

 void hash_set(hashtab *h, SEXP key, R_xlen_t value) {
@@ -127,17 +113,10 @@ hashtab *hash_set_shared(hashtab *h, SEXP key, R_xlen_t value) {

 R_xlen_t hash_lookup(const hashtab *h, SEXP key, R_xlen_t ifnotfound) {
   size_t mask = h->size - 1;
-  size_t h1 = hash_index1(key, h->multiplier1) & mask;
-  size_t h2 = hash_index2(key, h->multiplier2) & mask;
-
-  if (h2 == 0) h2 = 1;
-  else if ((h2 & 1) == 0) h2 |= 1;
-
-  for (size_t i = 0; i < h->size; ++i) {
-    size_t idx = (h1 + i * h2) & mask;
-    if (!h->table[idx].key) return ifnotfound;
+  size_t idx = hash_index(key, h->shift);
+  while (true) {
     if (h->table[idx].key == key) return h->table[idx].value;
+    if (h->table[idx].key == NULL) return ifnotfound;
+    idx = (idx + 1) & mask;
   }
-
-  return ifnotfound; // # nocov
 }
