Skip to content

Commit 5782a7a

Browse files
committed
Implement the hash table
1 parent 9875906 commit 5782a7a

File tree

2 files changed

+104
-0
lines changed

2 files changed

+104
-0
lines changed

src/data.table.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,25 @@ SEXP substitute_call_arg_namesR(SEXP expr, SEXP env);
280280
//negate.c
281281
SEXP notchin(SEXP x, SEXP table);
282282

283+
// hash.c
284+
typedef struct hash_tab hashtab;
285+
// Allocate, initialise, and return a pointer to the new hash table.
286+
// n is the maximal number of elements that will be inserted.
287+
// load_factor is a real in (0, 1) specifying the desired fraction of used table elements.
288+
// Lower load factors lead to fewer collisions and faster lookups, but waste memory.
289+
// May raise an R error if an allocation fails or a size is out of bounds.
290+
// The table is temporary (allocated via R_alloc()) and will be unprotected upon return from the .Call().
291+
// See vmaxget()/vmaxset() if you need to unprotect it manually.
292+
hashtab * hash_create_(size_t n, double load_factor);
293+
// Hard-coded "good enough" load_factor
294+
hashtab * hash_create(size_t n);
295+
// Inserts a new key-value pair into the hash, or overwrites an existing value.
296+
// Will raise an R error if inserting more than n elements.
297+
// Don't try to insert a null pointer, nothing good will come out of it.
298+
void hash_set(hashtab *, SEXP key, R_xlen_t value);
299+
// Returns the value corresponding to the key present in the hash, otherwise returns ifnotfound.
300+
R_xlen_t hash_lookup(const hashtab *, SEXP key, R_xlen_t ifnotfound);
301+
283302
// functions called from R level .Call/.External and registered in init.c
284303
// these now live here to pass -Wstrict-prototypes, #5477
285304
// all arguments must be SEXP since they are called from R level

src/hash.c

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#include "data.table.h"
2+
3+
struct hash_pair {
4+
SEXP key;
5+
R_xlen_t value;
6+
};
7+
struct hash_tab {
8+
size_t size, free;
9+
uintptr_t multiplier;
10+
struct hash_pair tb[];
11+
};
12+
13+
// Convenience constructor: builds a table for up to n keys using a fixed load factor of one half.
hashtab * hash_create(size_t n) {
  const double default_load_factor = .5;
  return hash_create_(n, default_load_factor);
}
14+
// Multiplication hashing constant A, taken as approximately 1/phi (phi being the golden ratio),
// as recommended in TAOCP vol. 3, section 6.4.
static const double hash_multiplier = 0.618033988749895;
16+
17+
hashtab * hash_create_(size_t n, double load_factor) {
18+
if (load_factor <= 0 || load_factor >= 1)
19+
internal_error("hash_create", "load_factor=%g not in (0, 1)", load_factor);
20+
// precondition: n / load_factor < SIZE_MAX
21+
// truncate to compare in exact integer arithmetic and preserve all bits of n
22+
if ((size_t)(SIZE_MAX * load_factor) <= n) internal_error(
23+
"hash_create", "n=%zu / load_factor=%g would overflow size_t",
24+
n, load_factor
25+
);
26+
size_t n_full = ceil(n / load_factor);
27+
// precondition: sizeof hashtab + hash_pair[n_full] < SIZE_MAX
28+
// n_full * sizeof hash_pair < SIZE_MAX - sizeof hashtab
29+
// sizeof hash_pair < (SIZE_MAX - sizeof hashtab) / n_full
30+
if (sizeof(struct hash_pair) >= (SIZE_MAX - sizeof(hashtab)) / n_full) internal_error(
31+
"hash_create", "n=%zu with load_factor=%g would overflow total allocation size",
32+
n, load_factor
33+
);
34+
hashtab * ret = (hashtab *)R_alloc(sizeof(hashtab) + sizeof(struct hash_pair[n_full]), 1);
35+
ret->size = n_full;
36+
ret->free = n;
37+
// To compute floor(size * (A * key % 1)) in integer arithmetic with A < 1, use ((size * A) * key) % size.
38+
ret->multiplier = n_full * hash_multiplier;
39+
// No valid SEXP is a null pointer, so it's a safe marker for empty cells.
40+
for (size_t i = 0; i < n_full; ++i)
41+
ret->tb[i] = (struct hash_pair){.key = NULL, .value = 0};
42+
return ret;
43+
}
44+
45+
// Hashing for an open addressing hash table. See Cormen et al., Introduction to Algorithms, 3rd ed., section 11.4.
46+
// This is far from perfect. Make size a prime or a power of two and you'll be able to use double hashing.
47+
static R_INLINE size_t hash_index(SEXP key, uintptr_t multiplier, size_t offset, size_t size) {
48+
// The 4 lowest bits of the pointer are probably zeroes because a typical SEXPREC exceeds 16 bytes in size.
49+
// Since SEXPRECs are heap-allocated, they are subject to malloc() alignment guarantees, which is at least 4 bytes on 32-bit platforms, most likely more than 8 bytes.
50+
return ((((uintptr_t)key) >> 4) * multiplier + offset) % size;
51+
}
52+
53+
// Store `value` under `key` in table `h`.
// Probes cells in the order given by hash_index(): if a cell already holding `key` is found, its value is overwritten;
// if an empty cell (null key) is found first, the pair is inserted there and the count of free insertions is decremented.
// Signals an internal error when the insertion budget is exhausted or when no usable cell is found after probing every cell.
// `key` must not be a null pointer: a null key is indistinguishable from an empty cell.
void hash_set(hashtab * h, SEXP key, R_xlen_t value) {
  for (size_t i = 0; i < h->size; ++i) {
    struct hash_pair * cell = h->tb + hash_index(key, h->multiplier, i, h->size);
    if (cell->key == key) {
      // Key already present: overwrite in place, no slot is consumed.
      cell->value = value;
      return;
    } else if (!cell->key) {
      // Error label names this function (it previously referred to a non-existent "hash_insert").
      if (!h->free) internal_error(
        "hash_set", "no free slots left (size=%zu after the load factor)", h->size
      );
      --h->free;
      *cell = (struct hash_pair){.key = key, .value = value};
      return;
    }
  }
  // Every cell was probed without finding the key or an empty cell.
  internal_error(
    "hash_set", "did not find a free slot for key %p despite size=%zu, free=%zu",
    (void*)key, h->size, h->free
  );
}
73+
74+
// Find `key` in table `h` and return the value stored with it; return `ifnotfound` if the key is absent.
// Cells are probed in the order given by hash_index(); reaching an empty cell (null key) ends the search.
R_xlen_t hash_lookup(const hashtab * h, SEXP key, R_xlen_t ifnotfound) {
  const size_t ncells = h->size;
  size_t probe = 0;
  while (probe < ncells) {
    const struct hash_pair * slot = &h->tb[hash_index(key, h->multiplier, probe, ncells)];
    if (slot->key == key)
      return slot->value;
    if (!slot->key)
      return ifnotfound;
    ++probe;
  }
  // Should be impossible with a load factor below 1, but just in case:
  return ifnotfound;
}

0 commit comments

Comments
 (0)