|
| 1 | +#include "data.table.h" |
| 2 | + |
| 3 | +struct hash_pair { |
| 4 | + SEXP key; |
| 5 | + R_xlen_t value; |
| 6 | +}; |
| 7 | +struct hash_tab { |
| 8 | + size_t size, free; |
| 9 | + uintptr_t multiplier; |
| 10 | + struct hash_pair tb[]; |
| 11 | +}; |
| 12 | + |
| 13 | +hashtab * hash_create(size_t n) { return hash_create_(n, .5); } |
// Constant A for multiplication hashing: A ~ 1/phi, where phi is the golden ratio,
// as suggested in TAOCP vol. 3, section 6.4.
static const double hash_multiplier = 0.618033988749895;
| 16 | + |
| 17 | +hashtab * hash_create_(size_t n, double load_factor) { |
| 18 | + if (load_factor <= 0 || load_factor >= 1) |
| 19 | + internal_error("hash_create", "load_factor=%g not in (0, 1)", load_factor); |
| 20 | + // precondition: n / load_factor < SIZE_MAX |
| 21 | + // truncate to compare in exact integer arithmetic and preserve all bits of n |
| 22 | + if ((size_t)(SIZE_MAX * load_factor) <= n) internal_error( |
| 23 | + "hash_create", "n=%zu / load_factor=%g would overflow size_t", |
| 24 | + n, load_factor |
| 25 | + ); |
| 26 | + size_t n_full = ceil(n / load_factor); |
| 27 | + // precondition: sizeof hashtab + hash_pair[n_full] < SIZE_MAX |
| 28 | + // n_full * sizeof hash_pair < SIZE_MAX - sizeof hashtab |
| 29 | + // sizeof hash_pair < (SIZE_MAX - sizeof hashtab) / n_full |
| 30 | + if (sizeof(struct hash_pair) >= (SIZE_MAX - sizeof(hashtab)) / n_full) internal_error( |
| 31 | + "hash_create", "n=%zu with load_factor=%g would overflow total allocation size", |
| 32 | + n, load_factor |
| 33 | + ); |
| 34 | + hashtab * ret = (hashtab *)R_alloc(sizeof(hashtab) + sizeof(struct hash_pair[n_full]), 1); |
| 35 | + ret->size = n_full; |
| 36 | + ret->free = n; |
| 37 | + // To compute floor(size * (A * key % 1)) in integer arithmetic with A < 1, use ((size * A) * key) % size. |
| 38 | + ret->multiplier = n_full * hash_multiplier; |
| 39 | + // No valid SEXP is a null pointer, so it's a safe marker for empty cells. |
| 40 | + for (size_t i = 0; i < n_full; ++i) |
| 41 | + ret->tb[i] = (struct hash_pair){.key = NULL, .value = 0}; |
| 42 | + return ret; |
| 43 | +} |
| 44 | + |
| 45 | +// Hashing for an open addressing hash table. See Cormen et al., Introduction to Algorithms, 3rd ed., section 11.4. |
| 46 | +// This is far from perfect. Make size a prime or a power of two and you'll be able to use double hashing. |
| 47 | +static R_INLINE size_t hash_index(SEXP key, uintptr_t multiplier, size_t offset, size_t size) { |
| 48 | + // The 4 lowest bits of the pointer are probably zeroes because a typical SEXPREC exceeds 16 bytes in size. |
| 49 | + // Since SEXPRECs are heap-allocated, they are subject to malloc() alignment guarantees, which is at least 4 bytes on 32-bit platforms, most likely more than 8 bytes. |
| 50 | + return ((((uintptr_t)key) >> 4) * multiplier + offset) % size; |
| 51 | +} |
| 52 | + |
| 53 | +void hash_set(hashtab * h, SEXP key, R_xlen_t value) { |
| 54 | + for (size_t i = 0; i < h->size; ++i) { |
| 55 | + struct hash_pair * cell = h->tb + hash_index(key, h->multiplier, i, h->size); |
| 56 | + if (cell->key == key) { |
| 57 | + cell->value = value; |
| 58 | + return; |
| 59 | + } else if (!cell->key) { |
| 60 | + if (!h->free) internal_error( |
| 61 | + "hash_insert", "no free slots left (size=%zu after the load factor)", h->size |
| 62 | + ); |
| 63 | + --h->free; |
| 64 | + *cell = (struct hash_pair){.key = key, .value = value}; |
| 65 | + return; |
| 66 | + } |
| 67 | + } |
| 68 | + internal_error( |
| 69 | + "hash_insert", "did not find a free slot for key %p despite size=%zu, free=%zu", |
| 70 | + (void*)key, h->size, h->free |
| 71 | + ); |
| 72 | +} |
| 73 | + |
| 74 | +R_xlen_t hash_lookup(const hashtab * h, SEXP key, R_xlen_t ifnotfound) { |
| 75 | + for (size_t i = 0; i < h->size; ++i) { |
| 76 | + const struct hash_pair * cell = h->tb + hash_index(key, h->multiplier, i, h->size); |
| 77 | + if (cell->key == key) { |
| 78 | + return cell->value; |
| 79 | + } else if (!cell->key) { |
| 80 | + return ifnotfound; |
| 81 | + } |
| 82 | + } |
| 83 | + // Should be impossible with a load factor below 1, but just in case: |
| 84 | + return ifnotfound; |
| 85 | +} |
0 commit comments