Skip to content

Commit 1b3bf37

Browse files
Ian2327, Ian Setia, and btovar
authored
Hash table inefficiencies (#4149)
* Update vine_manager.c
* Update vine_file_replica_table.c
* Fixes invalid read of size 1
* move cache_name copy idea to outer function
* Added hash_table.c halve buckets function
* implemented bucket halving in ITERATE function
* Updated benchmark test script to include insertion and removal times
* reduce count until above DEFAULT_MIN_LOAD
* reduce on delete
* simplify reduction
* add function hash_table_load
* update benchmark
* aux function to insert entry to buckets array
* same for reduce buckets
* update benchmark
* fix reduce
* update benchmark
* revert incorrect changes to vine_manager.c

---------

Co-authored-by: Ian Setia <[email protected]>
Co-authored-by: Benjamin Tovar <[email protected]>
1 parent 297668e commit 1b3bf37

File tree

4 files changed

+182
-48
lines changed

4 files changed

+182
-48
lines changed

dttools/src/Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -193,7 +193,7 @@ PROGRAMS = $(MOST_PROGRAMS) catalog_query
193193

194194
SCRIPTS = cctools_gpu_autodetect
195195
TARGETS = $(LIBRARIES) $(PRELOAD_LIBRARIES) $(PROGRAMS) $(TEST_PROGRAMS)
196-
TEST_PROGRAMS = auth_test disk_alloc_test jx_test microbench multirun jx_count_obj_test jx_canonicalize_test jx_merge_test hash_table_offset_test hash_table_fromkey_test histogram_test category_test jx_binary_test bucketing_base_test bucketing_manager_test priority_queue_test
196+
TEST_PROGRAMS = auth_test disk_alloc_test jx_test microbench multirun jx_count_obj_test jx_canonicalize_test jx_merge_test hash_table_offset_test hash_table_fromkey_test hash_table_benchmark histogram_test category_test jx_binary_test bucketing_base_test bucketing_manager_test priority_queue_test
197197

198198
all: $(TARGETS) catalog_query
199199

dttools/src/hash_table.c

Lines changed: 92 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,8 @@ See the file COPYING for details.
1212
#include <string.h>
1313

1414
#define DEFAULT_SIZE 127
15-
#define DEFAULT_LOAD 0.75
15+
#define DEFAULT_MAX_LOAD 0.75
16+
#define DEFAULT_MIN_LOAD 0.125
1617
#define DEFAULT_FUNC hash_string
1718

1819
struct entry {
@@ -155,72 +156,113 @@ int hash_table_size(struct hash_table *h)
155156
return h->size;
156157
}
157158

158-
static int hash_table_double_buckets(struct hash_table *h)
159+
double hash_table_load(struct hash_table *h)
159160
{
160-
struct hash_table *hn = hash_table_create(2 * h->bucket_count, h->hash_func);
161+
return ((double)h->size) / h->bucket_count;
162+
}
161163

162-
if (!hn)
163-
return 0;
164+
static int insert_to_buckets_aux(struct entry **buckets, int bucket_count, struct entry *new_entry)
165+
{
166+
unsigned index;
167+
struct entry *e;
164168

165-
/* Move pairs to new hash */
166-
char *key;
167-
void *value;
168-
hash_table_firstkey(h);
169-
while (hash_table_nextkey(h, &key, &value))
170-
if (!hash_table_insert(hn, key, value)) {
171-
hash_table_delete(hn);
169+
index = new_entry->hash % bucket_count;
170+
e = buckets[index];
171+
172+
while (e) {
173+
/* check that this key does not already exist in the table */
174+
if (new_entry->hash == e->hash && !strcmp(new_entry->key, e->key)) {
172175
return 0;
173176
}
177+
e = e->next;
178+
}
179+
180+
new_entry->next = buckets[index];
181+
buckets[index] = new_entry;
182+
183+
return 1;
184+
}
185+
186+
static int hash_table_double_buckets(struct hash_table *h)
187+
{
188+
int new_count = (2 * (h->bucket_count + 1)) - 1;
189+
struct entry **new_buckets = (struct entry **)calloc(new_count, sizeof(struct entry *));
190+
if (!new_buckets) {
191+
return 0;
192+
}
174193

175-
/* Delete all old pairs */
176194
struct entry *e, *f;
177-
int i;
178-
for (i = 0; i < h->bucket_count; i++) {
195+
for (int i = 0; i < h->bucket_count; i++) {
179196
e = h->buckets[i];
180197
while (e) {
181198
f = e->next;
182-
free(e->key);
183-
free(e);
199+
e->next = NULL;
200+
insert_to_buckets_aux(new_buckets, new_count, e);
184201
e = f;
185202
}
186203
}
187204

188205
/* Make the old point to the new */
189206
free(h->buckets);
190-
h->buckets = hn->buckets;
191-
h->bucket_count = hn->bucket_count;
192-
h->size = hn->size;
207+
h->buckets = new_buckets;
208+
h->bucket_count = new_count;
193209

194210
/* structure of hash table changed completely, thus a nextkey would be incorrect. */
195211
h->cant_iterate_yet = 1;
196212

197-
/* Delete reference to new, so old is safe */
198-
free(hn);
199-
200213
return 1;
201214
}
202215

203-
int hash_table_insert(struct hash_table *h, const char *key, const void *value)
216+
static int hash_table_reduce_buckets(struct hash_table *h)
204217
{
205-
struct entry *e;
206-
unsigned hash, index;
218+
int new_count = ((h->bucket_count + 1) / 2) - 1;
207219

208-
if (((float)h->size / h->bucket_count) > DEFAULT_LOAD)
209-
hash_table_double_buckets(h);
220+
/* DEFAULT_SIZE is the minimum size */
221+
if (new_count < DEFAULT_SIZE) {
222+
return 1;
223+
}
210224

211-
hash = h->hash_func(key);
212-
index = hash % h->bucket_count;
213-
e = h->buckets[index];
225+
/* Table cannot be reduced above DEFAULT_MAX_LOAD */
226+
if (((float)h->size / new_count) > DEFAULT_MAX_LOAD) {
227+
return 1;
228+
}
214229

215-
while (e) {
216-
if (hash == e->hash && !strcmp(key, e->key))
217-
return 0;
218-
e = e->next;
230+
struct entry **new_buckets = (struct entry **)calloc(new_count, sizeof(struct entry *));
231+
if (!new_buckets) {
232+
return 0;
219233
}
220234

221-
e = (struct entry *)malloc(sizeof(struct entry));
222-
if (!e)
235+
struct entry *e, *f;
236+
for (int i = 0; i < h->bucket_count; i++) {
237+
e = h->buckets[i];
238+
while (e) {
239+
f = e->next;
240+
e->next = NULL;
241+
insert_to_buckets_aux(new_buckets, new_count, e);
242+
e = f;
243+
}
244+
}
245+
246+
/* Make the old point to the new */
247+
free(h->buckets);
248+
h->buckets = new_buckets;
249+
h->bucket_count = new_count;
250+
251+
/* structure of hash table changed completely, thus a nextkey would be incorrect. */
252+
h->cant_iterate_yet = 1;
253+
254+
return 1;
255+
}
256+
257+
int hash_table_insert(struct hash_table *h, const char *key, const void *value)
258+
{
259+
if (((float)h->size / h->bucket_count) > DEFAULT_MAX_LOAD)
260+
hash_table_double_buckets(h);
261+
262+
struct entry *e = (struct entry *)malloc(sizeof(struct entry));
263+
if (!e) {
223264
return 0;
265+
}
224266

225267
e->key = strdup(key);
226268
if (!e->key) {
@@ -229,16 +271,18 @@ int hash_table_insert(struct hash_table *h, const char *key, const void *value)
229271
}
230272

231273
e->value = (void *)value;
232-
e->hash = hash;
233-
e->next = h->buckets[index];
234-
h->buckets[index] = e;
235-
h->size++;
274+
e->hash = h->hash_func(e->key);
236275

237-
/* inserting cause different behaviours with nextkey (e.g., sometimes the new
238-
* key would be included or skipped in the iteration */
239-
h->cant_iterate_yet = 1;
276+
int inserted = insert_to_buckets_aux(h->buckets, h->bucket_count, e);
277+
if (inserted) {
278+
h->size++;
240279

241-
return 1;
280+
/* inserting cause different behaviours with nextkey (e.g., sometimes the new
281+
* key would be included or skipped in the iteration */
282+
h->cant_iterate_yet = 1;
283+
}
284+
285+
return inserted;
242286
}
243287

244288
void *hash_table_remove(struct hash_table *h, const char *key)
@@ -264,8 +308,9 @@ void *hash_table_remove(struct hash_table *h, const char *key)
264308
free(e);
265309
h->size--;
266310

267-
/* the deletion may cause nextkey to fail */
268-
h->cant_iterate_yet = 1;
311+
if (((float)h->size / h->bucket_count) < DEFAULT_MIN_LOAD) {
312+
hash_table_reduce_buckets(h);
313+
}
269314

270315
return value;
271316
}

dttools/src/hash_table.h

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -70,6 +70,14 @@ void hash_table_delete(struct hash_table *h);
7070

7171
int hash_table_size(struct hash_table *h);
7272

73+
/** Return the proportion of elements
74+
vs buckets in the table.
75+
@return The load of the table.
76+
@param h A pointer to a hash table.
77+
*/
78+
79+
double hash_table_load(struct hash_table *h);
80+
7381
/** Insert a key and value.
7482
This call will fail if the table already contains the same key.
7583
You must call @ref hash_table_remove to remove it.

dttools/src/hash_table_benchmark.c

Lines changed: 81 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,81 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
#include <string.h>
4+
#include <time.h>
5+
#include <math.h>
6+
#include <stdint.h>
7+
#include "hash_table.h"
8+
9+
#define MAX_KEY_LEN 32
10+
11+
// Generate a unique string key from an integer
12+
void generate_key(int i, char *key)
13+
{
14+
snprintf(key, MAX_KEY_LEN, "key%d", i);
15+
}
16+
17+
double measure_iteration_time(struct hash_table *h)
18+
{
19+
struct timespec start, end;
20+
clock_gettime(CLOCK_MONOTONIC, &start);
21+
22+
char *key;
23+
void *value;
24+
HASH_TABLE_ITERATE(h, key, value);
25+
26+
clock_gettime(CLOCK_MONOTONIC, &end);
27+
28+
double start_sec = start.tv_sec + start.tv_nsec / 1e9;
29+
double end_sec = end.tv_sec + end.tv_nsec / 1e9;
30+
31+
return end_sec - start_sec;
32+
}
33+
34+
int main()
35+
{
36+
int power_step = 0;
37+
int power_max_step = 15; // ~64k max entries
38+
39+
struct hash_table *h = hash_table_create(0, 0);
40+
41+
char key[MAX_KEY_LEN];
42+
int entries_counter = 0;
43+
44+
printf("INSERTION PHASE:\n");
45+
for (power_step = 0; power_step <= power_max_step; power_step++) {
46+
double total_time = 0;
47+
double max_load = hash_table_load(h);
48+
double entries_to_add_remove = pow(2, power_step);
49+
for (int i = 0; i < entries_to_add_remove; i++) {
50+
entries_counter++;
51+
generate_key(entries_counter, key);
52+
hash_table_insert(h, key, NULL);
53+
max_load = hash_table_load(h) > max_load ? hash_table_load(h) : max_load;
54+
total_time += measure_iteration_time(h);
55+
}
56+
57+
printf("step %3d size %8d buckets %8d load_max %3.6f load_now %3.6f time %3.6f time_norm %3.6f\n", power_step, hash_table_size(h), (int)ceil(hash_table_size(h) / hash_table_load(h)), max_load, hash_table_load(h), total_time, total_time / entries_to_add_remove);
58+
}
59+
60+
printf("REMOVAL PHASE:\n");
61+
62+
entries_counter = 0;
63+
for (power_step = power_max_step; power_step > 0; power_step--) {
64+
double total_time = 0;
65+
double min_load = hash_table_load(h);
66+
double entries_to_add_remove = pow(2, power_step);
67+
for (int i = 0; i < entries_to_add_remove; i++) {
68+
entries_counter++;
69+
generate_key(entries_counter, key);
70+
hash_table_remove(h, key);
71+
min_load = hash_table_load(h) < min_load ? hash_table_load(h) : min_load;
72+
total_time += measure_iteration_time(h);
73+
}
74+
75+
printf("step %3d size %8d buckets %8d load_min %3.6f load_now %3.6f time %3.6f time_norm %3.6f\n", power_step, hash_table_size(h), (int)ceil(hash_table_size(h) / hash_table_load(h)), min_load, hash_table_load(h), total_time, total_time / entries_to_add_remove);
76+
}
77+
78+
hash_table_delete(h);
79+
80+
return 0;
81+
}

0 commit comments

Comments
 (0)