Skip to content
This repository was archived by the owner on Sep 27, 2019. It is now read-only.

Commit ab9b7ea

Browse files
committed
Initial commit of new hash table
1 parent dd4c067 commit ab9b7ea

File tree

2 files changed

+484
-0
lines changed

2 files changed

+484
-0
lines changed

src/codegen/util/hash_table.cpp

Lines changed: 271 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,271 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// Peloton
4+
//
5+
// hash_table.cpp
6+
//
7+
// Identification: src/codegen/util/hash_table.cpp
8+
//
9+
// Copyright (c) 2015-2018, Carnegie Mellon University Database Group
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#include "codegen/util/hash_table.h"
14+
15+
#include "common/platform.h"
16+
#include "type/abstract_pool.h"
17+
18+
namespace peloton {
19+
namespace codegen {
20+
namespace util {
21+
22+
// Initial number of entry slots a freshly constructed table can hold. Must be
// a power of two so the directory size (2x this, for a 50% load factor) can be
// used as a bit-mask over the hash value.
static const uint32_t kDefaultNumElements = 256;

static_assert((kDefaultNumElements & (kDefaultNumElements - 1)) == 0,
              "Default number of elements must be a power of two");
/**
 * This hash table resolves collisions by chaining: each directory slot holds
 * the head of a singly linked list of Entry objects, and inserts push new
 * entries onto the front of their bucket's chain (see StoreTuple()).
 */
30+
31+
/**
 * Construct a hash table with room for kDefaultNumElements entries.
 *
 * The directory is sized at twice the initial capacity (i.e., a target 50%
 * load factor). Tuple data is stored out-of-line in a linked list of
 * MemoryBlocks; the first block is allocated here.
 *
 * @param memory The memory pool all allocations are served from
 * @param key_size Size, in bytes, of the key portion of each tuple
 * @param value_size Size, in bytes, of the value portion of each tuple
 */
HashTable::HashTable(::peloton::type::AbstractPool &memory, uint64_t key_size,
                     uint64_t value_size)
    : memory_(memory),
      entry_size_(sizeof(Entry) + key_size + value_size),
      directory_(nullptr),
      directory_size_(0),
      directory_mask_(0),
      block_(nullptr),
      next_tuple_pos_(nullptr),
      available_bytes_(0),
      num_elems_(0),
      capacity_(0) {
  // Upon creation, we allocate room for kDefaultNumElements in the hash table.
  // We assume 50% load factor on the directory, thus the directory size is
  // twice the number of elements.
  directory_size_ = kDefaultNumElements * 2;
  directory_mask_ = directory_size_ - 1;
  directory_ = static_cast<Entry **>(
      memory_.Allocate(sizeof(Entry *) * directory_size_));
  // Zero every bucket pointer. The memset size must be in BYTES; the original
  // code passed only the slot count, leaving most of the directory
  // uninitialized.
  PELOTON_MEMSET(directory_, 0, sizeof(Entry *) * directory_size_);

  // We also need to allocate some space to store tuples. Tuples are stored
  // externally from the main hash table in a separate values memory space.
  uint64_t block_size =
      sizeof(MemoryBlock) + (entry_size_ * kDefaultNumElements);
  block_ = reinterpret_cast<MemoryBlock *>(memory_.Allocate(block_size));
  block_->next = nullptr;

  // Set the next tuple write position and the available bytes
  next_tuple_pos_ = block_->data;
  available_bytes_ = block_size - sizeof(MemoryBlock);

  // Set table stats
  num_elems_ = 0;
  capacity_ = kDefaultNumElements;
}
67+
68+
/**
 * Destructor. Returns the directory and every tuple-storage block to the
 * backing memory pool.
 */
HashTable::~HashTable() {
  // Release the directory array, if one exists
  if (directory_ != nullptr) {
    memory_.Free(directory_);
    directory_ = nullptr;
  }

  // Walk the chain of tuple-storage blocks, releasing each in turn
  for (MemoryBlock *curr = block_; curr != nullptr; /* in loop */) {
    MemoryBlock *following = curr->next;
    memory_.Free(curr);
    curr = following;
  }
  block_ = nullptr;
}
84+
85+
/**
 * In-place construct a HashTable at the given (pre-allocated) location,
 * drawing memory from the execution context's pool.
 */
void HashTable::Init(HashTable &table, executor::ExecutorContext &exec_ctx,
                     uint64_t key_size, uint64_t value_size) {
  auto &pool = *exec_ctx.GetPool();
  new (&table) HashTable(pool, key_size, value_size);
}
89+
90+
/**
 * Tear down a table that was constructed in-place via Init(). Invokes the
 * destructor directly since the storage itself is owned by the caller.
 */
void HashTable::Destroy(HashTable &table) {
  table.~HashTable();
}
91+
92+
/**
 * Carve out storage for one new Entry (header + key + value bytes) from the
 * active memory block, allocating a fresh, larger block first if the current
 * one is exhausted. Bumps the element count.
 *
 * @return A pointer to the new entry; its 'next' link is initialized to null
 */
HashTable::Entry *HashTable::AcquireEntrySlot() {
  // If the active block can't fit another entry, double the tuple capacity,
  // allocate a block sized for it, and push the block onto the block list.
  if (available_bytes_ < entry_size_) {
    capacity_ *= 2;
    uint64_t alloc_size = sizeof(MemoryBlock) + (entry_size_ * capacity_);
    auto *fresh_block =
        reinterpret_cast<MemoryBlock *>(memory_.Allocate(alloc_size));
    fresh_block->next = block_;
    block_ = fresh_block;
    next_tuple_pos_ = fresh_block->data;
    available_bytes_ = alloc_size - sizeof(MemoryBlock);
  }

  // Claim the next slot in the active block
  auto *slot = reinterpret_cast<Entry *>(next_tuple_pos_);
  slot->next = nullptr;

  // Advance the write cursor and update bookkeeping
  next_tuple_pos_ += entry_size_;
  available_bytes_ -= entry_size_;
  num_elems_++;

  return slot;
}
113+
114+
/**
 * Allocate storage for a tuple without inserting it into the directory.
 *
 * In lazy mode, directory slots 0 and 1 double as the head and tail of a
 * singly linked list threading together every inserted entry; the real
 * directory is built later by BuildLazy() once ALL insertions are done.
 *
 * @param hash The tuple's hash value, recorded for later bucketing
 * @return Pointer to the entry's key/value storage area
 */
char *HashTable::StoreTupleLazy(uint64_t hash) {
  // Grab a fresh entry slot and stamp it with the hash
  Entry *fresh = AcquireEntrySlot();
  fresh->hash = hash;

  // Append to the pending-entries list rooted at directory_[0]/[1]
  if (directory_[0] != nullptr) {
    PELOTON_ASSERT(directory_[1] != nullptr);
    directory_[1]->next = fresh;
    directory_[1] = fresh;
  } else {
    // First entry: it is both head and tail
    directory_[0] = fresh;
    directory_[1] = fresh;
  }

  // Hand back the key/value storage area
  return fresh->data;
}
135+
136+
/**
 * Allocate storage for a tuple and immediately link it into its bucket,
 * growing the directory first if the table is over its load-factor target.
 *
 * @param hash The tuple's hash value
 * @return Pointer to the entry's key/value storage area
 */
char *HashTable::StoreTuple(uint64_t hash) {
  // Grow the directory if we've exceeded the load factor
  if (NeedsResize()) {
    Resize();
  }

  // Grab a fresh entry slot and stamp it with the hash
  Entry *fresh = AcquireEntrySlot();
  fresh->hash = hash;

  // Push the entry onto the front of its bucket's chain
  uint64_t bucket = hash & directory_mask_;
  fresh->next = directory_[bucket];
  directory_[bucket] = fresh;

  // Hand back the key/value storage area
  return fresh->data;
}
154+
155+
void HashTable::BuildLazy() {
156+
// Grab entry head
157+
Entry *head = directory_[0];
158+
159+
// Clean up old directory
160+
memory_.Free(directory_);
161+
162+
// At this point, all the lazy insertions are assumed to have completed. We
163+
// can allocate a perfectly sized hash table with 50% load factor.
164+
//
165+
// TODO: Use sketches to estimate the real # of unique elements
166+
// TODO: Perhaps change probing strategy based on estimate?
167+
168+
directory_size_ = NextPowerOf2(num_elems_) * 2;
169+
directory_mask_ = directory_size_ - 1;
170+
directory_ = static_cast<Entry **>(
171+
memory_.Allocate(sizeof(Entry *) * directory_size_));
172+
PELOTON_MEMSET(directory_, 0, directory_size_);
173+
174+
// Now insert all elements into the directory
175+
while (head != nullptr) {
176+
// Compute the target index
177+
// Stash the next linked-list entry into a temporary variable
178+
// Connect the current entry into the bucket chain
179+
// Move along
180+
uint64_t index = head->hash & directory_mask_;
181+
Entry *next = head->next;
182+
head->next = directory_[index];
183+
directory_[index] = head;
184+
head = next;
185+
}
186+
}
187+
188+
void HashTable::ReserveLazy(
189+
const executor::ExecutorContext::ThreadStates &thread_states,
190+
uint32_t hash_table_offset) {
191+
// Determine the total number of tuples stored across each hash table
192+
uint64_t total_size = 0;
193+
for (uint32_t i = 0; i < thread_states.NumThreads(); i++) {
194+
auto *hash_table = reinterpret_cast<HashTable *>(
195+
thread_states.AccessThreadState(i) + hash_table_offset);
196+
total_size += hash_table->NumElements();
197+
}
198+
199+
// TODO: Combine sketches to estimate the true unique # of elements
200+
201+
// Perfectly size the hash table
202+
num_elems_ = 0;
203+
capacity_ = NextPowerOf2(total_size);
204+
205+
directory_size_ = capacity_ * 2;
206+
directory_mask_ = directory_size_ - 1;
207+
directory_ = static_cast<Entry **>(
208+
memory_.Allocate(sizeof(Entry *) * directory_size_));
209+
}
210+
211+
/**
 * Merge a lazily-filled, not-yet-built table into this one.
 *
 * 'other' is expected to be in its pre-BuildLazy state, i.e., other's
 * directory_[0] holds the head of the linked list threading all of its
 * entries (see StoreTupleLazy()). Each entry is transferred into this
 * table's directory with a CAS retry loop, so multiple threads may merge
 * their thread-local tables into the same target concurrently.
 *
 * NOTE(review): assumes this table's directory was already sized and zeroed
 * (via ReserveLazy()) before any merges begin — confirm with callers.
 */
void HashTable::MergeLazyUnfinished(const HashTable &other) {
  auto *head = other.directory_[0];
  while (head != nullptr) {
    // Find the index and stash the next entry in the linked list
    uint64_t index = head->hash & directory_mask_;
    Entry *next = head->next;

    // Try to CAS in this entry into the directory: re-read the bucket head,
    // point our entry at it, and retry if another thread won the race.
    Entry *curr;
    do {
      curr = directory_[index];
      head->next = curr;
    } while (!atomic_cas(directory_ + index, curr, head));

    // Success, move along
    head = next;
  }
}
229+
230+
void HashTable::Resize() {
231+
// Sanity check
232+
PELOTON_ASSERT(NeedsResize());
233+
234+
// Double the capacity
235+
capacity_ *= 2;
236+
237+
// Allocate the new directory with 50% fill factor
238+
uint64_t new_dir_size = capacity_ * 2;
239+
uint64_t new_dir_mask = new_dir_size - 1;
240+
auto *new_dir =
241+
static_cast<Entry **>(memory_.Allocate(sizeof(Entry *) * new_dir_size));
242+
PELOTON_MEMSET(new_dir, 0, new_dir_size);
243+
244+
// Insert all old directory entries into new directory
245+
for (uint32_t i = 0; i < directory_size_; i++) {
246+
auto *entry = directory_[i];
247+
if (entry == nullptr) {
248+
continue;
249+
}
250+
// Traverse bucket chain, reinserting into new table
251+
while (entry != nullptr) {
252+
uint64_t index = entry->hash & new_dir_mask;
253+
Entry *next = entry->next;
254+
entry->next = directory_[index];
255+
directory_[index] = entry;
256+
entry = next;
257+
}
258+
}
259+
260+
// Done. First free the old directory.
261+
memory_.Free(directory_);
262+
263+
// Set up the new directory
264+
directory_size_ = new_dir_size;
265+
directory_mask_ = new_dir_mask;
266+
directory_ = new_dir;
267+
}
268+
269+
} // namespace util
270+
} // namespace codegen
271+
} // namespace peloton

0 commit comments

Comments
 (0)