Skip to content

Commit 575078e

Browse files
authored
Ensures miBf is deterministic (#150)
* Ensures miBf is deterministic
* remove extra space
* add dev only function description
* ensure all insert functions are equivalent
* remove unused variable
1 parent 10c51ee commit 575078e

File tree

2 files changed

+68
-18
lines changed

2 files changed

+68
-18
lines changed

goldrush_path/MIBFConstructSupport.hpp

Lines changed: 29 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -202,13 +202,13 @@ class MIBFConstructSupport
202202
values.set_empty_key(miBF.size());
203203
while (itr != itr.end()) {
204204
for (unsigned i = 0; i < m_h; ++i) {
205-
values.insert((*itr)[i]);
205+
values.insert(miBF.getRankPos((*itr)[i]));
206206
}
207207
++itr;
208208
}
209209
for (hashSet::iterator itr = values.begin(); itr != values.end(); itr++) {
210210
uint64_t randomSeed = *itr ^ id;
211-
uint64_t rank = miBF.getRankPos(*itr);
211+
uint64_t rank = *itr;
212212
T count = __sync_add_and_fetch(&m_counts[rank], 1);
213213
T randomNum = std::hash<T>{}(randomSeed) % count;
214214
if (randomNum == count - 1) {
@@ -227,15 +227,19 @@ class MIBFConstructSupport
227227
#pragma omp parallel for
228228
#endif
229229
for (size_t i = 0; i < hash_vec.size(); ++i) {
230-
const auto& hash = hash_vec[i];
231-
uint64_t randomSeed = hash ^ id;
232-
uint64_t rank = miBF.getRankPos(hash);
233-
T count = __sync_add_and_fetch(&m_counts[rank], 1);
234-
T randomNum = std::hash<T>{}(randomSeed) % count;
235-
// std::cerr << "id: " << id << " randomNum: " << randomNum << "
236-
// randomSeed: " << randomSeed <<std::endl;
237-
if (randomNum == count - 1) {
238-
miBF.setData(rank, id);
230+
hashSet values;
231+
values.set_empty_key(miBF.size());
232+
for (size_t i = 0; i < hash_vec.size(); ++i) {
233+
values.insert(miBF.getRankPos(hash_vec[i]));
234+
}
235+
for (hashSet::iterator itr = values.begin(); itr != values.end(); itr++) {
236+
uint64_t randomSeed = *itr ^ id;
237+
uint64_t rank = *itr;
238+
T count = __sync_add_and_fetch(&m_counts[rank], 1);
239+
T randomNum = std::hash<T>{}(randomSeed) % count;
240+
if (randomNum == count - 1) {
241+
miBF.setData(rank, id);
242+
}
239243
}
240244
}
241245
}
@@ -248,21 +252,28 @@ class MIBFConstructSupport
248252
{
249253
// assert(m_isBVMade & !m_isMIBFMade);
250254
// get positions
255+
hashSet values;
256+
values.set_empty_key(miBF.size());
251257
size_t vec_size = hash_vec[0].size();
252258
size_t num_elements = 0;
253259
for (size_t i = start; i < end; ++i) {
254260
num_elements += hash_vec[i].size();
255261
}
256262

257-
#if _OPENMP
258-
#pragma omp parallel for
259-
#endif
260263
for (size_t i = 0; i < num_elements; ++i) {
261264
size_t vec_num = i / vec_size;
262265
size_t hash_loc = i % vec_size;
263266
const auto& hash = hash_vec[start + vec_num][hash_loc];
264-
uint64_t randomSeed = hash ^ id;
265-
uint64_t rank = miBF.getRankPos(hash);
267+
values.insert(miBF.getRankPos(hash));
268+
}
269+
270+
std::vector<uint64_t> unique_values(values.begin(), values.end());
271+
#if _OPENMP
272+
#pragma omp parallel for
273+
#endif
274+
for (size_t i = 0; i < unique_values.size(); ++i) {
275+
const auto& rank = unique_values[i];
276+
uint64_t randomSeed = rank ^ id;
266277
T count = __sync_add_and_fetch(&m_counts[rank], 1);
267278
T randomNum = std::hash<T>{}(randomSeed) % count;
268279
if (randomNum == count - 1) {
@@ -283,14 +294,14 @@ class MIBFConstructSupport
283294
for (unsigned i = 0; i < m_h; ++i) {
284295
const std::vector<uint64_t> hash{ (*itr)[i] };
285296
if (solid_vec[i]->contains(hash)) {
286-
values.insert(hash[0]);
297+
values.insert(miBF.getRankPos(hash[0]));
287298
}
288299
}
289300
++itr;
290301
}
291302
for (hashSet::iterator itr = values.begin(); itr != values.end(); itr++) {
292303
uint64_t randomSeed = *itr ^ id;
293-
uint64_t rank = miBF.getRankPos(*itr);
304+
uint64_t rank = *itr;
294305
T count = __sync_add_and_fetch(&m_counts[rank], 1);
295306
T randomNum = std::hash<T>{}(randomSeed) % count;
296307
if (randomNum == count - 1) {

goldrush_path/MIBloomFilter.hpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,45 @@ class MIBloomFilter
120120
return colliCount;
121121
}
122122

123+
/*
124+
* Stores the filter as a binary file to the path specified
125+
* Stores uncompressed because the random data tends to
126+
* compress poorly anyway
127+
* FOR DEVELOPMENT PURPOSES ONLY
128+
*/
129+
void store(string const& filterFilePath) const
130+
{
131+
132+
#pragma omp parallel for
133+
for (unsigned i = 0; i < 2; ++i) {
134+
if (i == 0) {
135+
ofstream myFile(filterFilePath.c_str(), ios::out | ios::binary);
136+
137+
assert(myFile);
138+
myFile.write(reinterpret_cast<const char*>(m_data.data()), m_dSize * sizeof(T));
139+
140+
myFile.close();
141+
assert(myFile);
142+
143+
FILE* file = fopen(filterFilePath.c_str(), "rb");
144+
if (file == NULL) {
145+
cerr << "file \"" << filterFilePath << "\" could not be read." << endl;
146+
exit(1);
147+
}
148+
} else {
149+
string bvFilename = filterFilePath + ".sdsl";
150+
// cerr << "Storing sdsl interleaved bit vector to: " << bvFilename
151+
// << endl;
152+
store_to_file(m_bv, bvFilename);
153+
// cerr << "Number of bit vector buckets is " << m_bv.size()
154+
// << endl;
155+
// cerr << "Uncompressed bit vector size is "
156+
// << (m_bv.size() + m_bv.size() * 64 / BLOCKSIZE) / 8
157+
// << " bytes" << endl;
158+
}
159+
}
160+
}
161+
123162
/*
124163
* Constructor using a prebuilt bitvector
125164
*/

0 commit comments

Comments
 (0)