Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/classify.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ using namespace kraken;
#ifdef EXACT_COUNTING
#ifdef USE_KHSET_FOR_EXACT_COUNTING
#include "khset.h"
using READCOUNTS = ReadCounts< khset64_t >;
using READCOUNTS = ReadCounts< kh::khset64_t >;
#else
#include <unordered_set>
using READCOUNTS = ReadCounts< unordered_set<uint64_t> >;
Expand Down
4 changes: 2 additions & 2 deletions src/count_unique.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ int main(int argc, char **argv) {
uint64_t ctr = 0;

unordered_set<uint64_t> exact_counter;
khset64_t exact_counter_khash;
kh::khset64_t exact_counter_khash;

if (use_stdin) {
uint64_t nr;
Expand Down Expand Up @@ -225,7 +225,7 @@ int main(int argc, char **argv) {
std::uniform_int_distribution<uint64_t> distr;
for (size_t j = 0; j < n_redo; ++j) {
unordered_set<uint64_t> exact_counter1;
khset64_t exact_counter_khash1;
kh::khset64_t exact_counter_khash1;
for(size_t i = 0; i < n_rand; i++) {
if (exact_counting_unordered_set) {
exact_counter1.insert(distr(rng));
Expand Down
51 changes: 27 additions & 24 deletions src/hyperloglogplus.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ static int clz_manual(uint64_t x)
#define __builtin_clz(x) __lzcnt(x)
#define __builtin_clzl(x) __lzcnt64(x)
#endif
// Free-function adapters so call sites can write for_each(set, func)
// regardless of the concrete set type.
//
// Mutable kh::khset32_t: forwards to the set's own for_each member.
template<typename Func>
void for_each(kh::khset32_t &set, const Func &func) {
  set.for_each(func);
}

// Const kh::khset32_t: forwards to the set's own for_each member.
template<typename Func>
void for_each(const kh::khset32_t &set, const Func &func) {
  set.for_each(func);
}
// Invokes func once for each element of a mutable std::unordered_set<uint32_t>.
// Elements are passed as const lvalues, since set elements cannot be modified
// in place.
template<typename Func>
void for_each(std::unordered_set<uint32_t> &set, const Func &func) {
  for (auto it = set.begin(); it != set.end(); ++it) {
    func(*it);
  }
}

// Invokes func once for each element of a const std::unordered_set<uint32_t>.
template<typename Func>
void for_each(const std::unordered_set<uint32_t> &set, const Func &func) {
  for (auto it = set.cbegin(); it != set.cend(); ++it) {
    func(*it);
  }
}

inline uint8_t clz(const uint32_t x, const uint8_t max = 32) {
if (x == 0) { return max; }
Expand Down Expand Up @@ -353,14 +357,14 @@ vector<int> registerHistogram(const vector<uint8_t>& M, uint8_t q) {
return C;
}


// Builds the rank histogram for the sparse representation.
//
// Returns a vector C with q+2 counters. For every entry v of sparseList the
// counter C[getEncodedRank(v, pPrime, p)] is incremented once. Afterwards C[0]
// is set to 2^pPrime minus the number of entries in sparseList.
//
// The list is traversed exactly once, through the for_each adapter, so the
// function works for every SparseListType that has a for_each overload.
// Walking the list a second time would double every counter and subtract each
// entry from m twice.
vector<int> sparseRegisterHistogram(const SparseListType& sparseList, uint8_t pPrime, uint8_t p, uint8_t q){
  vector<int> C(q+2, 0);
  // Shift in size_t rather than int so the width of the count does not depend
  // on the width of int.
  size_t m = static_cast<size_t>(1) << pPrime;
  for_each(sparseList, [&](uint32_t v) {
    ++C[getEncodedRank(v, pPrime, p)];
    --m;
  });
  C[0] = m;
  return C;
}
Expand Down Expand Up @@ -425,14 +429,13 @@ double tau(double x) {
// HyperLogLogPlusMinus class methods

template<>
HyperLogLogPlusMinus<uint64_t>::HyperLogLogPlusMinus(uint8_t precision, bool sparse, uint64_t (*bit_mixer) (uint64_t)):
p(precision), m(1<<precision), sparse(sparse), bit_mixer(bit_mixer) {
HyperLogLogPlusMinus<uint64_t>::HyperLogLogPlusMinus(uint8_t precision, bool sparse):
p(precision), m(1<<precision), sparse(sparse), bit_mixer(), sparseList(SparseListType()) {
if (precision > 18 || precision < 4) {
throw std::invalid_argument("precision (number of register = 2^precision) must be between 4 and 18");
}

if (sparse) {
this->sparseList = SparseListType(); // TODO: if SparseListType is changed, initialize with appropriate size
this->sparseList.reserve(m/4);
} else {
this->M = vector<uint8_t>(m);
Expand All @@ -447,7 +450,6 @@ HyperLogLogPlusMinus<HASH>& HyperLogLogPlusMinus<HASH>::operator= (HyperLogLogPl
n_observed = other.n_observed;
sparse = other.sparse;
sparseList = std::move(other.sparseList);
bit_mixer = other.bit_mixer;
return *this;
}

Expand All @@ -459,16 +461,14 @@ HyperLogLogPlusMinus<HASH>& HyperLogLogPlusMinus<HASH>::operator= (const HyperLo
n_observed = other.n_observed;
sparse = other.sparse;
sparseList = other.sparseList;
bit_mixer = other.bit_mixer;
return *this;
}

// Copy constructor.
//
// Copies the precision (p), register count (m), dense registers (M), observed
// count, sparse flag and sparse list from `other`. bit_mixer is
// value-initialized rather than copied from `other`.
//
// Each member appears exactly once in the initializer list; naming a member
// twice in a mem-initializer list is ill-formed.
template<typename HASH>
HyperLogLogPlusMinus<HASH>::HyperLogLogPlusMinus(const HyperLogLogPlusMinus<HASH>& other):
  p(other.p), m(other.m),
  M(other.M), n_observed(other.n_observed), sparse(other.sparse), bit_mixer(),
  sparseList(other.sparseList) {
}


Expand All @@ -477,8 +477,9 @@ HyperLogLogPlusMinus<HASH>::HyperLogLogPlusMinus(HyperLogLogPlusMinus<HASH>&& ot
p(other.p), m(other.m),
M(std::move(other.M)),
n_observed(other.n_observed), sparse(other.sparse),
sparseList(std::move(other.sparseList)),
bit_mixer(other.bit_mixer) {
bit_mixer(),
sparseList(std::move(other.sparseList))
{
}


Expand Down Expand Up @@ -555,6 +556,7 @@ void HyperLogLogPlusMinus<T>::switchToNormalRepresentation() {
#endif
}


// add sparseList to the registers of M
template<typename T>
void HyperLogLogPlusMinus<T>::addToRegisters(const SparseListType &sparseList) {
Expand All @@ -565,15 +567,14 @@ void HyperLogLogPlusMinus<T>::addToRegisters(const SparseListType &sparseList) {
if (sparseList.size() == 0) {
return;
}
for (auto encoded_hash_value_ptr = sparseList.begin(); encoded_hash_value_ptr != sparseList.end(); ++encoded_hash_value_ptr) {

size_t idx = getIndex(*encoded_hash_value_ptr, p);
for_each(sparseList, [this](uint32_t v) {
size_t idx = getIndex(v, p);
assert_lt(idx,M.size());
uint8_t rank_val = getEncodedRank(*encoded_hash_value_ptr, pPrime, p);
uint8_t rank_val = getEncodedRank(v, pPrime, p);
if (rank_val > this->M[idx]) {
this->M[idx] = rank_val;
}
}
});
}


Expand Down Expand Up @@ -601,7 +602,8 @@ void HyperLogLogPlusMinus<T>::merge(HyperLogLogPlusMinus<T>&& other) {
if (this->sparse && other.sparse) {
// this->merge(static_cast<const HyperLogLogPlusMinus<T>&>(other));
// consider using addHashToSparseList(this->sparseList, val, pPrime) and checking for sizes
this->sparseList.insert(other.sparseList.begin(), other.sparseList.end());
//this->sparseList.insert(other.sparseList.begin(), other.sparseList.end());
for_each(other.sparseList, [this](uint32_t v) {sparseList.insert(v);});
} else if (other.sparse) {
// other is sparse, but this is not
addToRegisters(other.sparseList);
Expand Down Expand Up @@ -642,7 +644,8 @@ void HyperLogLogPlusMinus<T>::merge(const HyperLogLogPlusMinus<T>& other) {
n_observed += other.n_observed;
if (this->sparse && other.sparse) {
// consider using addHashToSparseList(this->sparseList, val, pPrime) and checking for sizes
this->sparseList.insert(other.sparseList.begin(), other.sparseList.end());
//this->sparseList.insert(other.sparseList.begin(), other.sparseList.end());
for_each(other.sparseList, [this](uint32_t v) {sparseList.insert(v);});
} else if (other.sparse) {
// other is sparse, but this is not
addToRegisters(other.sparseList);
Expand Down Expand Up @@ -685,14 +688,14 @@ uint64_t HyperLogLogPlusMinus<uint64_t>::flajoletCardinality(bool use_sparse_pre
} else{
// For testing purposes. Put sparse list into a standard register
M = vector<uint8_t>(m, 0);
for (const auto& val : sparseList) {
for_each(sparseList, [&](uint32_t val) {
size_t idx = getIndex(val, p);
assert_lt(idx,M.size());
uint8_t rank_val = getEncodedRank(val, pPrime, p);
if (rank_val > M[idx]) {
M[idx] = rank_val;
}
}
}); // for_each
}
}
double est = calculateRawEstimate(M);
Expand Down
16 changes: 14 additions & 2 deletions src/hyperloglogplus.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

#include<vector>
#include<unordered_set>
#include "khset.h"
using namespace std;

//#define HLL_DEBUG
Expand All @@ -44,7 +45,11 @@ uint64_t murmurhash3_finalizer (uint64_t key);

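// Heule et al. encode the sparse list with variable-length encoding
// (see section 5.3.2). This implementation instead stores the sparse list in a
// hash set: kh::khset32_t by default, or unordered_set<uint32_t> when
// USE_SLOW_UNORDERED_SET is defined to a non-zero value.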
// Container type backing the sparse representation, chosen at compile time.
#if USE_SLOW_UNORDERED_SET
using SparseListType = unordered_set<uint32_t>;
#else
using SparseListType = kh::khset32_t;
#endif
// Other possible SparseList types:
// // typedef vector<uint32_t> SparseListType;
// The sorted vector implementation is pretty inefficient currently, as the vector
Expand All @@ -57,6 +62,12 @@ typedef unordered_set<uint32_t> SparseListType;
* HyperLogLogPlusMinus class for counting the number of unique 64-bit values in a stream
* Note that only HASH=uint64_t is implemented.
*/


// Stateless function object that forwards its argument to
// murmurhash3_finalizer() and returns the result.
struct Murmur3Finalizer {
  uint64_t operator()(uint64_t key) const {
    return murmurhash3_finalizer(key);
  }
};

template<typename HASH>
class HyperLogLogPlusMinus {

Expand All @@ -67,8 +78,8 @@ class HyperLogLogPlusMinus {
uint64_t n_observed = 0;

bool sparse; // sparse representation of the data?
const Murmur3Finalizer bit_mixer;
SparseListType sparseList;
HASH (*bit_mixer) (uint64_t);

// sparse versions of p and m
static const uint8_t pPrime = 25; // precision when using a sparse representation
Expand All @@ -82,7 +93,7 @@ class HyperLogLogPlusMinus {
bool use_n_observed = true; // return min(estimate, n_observed) instead of estimate

// Construct HLL with precision bits
HyperLogLogPlusMinus(uint8_t precision=12, bool sparse=true, HASH (*bit_mixer) (uint64_t) = murmurhash3_finalizer);
HyperLogLogPlusMinus(uint8_t precision=12, bool sparse=true);
HyperLogLogPlusMinus(const HyperLogLogPlusMinus<HASH>& other);
HyperLogLogPlusMinus(HyperLogLogPlusMinus<HASH>&& other);
HyperLogLogPlusMinus<HASH>& operator= (HyperLogLogPlusMinus<HASH>&& other);
Expand Down Expand Up @@ -119,4 +130,5 @@ class HyperLogLogPlusMinus {

};


#endif /* HYPERLOGLOGPLUS_H_ */
Loading