|
| 1 | +#ifndef VQ_FILTER_VQ_FILTER_H_ |
| 2 | +#define VQ_FILTER_VQ_FILTER_H_ |
| 3 | + |
| 4 | +#include <assert.h> |
| 5 | +#include <algorithm> |
| 6 | + |
| 7 | +#include "hashutil.h" |
| 8 | + |
| 9 | +#include "vqf_filter.h" |
| 10 | +#include "vqf_filter.c" |
| 11 | + |
| 12 | +using namespace std; |
| 13 | +using namespace hashing; |
| 14 | + |
| 15 | +namespace vqfilter { |
// Result code returned by VQFilter operations.
//
// Kept as an unscoped enum with fixed numeric values so that callers can
// compare the result against plain integers (e.g. `0 != filter.Add(x)`).
enum Status {
  // Operation succeeded; for Contain() the key is reported as present.
  Ok = 0,
  // Contain() did not find the key.
  NotFound = 1,
  // Reserved for "filter is full"; not returned by the code in this header.
  NotEnoughSpace = 2,
  // Reserved for unsupported operations; not returned by the code in this
  // header.
  NotSupported = 3
};
| 23 | + |
| 24 | +template <typename ItemType, typename HashFamily = SimpleMixSplit> |
| 25 | +class VQFilter { |
| 26 | + |
| 27 | + vqf_filter *filter; |
| 28 | + uint64_t bytesUsed; |
| 29 | + uint64_t range; |
| 30 | + double bitsPerItem; |
| 31 | + HashFamily hasher; |
| 32 | + |
| 33 | + double BitsPerItem() const { return bitsPerItem; } |
| 34 | + |
| 35 | + void ApplyBlock(uint64_t *tmp, int block, int len); |
| 36 | + |
| 37 | + public: |
| 38 | + explicit VQFilter(const size_t n) : hasher() { |
| 39 | + |
| 40 | + // when inserting in random order |
| 41 | + // uint64_t nslots = (uint64_t) (n / 0.94); |
| 42 | + // when inserting in sorted order |
| 43 | + uint64_t nslots = (uint64_t) (n / 0.89); |
| 44 | + |
| 45 | + if ((filter = vqf_init(nslots)) == NULL) { |
| 46 | + std::cout << "Can't allocate.\n"; |
| 47 | + abort(); |
| 48 | + } |
| 49 | + range = filter->metadata.range; |
| 50 | + bytesUsed = filter->metadata.total_size_in_bytes; |
| 51 | + bitsPerItem = (double) bytesUsed / n; |
| 52 | + |
| 53 | + } |
| 54 | + |
| 55 | + ~VQFilter() { |
| 56 | + free(filter); |
| 57 | + } |
| 58 | + |
| 59 | + // Add an item to the filter. |
| 60 | + Status Add(const ItemType &item); |
| 61 | + |
| 62 | + Status AddAll(const vector<ItemType> &data, const size_t start, const size_t end) { |
| 63 | + return AddAll(data.data(), start, end); |
| 64 | + } |
| 65 | + |
| 66 | + // Add an item to the filter. |
| 67 | + Status AddAll(const ItemType *data, const size_t start, const size_t end); |
| 68 | + |
| 69 | + // Report if the item is inserted, with false positive rate. |
| 70 | + Status Contain(const ItemType &item) const; |
| 71 | + |
| 72 | + /* methods for providing stats */ |
| 73 | + // summary infomation |
| 74 | + std::string Info() const; |
| 75 | + |
| 76 | + // number of current inserted items; |
| 77 | + size_t Size() const { return 0; } |
| 78 | + |
| 79 | + // size of the filter in bytes. |
| 80 | + size_t SizeInBytes() const { return bytesUsed; } |
| 81 | +}; |
| 82 | + |
| 83 | +template <typename ItemType, typename HashFamily> |
| 84 | +Status VQFilter<ItemType, HashFamily>::Add( |
| 85 | + const ItemType &key) { |
| 86 | + uint64_t hash = hasher(key); |
| 87 | + bool ret = vqf_insert(filter, hash); |
| 88 | + if (!ret) { |
| 89 | + std::cout << "failed insertion for key.\n"; |
| 90 | + abort(); |
| 91 | + } |
| 92 | + return Ok; |
| 93 | +} |
| 94 | + |
| 95 | +template <typename ItemType, typename HashFamily> |
| 96 | +Status VQFilter<ItemType, HashFamily>::Contain( |
| 97 | + const ItemType &key) const { |
| 98 | + uint64_t hash = hasher(key); |
| 99 | + bool ret = vqf_is_present(filter, hash); |
| 100 | + return ret ? Ok : NotFound; |
| 101 | +} |
| 102 | + |
// Bulk insertion stages hashes in fixed-size blocks before inserting them.
// log2 of the number of hashes per staging block.
const int blockShift = 15;
// Number of hashes per staging block (2^blockShift = 32768).
const int blockLen = (1 << blockShift);
| 105 | + |
| 106 | +template <typename ItemType, typename HashFamily> |
| 107 | +void VQFilter<ItemType, HashFamily>::ApplyBlock(uint64_t *tmp, int block, int len) { |
| 108 | + // std::cout << "addAll ApplyBlock block " << block << " len " << len << "\n"; |
| 109 | + for (int i = 0; i < len; i++) { |
| 110 | + uint64_t hash = tmp[(block << blockShift) + i]; |
| 111 | + // std::cout << "inserting " << hash << "\n"; |
| 112 | + bool ret = vqf_insert(filter, hash); |
| 113 | + if (!ret) { |
| 114 | + std::cout << "failed insertion for key.\n"; |
| 115 | + abort(); |
| 116 | + } |
| 117 | + } |
| 118 | +} |
| 119 | + |
| 120 | +template <typename ItemType, typename HashFamily> |
| 121 | +Status VQFilter<ItemType, HashFamily>::AddAll( |
| 122 | + const ItemType* keys, const size_t start, const size_t end) { |
| 123 | + /* |
| 124 | + for (size_t i = start; i < end; i++) { |
| 125 | + uint64_t key = keys[i]; |
| 126 | + uint64_t hash = hasher(key); |
| 127 | + std::cout << "adding " << hash << "\n"; |
| 128 | + } |
| 129 | + */ |
| 130 | + int blocks = 1 + (end - start) / blockLen; |
| 131 | + uint64_t *tmp = new uint64_t[blocks * blockLen]; |
| 132 | + int *tmpLen = new int[blocks](); |
| 133 | + // std::cout << "addAll blocks " << blocks << "\n"; |
| 134 | + for (size_t i = start; i < end; i++) { |
| 135 | + uint64_t key = keys[i]; |
| 136 | + uint64_t hash = hasher(key); |
| 137 | + // __uint128_t x = (__uint128_t)key * (__uint128_t)blocks; |
| 138 | + __uint128_t x = (__uint128_t)hash * (__uint128_t)blocks; |
| 139 | + int block = (uint64_t)(x >> 64); |
| 140 | + int len = tmpLen[block]; |
| 141 | + tmp[(block << blockShift) + len] = hash; |
| 142 | + tmpLen[block] = len + 1; |
| 143 | + if (len + 1 == blockLen) { |
| 144 | + ApplyBlock(tmp, block, len + 1); |
| 145 | + tmpLen[block] = 0; |
| 146 | + } |
| 147 | + } |
| 148 | + for (int block = 0; block < blocks; block++) { |
| 149 | + ApplyBlock(tmp, block, tmpLen[block]); |
| 150 | + tmpLen[block] = 0; |
| 151 | + } |
| 152 | + delete[] tmp; |
| 153 | + delete[] tmpLen; |
| 154 | + return Ok; |
| 155 | +} |
| 156 | + |
| 157 | +template <typename ItemType, typename HashFamily> |
| 158 | +std::string VQFilter<ItemType, HashFamily>::Info() const { |
| 159 | + std::stringstream ss; |
| 160 | + ss << "VQFilter Status:\n" |
| 161 | + << "\t\tKeys stored: " << Size() << "\n"; |
| 162 | + if (Size() > 0) { |
| 163 | + ss << "\t\tk: " << BitsPerItem() << "\n"; |
| 164 | + } else { |
| 165 | + ss << "\t\tk: N/A\n"; |
| 166 | + } |
| 167 | + return ss.str(); |
| 168 | +} |
| 169 | +} // namespace vqfilter |
| 170 | +#endif // VQ_FILTER_VQ_FILTER_H_ |
0 commit comments