Prepare for fuse filter (implementation currently is a copy of the xor filter)

thomasmueller · thomasmueller · commit 64e5eae6167b · 2020-01-23T08:47:11.000+01:00
diff --git a/benchmarks/bulk-insert-and-query.cc b/benchmarks/bulk-insert-and-query.cc
@@ -33,6 +33,7 @@
 #include "xorfilter_2n.h"
 #include "xorfilter_plus.h"
 #include "xorfilter_singleheader.h"
+#include "xor_fuse_filter.h"
 #include "bloom.h"
 #include "counting_bloom.h"
 #include "gcs.h"
@@ -54,6 +55,7 @@ using namespace xorfilter;
 using namespace xorfilter2;
 using namespace xorfilter2n;
 using namespace xorfilter_plus;
+using namespace xorfusefilter;
 using namespace bloomfilter;
 using namespace counting_bloomfilter;
 using namespace gcsfilter;
@@ -313,6 +315,24 @@ struct FilterAPI<XorFilter<ItemType, FingerprintType>> {
   }
 };
 
+template <typename ItemType, typename FingerprintType>
+struct FilterAPI<XorFuseFilter<ItemType, FingerprintType>> {
+  using Table = XorFuseFilter<ItemType, FingerprintType>;
+  static Table ConstructFromAddCount(size_t add_count) { return Table(add_count); }
+  static void Add(uint64_t key, Table* table) {
+    throw std::runtime_error("Unsupported");
+  }
+  static void AddAll(const vector<ItemType> keys, const size_t start, const size_t end, Table* table) {
+    table->AddAll(keys, start, end);
+  }
+  static void Remove(uint64_t key, Table * table) {
+    throw std::runtime_error("Unsupported");
+  }
+  CONTAIN_ATTRIBUTES static bool Contain(uint64_t key, const Table * table) {
+    return (0 == table->Contain(key));
+  }
+};
+
 class MortonFilter {
     Morton3_8* filter;
     size_t size;
@@ -981,6 +1001,8 @@ int main(int argc, char * argv[]) {
     {70, "Xor8-singleheader"},
     {80, "Morton"},
 
+    {90, "XorFuse8"},
+
     // Sort
     {100, "Sort"},
   };
@@ -1453,6 +1475,15 @@ int main(int argc, char * argv[]) {
       cout << setw(NAME_WIDTH) << names[a] << cf << endl;
   }
 
+  // Xor Fuse Filter ----------------------------------------------------------
+  a = 90;
+  if (algorithmId == a || algorithmId < 0 || (algos.find(a) != algos.end())) {
+      auto cf = FilterBenchmark<
+          XorFuseFilter<uint64_t, uint8_t>>(
+          add_count, to_add, distinct_add, to_lookup, distinct_lookup, intersectionsize, hasduplicates, mixed_sets, seed, true);
+      cout << setw(NAME_WIDTH) << names[a] << cf << endl;
+  }
+
   // Sort ----------------------------------------------------------
   a = 100;
   if (algorithmId == a || algorithmId < 0 || (algos.find(a) != algos.end())) {
diff --git a/src/xorfilter/xor_fuse_filter.h b/src/xorfilter/xor_fuse_filter.h
@@ -0,0 +1,321 @@
+#ifndef XOR_FUSE_FILTER_XOR_FILTER_H_
+#define XOR_FUSE_FILTER_XOR_FILTER_H_
+
+#include <assert.h>
+#include <algorithm>
+#include "hashutil.h"
+
+using namespace std;
+using namespace hashing;
+
+namespace xorfusefilter {
+// status returned by a xor filter operation
+enum Status {
+  Ok = 0,
+  NotFound = 1,
+  NotEnoughSpace = 2,
+  NotSupported = 3,
+};
+
+inline uint64_t rotl64(uint64_t n, unsigned int c) {
+    // assumes width is a power of 2
+    const unsigned int mask = (CHAR_BIT * sizeof(n) - 1);
+    // assert ( (c<=mask) &&"rotate by type width or more");
+    c &= mask;
+    return (n << c) | ( n >> ((-c) & mask));
+}
+
+__attribute__((always_inline))
+inline uint32_t reduce(uint32_t hash, uint32_t n) {
+    // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+    return (uint32_t) (((uint64_t) hash * n) >> 32);
+}
+
+size_t getHashFromHash(uint64_t hash, int index, int blockLength) {
+    uint32_t r = rotl64(hash, index * 21);
+    return (size_t) reduce(r, blockLength) + index * blockLength;
+}
+
+template <typename ItemType, typename FingerprintType,
+          typename HashFamily = TwoIndependentMultiplyShift>
+class XorFuseFilter {
+ public:
+
+  size_t size;
+  size_t arrayLength;
+  size_t blockLength;
+  FingerprintType *fingerprints;
+
+  HashFamily* hasher;
+
+  inline FingerprintType fingerprint(const uint64_t hash) const {
+    return (FingerprintType) hash ^ (hash >> 32);
+  }
+
+  explicit XorFuseFilter(const size_t size) {
+    hasher = new HashFamily();
+    this->size = size;
+    this->arrayLength = 32 + 1.23 * size;
+    this->blockLength = arrayLength / 3;
+    fingerprints = new FingerprintType[arrayLength]();
+    std::fill_n(fingerprints, arrayLength, 0);
+  }
+
+  ~XorFuseFilter() {
+    delete[] fingerprints;
+    delete hasher;
+  }
+
+  Status AddAll(const vector<ItemType> &data, const size_t start, const size_t end) {
+      return AddAll(data.data(),start,end);
+  }
+
+  Status AddAll(const ItemType* data, const size_t start, const size_t end);
+
+  // Report if the item is inserted, with false positive rate.
+  Status Contain(const ItemType &item) const;
+
+  /* methods for providing stats  */
+  // summary infomation
+  std::string Info() const;
+
+  // number of current inserted items;
+  size_t Size() const { return size; }
+
+  // size of the filter in bytes.
+  size_t SizeInBytes() const { return arrayLength * sizeof(FingerprintType); }
+};
+
+struct t2val {
+  uint64_t t2;
+  uint64_t t2count;
+};
+
+typedef struct t2val t2val_t;
+
+const int blockShift = 18;
+
+void applyBlock(uint64_t* tmp, int b, int len, t2val_t * t2vals) {
+    for (int i = 0; i < len; i += 2) {
+        uint64_t x = tmp[(b << blockShift) + i];
+        int index = (int) tmp[(b << blockShift) + i + 1];
+        t2vals[index].t2count++;
+        t2vals[index].t2 ^= x;
+    }
+}
+
+int applyBlock2(uint64_t* tmp, int b, int len, t2val_t * t2vals, int* alone, int alonePos) {
+    for (int i = 0; i < len; i += 2) {
+        uint64_t hash = tmp[(b << blockShift) + i];
+        int index = (int) tmp[(b << blockShift) + i + 1];
+        int oldCount = t2vals[index].t2count;
+        if (oldCount >= 1) {
+            int newCount = oldCount - 1;
+            t2vals[index].t2count = newCount;
+            if (newCount == 1) {
+                alone[alonePos++] = index;
+            }
+            t2vals[index].t2 ^= hash;
+        }
+    }
+    return alonePos;
+}
+
+template <typename ItemType, typename FingerprintType,
+          typename HashFamily>
+Status XorFuseFilter<ItemType, FingerprintType, HashFamily>::AddAll(
+    const ItemType* keys, const size_t start, const size_t end) {
+
+    int m = arrayLength;
+    uint64_t* reverseOrder = new uint64_t[size];
+    uint8_t* reverseH = new uint8_t[size];
+    size_t reverseOrderPos;
+    int hashIndex = 0;
+    t2val_t * t2vals = new t2val_t[m];
+    while (true) {
+        memset(t2vals, 0, sizeof(t2val_t[m]));
+        int blocks = 1 + ((3 * blockLength) >> blockShift);
+        uint64_t* tmp = new uint64_t[blocks << blockShift];
+        int* tmpc = new int[blocks]();
+        for(size_t i = start; i < end; i++) {
+            uint64_t k = keys[i];
+            uint64_t hash = (*hasher)(k);
+            for (int hi = 0; hi < 3; hi++) {
+                int index = getHashFromHash(hash, hi, blockLength);
+                int b = index >> blockShift;
+                int i2 = tmpc[b];
+                tmp[(b << blockShift) + i2] = hash;
+                tmp[(b << blockShift) + i2 + 1] = index;
+                tmpc[b] += 2;
+                if (i2 + 2 == (1 << blockShift)) {
+                    applyBlock(tmp, b, i2 + 2, t2vals);
+                    tmpc[b] = 0;
+                }
+            }
+
+        }
+        for (int b = 0; b < blocks; b++) {
+            applyBlock(tmp, b, tmpc[b], t2vals);
+        }
+        delete[] tmp;
+        delete[] tmpc;
+        reverseOrderPos = 0;
+
+        int* alone = new int[arrayLength];
+        int alonePos = 0;
+        for (size_t i = 0; i < arrayLength; i++) {
+            if (t2vals[i].t2count == 1) {
+                alone[alonePos++] = i;
+            }
+        }
+        tmp = new uint64_t[blocks << blockShift];
+        tmpc = new int[blocks]();
+        reverseOrderPos = 0;
+        int bestBlock = -1;
+        while (reverseOrderPos < size) {
+            if (alonePos == 0) {
+                // we need to apply blocks until we have an entry that is alone
+                // (that is, until alonePos > 0)
+                // so, find a large block (the larger the better)
+                // but don't need to search very long
+                // start searching where we stopped the last time
+                // (to make it more even)
+                for (int i = 0, b = bestBlock + 1, best = -1; i < blocks; i++) {
+                    if (b >= blocks) {
+                        b = 0;
+                    }
+                    if (tmpc[b] > best) {
+                        best = tmpc[b];
+                        bestBlock = b;
+                        if (best > (1 << (blockShift - 1))) {
+                            // sufficiently large: stop
+                            break;
+                        }
+                    }
+                }
+                if (tmpc[bestBlock] > 0) {
+                    alonePos = applyBlock2(tmp, bestBlock, tmpc[bestBlock], t2vals, alone, alonePos);
+                    tmpc[bestBlock] = 0;
+                }
+                // applying a block may not actually result in a new entry that is alone
+                if (alonePos == 0) {
+                    for (int b = 0; b < blocks && alonePos == 0; b++) {
+                        if (tmpc[b] > 0) {
+                            alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
+                            tmpc[b] = 0;
+                        }
+                    }
+                }
+            }
+            if (alonePos == 0) {
+                break;
+            }
+            int i = alone[--alonePos];
+            int b = i >> blockShift;
+            if (tmpc[b] > 0) {
+                alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
+                tmpc[b] = 0;
+            }
+            uint8_t found = -1;
+            if (t2vals[i].t2count == 0) {
+                continue;
+            }
+            long hash = t2vals[i].t2;
+            for (int hi = 0; hi < 3; hi++) {
+                int h = getHashFromHash(hash, hi, blockLength);
+                if (h == i) {
+                    found = (uint8_t) hi;
+                    t2vals[i].t2count = 0;
+                } else {
+                    int b = h >> blockShift;
+                    int i2 = tmpc[b];
+                    tmp[(b << blockShift) + i2] = hash;
+                    tmp[(b << blockShift) + i2 + 1] = h;
+                    tmpc[b] += 2;
+                    if (tmpc[b] >= 1 << blockShift) {
+                        alonePos = applyBlock2(tmp, b, tmpc[b], t2vals, alone, alonePos);
+                        tmpc[b] = 0;
+                    }
+                }
+            }
+            reverseOrder[reverseOrderPos] = hash;
+            reverseH[reverseOrderPos] = found;
+            reverseOrderPos++;
+        }
+        delete[] tmp;
+        delete[] tmpc;
+        delete[] alone;
+
+        if (reverseOrderPos == size) {
+            break;
+        }
+
+        std::cout << "WARNING: hashIndex " << hashIndex << "\n";
+        if (hashIndex >= 0) {
+            std::cout << (end - start) << " keys; arrayLength " << arrayLength
+                << " blockLength " << blockLength
+                << " reverseOrderPos " << reverseOrderPos << "\n";
+        }
+
+        hashIndex++;
+
+        // use a new random numbers
+        delete hasher;
+        hasher = new HashFamily();
+
+    }
+
+    for (int i = reverseOrderPos - 1; i >= 0; i--) {
+        // the hash of the key we insert next
+        uint64_t hash = reverseOrder[i];
+        int found = reverseH[i];
+        // which entry in the table we can change
+        int change = -1;
+        // we set table[change] to the fingerprint of the key,
+        // unless the other two entries are already occupied
+        FingerprintType xor2 = fingerprint(hash);
+        for (int hi = 0; hi < 3; hi++) {
+            size_t h = getHashFromHash(hash, hi, blockLength);
+            if (found == hi) {
+                change = h;
+            } else {
+                // this is different from BDZ: using xor to calculate the
+                // fingerprint
+                xor2 ^= fingerprints[h];
+            }
+        }
+        fingerprints[change] = xor2;
+    }
+    delete [] t2vals;
+    delete [] reverseOrder;
+    delete [] reverseH;
+
+    return Ok;
+}
+
+template <typename ItemType, typename FingerprintType,
+          typename HashFamily>
+Status XorFuseFilter<ItemType, FingerprintType, HashFamily>::Contain(
+    const ItemType &key) const {
+    uint64_t hash = (*hasher)(key);
+    FingerprintType f = fingerprint(hash);
+    uint32_t r0 = (uint32_t) hash;
+    uint32_t r1 = (uint32_t) rotl64(hash, 21);
+    uint32_t r2 = (uint32_t) rotl64(hash, 42);
+    uint32_t h0 = reduce(r0, blockLength);
+    uint32_t h1 = reduce(r1, blockLength) + blockLength;
+    uint32_t h2 = reduce(r2, blockLength) + 2 * blockLength;
+    f ^= fingerprints[h0] ^ fingerprints[h1] ^ fingerprints[h2];
+    return f == 0 ? Ok : NotFound;
+}
+
+template <typename ItemType, typename FingerprintType,
+          typename HashFamily>
+std::string XorFuseFilter<ItemType, FingerprintType, HashFamily>::Info() const {
+  std::stringstream ss;
+  ss << "XorFuseFilter Status:\n"
+     << "\t\tKeys stored: " << Size() << "\n";
+  return ss.str();
+}
+}  // namespace xorfusefilter
+#endif  // XOR_FUSE_FILTER_XOR_FILTER_H_