Skip to content

Commit 69a5f8b

Browse files
gavinandresen authored and sipa committed
Rolling bloom filter class
For when you need to keep track of the last N items you've seen, and can tolerate some false-positives. Rebased-by: Pieter Wuille <[email protected]>
1 parent 8a10000 commit 69a5f8b

File tree

3 files changed

+173
-16
lines changed

3 files changed

+173
-16
lines changed

src/bloom.cpp

Lines changed: 67 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -21,22 +21,33 @@
2121
using namespace std;
2222

2323
CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn, unsigned char nFlagsIn) :
24-
/**
25-
* The ideal size for a bloom filter with a given number of elements and false positive rate is:
26-
* - nElements * log(fp rate) / ln(2)^2
27-
* We ignore filter parameters which will create a bloom filter larger than the protocol limits
28-
*/
29-
vData(min((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)), MAX_BLOOM_FILTER_SIZE * 8) / 8),
30-
/**
31-
* The ideal number of hash functions is filter size * ln(2) / number of elements
32-
* Again, we ignore filter parameters which will create a bloom filter with more hash functions than the protocol limits
33-
* See https://en.wikipedia.org/wiki/Bloom_filter for an explanation of these formulas
34-
*/
35-
isFull(false),
36-
isEmpty(false),
37-
nHashFuncs(min((unsigned int)(vData.size() * 8 / nElements * LN2), MAX_HASH_FUNCS)),
38-
nTweak(nTweakIn),
39-
nFlags(nFlagsIn)
24+
/**
25+
* The ideal size for a bloom filter with a given number of elements and false positive rate is:
26+
* - nElements * log(fp rate) / ln(2)^2
27+
* We ignore filter parameters which will create a bloom filter larger than the protocol limits
28+
*/
29+
vData(min((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)), MAX_BLOOM_FILTER_SIZE * 8) / 8),
30+
/**
31+
* The ideal number of hash functions is filter size * ln(2) / number of elements
32+
* Again, we ignore filter parameters which will create a bloom filter with more hash functions than the protocol limits
33+
* See https://en.wikipedia.org/wiki/Bloom_filter for an explanation of these formulas
34+
*/
35+
isFull(false),
36+
isEmpty(false),
37+
nHashFuncs(min((unsigned int)(vData.size() * 8 / nElements * LN2), MAX_HASH_FUNCS)),
38+
nTweak(nTweakIn),
39+
nFlags(nFlagsIn)
40+
{
41+
}
42+
43+
// Private constructor used by CRollingBloomFilter.
// Uses the same sizing expressions as the public constructor, but without the
// min() clamps against MAX_BLOOM_FILTER_SIZE and MAX_HASH_FUNCS, and always
// sets nFlags to BLOOM_UPDATE_NONE.
// NOTE(review): the nHashFuncs expression divides by nElements, so
// nElements == 0 looks like a division by zero -- confirm callers never pass 0.
// NOTE(review): the size expression is only non-negative for nFPRate < 1
// (log(nFPRate) < 0) -- confirm callers respect that.
44+
CBloomFilter::CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweakIn) :
45+
// Element count for vData: the sizing formula's result divided by 8
// (presumably bits -> bytes; vData's type is not visible here).
vData((unsigned int)(-1 / LN2SQUARED * nElements * log(nFPRate)) / 8),
46+
isFull(false),
47+
// Unlike the public constructor, this one starts with isEmpty set.
isEmpty(true),
48+
// Derived from the size vData was just given, so it relies on vData
// being initialized first.
nHashFuncs((unsigned int)(vData.size() * 8 / nElements * LN2)),
49+
nTweak(nTweakIn),
50+
nFlags(BLOOM_UPDATE_NONE)
4051
{
4152
}
4253

@@ -197,3 +208,43 @@ void CBloomFilter::UpdateEmptyFull()
197208
isFull = full;
198209
isEmpty = empty;
199210
}
211+
212+
/**
 * Constructs the two underlying filters, each for nElements * 2 entries with
 * the given false-positive rate and the same tweak, and resets the insertion
 * counter.
 */
CRollingBloomFilter::CRollingBloomFilter(unsigned int nElements, double fpRate, unsigned int nTweak) :
213+
b1(nElements * 2, fpRate, nTweak), b2(nElements * 2, fpRate, nTweak)
214+
{
215+
// Implemented using two bloom filters of 2 * nElements each.
216+
// We fill them up, and clear them, staggered, every nElements
217+
// inserted, so at least one always contains the last nElements
218+
// inserted.
219+
// nBloomSize is the insertion count at which insert() wraps nInsertions.
nBloomSize = nElements * 2;
220+
nInsertions = 0;
221+
}
222+
223+
/**
 * Adds vKey to both underlying filters.
 * Before inserting, b1 is cleared when nInsertions is 0 and b2 is cleared
 * when nInsertions is nBloomSize / 2. After inserting, the counter is
 * incremented and wraps back to 0 once it reaches nBloomSize.
 */
void CRollingBloomFilter::insert(const std::vector<unsigned char>& vKey)
224+
{
225+
// Start of a cycle: reset b1 before it takes this key.
if (nInsertions == 0) {
226+
b1.clear();
227+
// Halfway through the cycle: reset b2 instead.
} else if (nInsertions == nBloomSize / 2) {
228+
b2.clear();
229+
}
230+
// Every key goes into both filters.
b1.insert(vKey);
231+
b2.insert(vKey);
232+
// Wrap the counter so the next insert starts a new cycle.
if (++nInsertions == nBloomSize) {
233+
nInsertions = 0;
234+
}
235+
}
236+
237+
/**
 * Reports whether vKey may have been inserted recently.
 * Consults b2 while nInsertions < nBloomSize / 2, and b1 otherwise; only one
 * of the two filters is queried per call.
 */
bool CRollingBloomFilter::contains(const std::vector<unsigned char>& vKey) const
238+
{
239+
// First half of the cycle: b1 was reset by insert() at count 0, so ask b2.
if (nInsertions < nBloomSize / 2) {
240+
return b2.contains(vKey);
241+
}
242+
return b1.contains(vKey);
243+
}
244+
245+
/** Clears both underlying filters and resets the insertion counter to 0. */
void CRollingBloomFilter::clear()
246+
{
247+
b1.clear();
248+
b2.clear();
249+
nInsertions = 0;
250+
}

src/bloom.h

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ class CBloomFilter
5353

5454
unsigned int Hash(unsigned int nHashNum, const std::vector<unsigned char>& vDataToHash) const;
5555

56+
// Private constructor for CRollingBloomFilter, no restrictions on size.
// Takes no flags argument, unlike the public constructor.
57+
CBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweak);
58+
// Grants CRollingBloomFilter access to the private constructor above.
friend class CRollingBloomFilter;
59+
5660
public:
5761
/**
5862
* Creates a new bloom filter which will provide the given fp rate when filled with the given number of elements
@@ -97,4 +101,28 @@ class CBloomFilter
97101
void UpdateEmptyFull();
98102
};
99103

104+
/**
105+
* CRollingBloomFilter is a probabilistic "keep track of most recently inserted" set.
106+
* Construct it with the number of items to keep track of, a false-positive rate,
* and a tweak value that is passed to both underlying bloom filters.
107+
*
108+
* contains(item) will always return true if item was one of the last N things
109+
* insert()'ed ... but may also return true for items that were not inserted.
110+
*/
111+
class CRollingBloomFilter
112+
{
113+
public:
114+
CRollingBloomFilter(unsigned int nElements, double nFPRate, unsigned int nTweak);
115+
116+
// Adds vKey to the set of recently seen items.
void insert(const std::vector<unsigned char>& vKey);
117+
// May return true for keys that were never inserted (false positives).
bool contains(const std::vector<unsigned char>& vKey) const;
118+
119+
// Empties both underlying filters and restarts the insertion cycle.
void clear();
120+
121+
private:
122+
// Set to nElements * 2 by the constructor; insert() wraps nInsertions at this value.
unsigned int nBloomSize;
123+
// Insertions since the counter last wrapped (or since construction / clear()).
unsigned int nInsertions;
124+
// The two filters; insert() clears them at staggered points of the cycle.
CBloomFilter b1, b2;
125+
};
126+
127+
100128
#endif // BITCOIN_BLOOM_H

src/test/bloom_tests.cpp

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "clientversion.h"
99
#include "key.h"
1010
#include "merkleblock.h"
11+
#include "random.h"
1112
#include "serialize.h"
1213
#include "streams.h"
1314
#include "uint256.h"
@@ -459,4 +460,81 @@ BOOST_AUTO_TEST_CASE(merkle_block_4_test_update_none)
459460
BOOST_CHECK(!filter.contains(COutPoint(uint256S("0x02981fa052f0481dbc5868f4fc2166035a10f27a03cfd2de67326471df5bc041"), 0)));
460461
}
461462

463+
// Test helper: returns the bytes of a fresh GetRandHash() value as a vector,
// for use as a bloom filter key.
static std::vector<unsigned char> RandomData()
464+
{
465+
uint256 r = GetRandHash();
466+
return std::vector<unsigned char>(r.begin(), r.end());
467+
}
468+
469+
// Exercises CRollingBloomFilter: recent entries are always remembered, the
// observed false-positive count stays within loose bounds, clear() forgets
// everything, and old entries are eventually dropped.
BOOST_AUTO_TEST_CASE(rolling_bloom)
470+
{
471+
// last-100-entry, 1% false positive:
472+
CRollingBloomFilter rb1(100, 0.01, 0);
473+
474+
// Overfill:
475+
static const int DATASIZE=399;
476+
std::vector<unsigned char> data[DATASIZE];
477+
for (int i = 0; i < DATASIZE; i++) {
478+
data[i] = RandomData();
479+
rb1.insert(data[i]);
480+
}
481+
// Last 100 guaranteed to be remembered:
482+
for (int i = 299; i < DATASIZE; i++) {
483+
BOOST_CHECK(rb1.contains(data[i]));
484+
}
485+
486+
// The false positive rate is 1%, so we should get about 100 hits when
487+
// testing 10,000 random keys. We get worst-case false positive
488+
// behavior when the filter is as full as possible, which is when the
489+
// number of insertions is one less than an integer multiple of nElements*2.
490+
unsigned int nHits = 0;
491+
for (int i = 0; i < 10000; i++) {
492+
if (rb1.contains(RandomData()))
493+
++nHits;
494+
}
495+
// Run test_bitcoin with --log_level=message to see BOOST_TEST_MESSAGEs:
496+
BOOST_TEST_MESSAGE("RollingBloomFilter got " << nHits << " false positives (~100 expected)");
497+
498+
// Insanely unlikely to get a fp count outside this range:
499+
BOOST_CHECK(nHits > 25);
500+
BOOST_CHECK(nHits < 175);
501+
502+
// clear() must forget even the most recently inserted entry:
BOOST_CHECK(rb1.contains(data[DATASIZE-1]));
503+
rb1.clear();
504+
BOOST_CHECK(!rb1.contains(data[DATASIZE-1]));
505+
506+
// Now roll through data, make sure last 100 entries
507+
// are always remembered:
508+
for (int i = 0; i < DATASIZE; i++) {
509+
if (i >= 100)
510+
BOOST_CHECK(rb1.contains(data[i-100]));
511+
rb1.insert(data[i]);
512+
}
513+
514+
// Insert 999 more random entries:
515+
for (int i = 0; i < 999; i++) {
516+
rb1.insert(RandomData());
517+
}
518+
// Sanity check to make sure the filter isn't just filling up:
519+
nHits = 0;
520+
for (int i = 0; i < DATASIZE; i++) {
521+
if (rb1.contains(data[i]))
522+
++nHits;
523+
}
524+
// Expect about 5 false positives, more than 100 means
525+
// something is definitely broken.
526+
BOOST_TEST_MESSAGE("RollingBloomFilter got " << nHits << " false positives (~5 expected)");
527+
BOOST_CHECK(nHits < 100);
528+
529+
// last-1000-entry, 0.1% false positive:
530+
CRollingBloomFilter rb2(1000, 0.001, 0);
531+
for (int i = 0; i < DATASIZE; i++) {
532+
rb2.insert(data[i]);
533+
}
534+
// ... room for all of them:
535+
for (int i = 0; i < DATASIZE; i++) {
536+
BOOST_CHECK(rb2.contains(data[i]));
537+
}
538+
}
539+
462540
BOOST_AUTO_TEST_SUITE_END()

0 commit comments

Comments
 (0)