Skip to content

Commit bb1795c

Browse files
committed
update old bloom and readme
1 parent c14bb2e commit bb1795c

File tree

4 files changed

+167
-40
lines changed

4 files changed

+167
-40
lines changed

bloom_old.go

Lines changed: 42 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,7 @@ import (
44
"fmt"
55
"log"
66
"math"
7-
"os"
8-
9-
"github.com/edsrzf/mmap-go"
7+
"unsafe"
108
)
119

1210
var ErrKeyNotFound = fmt.Errorf("Key not found")
@@ -29,9 +27,8 @@ type BloomFilter2 struct {
2927
count int
3028

3129
// the bit array
32-
bit_array []bool
33-
memFile *os.File
34-
mem mmap.MMap
30+
bit_array []uint8
31+
byteSize int
3532

3633
// m is the number bits per slice(hashFn)
3734
m int
@@ -40,28 +37,28 @@ type BloomFilter2 struct {
4037
seeds []int64
4138
}
4239

43-
// NewBloom creates a new bloom filter.
40+
// NewBloom2 creates a new bloom filter in-memory
4441
// err_rate is the desired false positive rate, e.g. an error rate of 0.001 implies 1 in 1000
4542
//
4643
// capacity is the number of entries intended to be added to the filter
4744
//
4845
// database is the persistent store to attach to the filter. can be nil.
49-
func NewBloom2(err_rate float64, capacity int, database Store) *BloomFilter2 {
50-
if err_rate <= 0 || err_rate >= 1 {
46+
func NewBloom2(opts *BloomOptions) *BloomFilter2 {
47+
if opts.Err_rate <= 0 || opts.Err_rate >= 1 {
5148
panic("Error rate must be between 0 and 1")
5249
}
53-
if capacity <= 0 {
50+
if opts.Capacity <= 0 {
5451
panic("Capacity must be greater than 0")
5552
}
5653

5754
// number of hash functions (k)
58-
numHashFn := int(math.Ceil(math.Log2(1.0 / err_rate)))
55+
numHashFn := int(math.Ceil(math.Log2(1.0 / opts.Err_rate)))
5956

6057
//ln22 = ln2^2
6158
ln22 := math.Pow(math.Ln2, 2)
6259

6360
// M
64-
bit_width := int((float64(capacity) * math.Abs(math.Log(err_rate)) / ln22))
61+
bit_width := int((float64(opts.Capacity) * math.Abs(math.Log(opts.Err_rate)) / ln22))
6562

6663
//m
6764
bits_per_slice := bit_width / numHashFn
@@ -71,14 +68,22 @@ func NewBloom2(err_rate float64, capacity int, database Store) *BloomFilter2 {
7168
seeds[i] = int64((i + 1) << 16)
7269
}
7370

71+
var b byte
72+
byteSize := int(unsafe.Sizeof(&b))
73+
74+
// we only need bit_width/8 bits, but only after calculating m
75+
bit_width /= byteSize
76+
bit_width += byteSize // add extra 1 byte to ensure we have a full byte at the end
77+
7478
return &BloomFilter2{
75-
err_rate: err_rate,
76-
capacity: capacity,
79+
err_rate: opts.Err_rate,
80+
capacity: opts.Capacity,
7781
bit_width: bit_width,
78-
bit_array: make([]bool, bit_width),
82+
bit_array: make([]uint8, bit_width),
7983
m: bits_per_slice,
8084
seeds: seeds,
81-
db: database,
85+
db: opts.Database,
86+
byteSize: byteSize,
8287
}
8388
}
8489

@@ -92,7 +97,8 @@ func (bf *BloomFilter2) Add(key, val []byte) {
9297
}
9398

9499
for i := 0; i < len(indices); i++ {
95-
bf.bit_array[indices[i]] = true
100+
idx, mask := bf.getBitIndexN(indices[i])
101+
bf.bit_array[idx] |= mask
96102
}
97103
bf.count++
98104

@@ -105,7 +111,17 @@ func (bf *BloomFilter2) Add(key, val []byte) {
105111
// Find checks if the key exists in the bloom filter
106112
func (bf *BloomFilter2) Contains(key []byte) bool {
107113
indices := bf.candidates(string(key))
108-
return arrEvery(indices, bf.bit_array)
114+
115+
for i := 0; i < len(indices); i++ {
116+
idx, mask := bf.getBitIndexN(indices[i])
117+
bit := bf.bit_array[idx]
118+
119+
// check if the mask part of the bit is set
120+
if bit&mask == 0 {
121+
return false
122+
}
123+
}
124+
return true
109125
}
110126

111127
// Get Gets the key from the underlying persistent store
@@ -131,21 +147,18 @@ func (bf *BloomFilter2) hasStore() bool {
131147
return bf.db != nil && bf.db.isReady()
132148
}
133149

134-
// every checks if each index in the indices array has a value of 1 in the bit array
135-
func arrEvery(indices []uint64, bits []bool) bool {
136-
allExists := true
137-
for _, idx := range indices {
138-
if !bits[idx] {
139-
allExists = false
140-
return allExists
141-
}
142-
}
143-
return allExists
150+
// getBitIndexN returns the index and mask for the bit.
151+
func (bf *BloomFilter2) getBitIndexN(idx uint64) (uint64, byte) {
152+
quot, rem := divmod(int64(idx), int64(bf.byteSize))
153+
154+
byteSizeInDec := int64(math.Pow(2, float64(bf.byteSize)-1))
155+
shift := byte((byteSizeInDec >> rem)) // 128 >> 1,2..
156+
return uint64(quot), shift
144157
}
145158

146159
// candidates uses the hash function to return all index candidates of the given key
147160
func (bf *BloomFilter2) candidates(key string) []uint64 {
148-
var res []uint64
161+
res := make([]uint64, 0, len(bf.seeds))
149162
for i, seed := range bf.seeds {
150163
hash := getHash(key, seed)
151164
// each hash produces an index over m for its respective slice.

cmd/main.go

Lines changed: 61 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,19 @@ package main
22

33
import (
44
"fmt"
5+
"log"
6+
"os"
57
"runtime"
68
"time"
79

810
"github.com/dsa0x/sprout"
11+
bolt "go.etcd.io/bbolt"
912
)
1013

1114
func main() {
12-
num := 20_000
13-
// main2(num / 10)
15+
num := 20_000_000
16+
// div := num / 10
17+
// main2(num)
1418
// return
1519
opts := &sprout.BloomOptions{
1620
Err_rate: 0.001,
@@ -25,7 +29,11 @@ func main() {
2529

2630
for i := 0; i < num-1; i++ {
2731
bf.Add([]byte(fmt.Sprintf("%d", i)), []byte("bar"))
28-
fmt.Println(i+1, bf.Contains([]byte(fmt.Sprintf("%d", i+1))))
32+
// if i%div == 0 {
33+
// time.Sleep(time.Second * 3)
34+
// fmt.Println(i, "added")
35+
// }
36+
// fmt.Println(i+1, bf.Contains([]byte(fmt.Sprintf("%d", i+1))))
2937
}
3038
fmt.Println(bf.Contains([]byte("foo")))
3139
fmt.Println(bf.Contains([]byte("bar")))
@@ -47,6 +55,7 @@ func main3(num int) {
4755

4856
bf := sprout.NewBloom(opts)
4957
defer bf.Close()
58+
PrintMemUsage()
5059

5160
}
5261

@@ -67,6 +76,55 @@ func main2(num int) {
6776
fmt.Println("Added", num*10, "elements in", time.Since(start))
6877
}
6978

79+
func main4(num int) {
80+
db, err := bolt.Open("store.db", 0600, nil)
81+
if err != nil {
82+
panic(err)
83+
}
84+
85+
err = db.Update(func(tx *bolt.Tx) error {
86+
_, err := tx.CreateBucketIfNotExists([]byte("store.name"))
87+
return err
88+
})
89+
if err != nil {
90+
panic(err)
91+
}
92+
93+
w, err := os.OpenFile("storebolt.db", os.O_RDWR|os.O_CREATE, 0600)
94+
if err != nil {
95+
panic(err)
96+
}
97+
98+
// defer os.Remove("storebolt.db")
99+
100+
start := time.Now()
101+
tx, err := db.Begin(true)
102+
if err != nil {
103+
panic(err)
104+
}
105+
defer tx.Rollback()
106+
size := tx.Size()
107+
108+
b := tx.Bucket([]byte("store.name"))
109+
110+
for i := 0; i < num; i++ {
111+
b.Put([]byte{byte(i)}, []byte("bar"))
112+
}
113+
114+
// write snapshot to pipe
115+
go func() {
116+
defer w.Close()
117+
_, err := tx.WriteTo(w)
118+
if err != nil {
119+
log.Println("Erroring writing to pipe", err)
120+
}
121+
}()
122+
if err != nil {
123+
panic(err)
124+
}
125+
fmt.Println("Added", num, "elements in", time.Since(start), "bytes=", size)
126+
}
127+
70128
func PrintMemUsage() {
71129
var m runtime.MemStats
72130
runtime.ReadMemStats(&m)

cmd/main_test.go

Lines changed: 52 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,20 +10,59 @@ import (
1010
"go.etcd.io/bbolt"
1111
)
1212

13-
func Benchmark_NewBloom(b *testing.B) {
13+
func Benchmark_InitializeBloom(b *testing.B) {
1414
b.ReportAllocs()
1515
b.ResetTimer()
1616
opts := &sprout.BloomOptions{
17-
Err_rate: 0.01,
17+
Err_rate: 0.001,
1818
Path: "/tmp/bloom.db",
1919
Capacity: b.N,
2020
}
2121
bf := sprout.NewBloom(opts)
2222

23+
defer func() {
24+
bf.Close()
25+
os.Remove(opts.Path)
26+
}()
27+
28+
}
29+
func Benchmark_NewBloom(b *testing.B) {
30+
b.ReportAllocs()
31+
b.ResetTimer()
32+
opts := &sprout.BloomOptions{
33+
Err_rate: 0.001,
34+
Path: "/tmp/bloom.db",
35+
Capacity: b.N,
36+
}
37+
bf := sprout.NewBloom(opts)
38+
n := 0
2339
for i := 0; i < b.N; i++ {
24-
bf.Add([]byte{byte(i)}, []byte("bar"))
40+
bf.Add([]byte{byte(n)}, []byte("bar"))
41+
n++
42+
}
43+
44+
defer func() {
45+
bf.Close()
46+
os.Remove(opts.Path)
47+
}()
48+
49+
}
50+
func Benchmark_NewBloomFind(b *testing.B) {
51+
b.ReportAllocs()
52+
b.ResetTimer()
53+
opts := &sprout.BloomOptions{
54+
Err_rate: 0.001,
55+
Path: "/tmp/bloom.db",
56+
Capacity: b.N,
2557
}
58+
bf := sprout.NewBloom(opts)
59+
2660
n := 0
61+
for i := 0; i < b.N; i++ {
62+
bf.Add([]byte{byte(n)}, []byte("bar"))
63+
n++
64+
}
65+
n = 0
2766
for i := 0; i < b.N; i++ {
2867
bf.Contains([]byte{byte(n)})
2968
n++
@@ -39,13 +78,20 @@ func Benchmark_NewBloom2(b *testing.B) {
3978
b.ReportAllocs()
4079
b.ResetTimer()
4180

42-
bf := sprout.NewBloom2(0.001, b.N, nil)
81+
opts := &sprout.BloomOptions{
82+
Err_rate: 0.001,
83+
Path: "/tmp/bloom.db",
84+
Capacity: b.N,
85+
}
86+
bf := sprout.NewBloom2(opts)
4387
defer bf.Close()
4488

89+
n := 0
4590
for i := 0; i < b.N; i++ {
46-
bf.Add([]byte{byte(i)}, []byte("bar"))
91+
bf.Add([]byte{byte(n)}, []byte("bar"))
92+
n++
4793
}
48-
n := 0
94+
n = 0
4995
for i := 0; i < b.N; i++ {
5096
bf.Contains([]byte{byte(n)})
5197
n++

readme.md

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,23 @@
11
### Sprout
22

3-
A bloom filter is a probabilistic data structure that is used to determine if an element is present in a set. Bloom filters are fast and space efficient. Bloom filters allow for false positives, but mitigate the probability with an expected false positive rate. An error rate of 0.001 implies that the probability of a false positive is 1 in 1000.
3+
A bloom filter is a probabilistic data structure that is used to determine if an element is present in a set. Bloom filters are fast and space efficient. They allow for false positives, but mitigate the probability with an expected false positive rate. An error rate of 0.001 implies that the probability of a false positive is 1 in 1000.
44

5-
Sprout implements a bloom filter in Go, while using boltdb and badgerdb as optional in-memory persistent storage. Sprout writes the bloom filter to a memory-mapped file, and reads it from disk when needed.
5+
To fulfil the false positive rate, bloom filters are initialized with a capacity. The capacity is the number of elements that can be inserted into the bloom filter, and this cannot be changed.
6+
7+
Sprout implements a bloom filter in Go, while using boltdb and badgerdb as optional in-memory persistent storage. Sprout writes the bloom filter to a memory-mapped file.
68

79
Sprout also implements a scalable bloom filter, as described in a paper written by [P. Almeida, C.Baquero, N. Preguiça, D. Hutchison](https://haslab.uminho.pt/cbm/files/dbloom.pdf).
810

911
A scalable bloom filter allows you to grow the filter beyond the initial filter capacity, while preserving the desired false positive rate.
1012

13+
### Memory Usage
14+
15+
Bloom filters are space efficient, as they store only a fixed-size bit array rather than the elements themselves. For a filter with a capacity of 20,000,000 and an error rate of 0.001, the storage size is approximately 34MB. That implies approximately 1.78 bytes (~14 bits) per element.
16+
The number of bits per element follows from the sizing of the filter: the total number of bits is derived from the capacity and the error rate, and the number of hash functions is derived from the error rate.
17+
18+
**Scalable Bloom Filters**
19+
For a scalable bloom filter initialized with a capacity of 2,000,000 and an error rate of 0.001 and then grown to a capacity of 20,000,000, the total storage size is approximately 37.3MB.
20+
1121
### Installation
1222

1323
```shell

0 commit comments

Comments
 (0)