update old bloom and readme

dsa0x · dsa0x · commit bb1795c72724 · 2022-02-06T15:46:53.000+01:00
diff --git a/bloom_old.go b/bloom_old.go
@@ -4,9 +4,7 @@ import (
 	"fmt"
 	"log"
 	"math"
-	"os"
-
-	"github.com/edsrzf/mmap-go"
+	"unsafe"
 )
 
 var ErrKeyNotFound = fmt.Errorf("Key not found")
@@ -29,9 +27,8 @@ type BloomFilter2 struct {
 	count int
 
 	// the bit array
-	bit_array []bool
-	memFile   *os.File
-	mem       mmap.MMap
+	bit_array []uint8
+	byteSize  int
 
 	// m is the number bits per slice(hashFn)
 	m int
@@ -40,28 +37,28 @@ type BloomFilter2 struct {
 	seeds []int64
 }
 
-// NewBloom creates a new bloom filter.
+// NewBloom2 creates a new bloom filter in-memory
 // err_rate is the desired false positive rate. e.g. 0.1 error rate implies 1 in 1000
 //
 // capacity is the number of entries intended to be added to the filter
 //
 // database is the persistent store to attach to the filter. can be nil.
-func NewBloom2(err_rate float64, capacity int, database Store) *BloomFilter2 {
-	if err_rate <= 0 || err_rate >= 1 {
+func NewBloom2(opts *BloomOptions) *BloomFilter2 {
+	if opts.Err_rate <= 0 || opts.Err_rate >= 1 {
 		panic("Error rate must be between 0 and 1")
 	}
-	if capacity <= 0 {
+	if opts.Capacity <= 0 {
 		panic("Capacity must be greater than 0")
 	}
 
 	// number of hash functions (k)
-	numHashFn := int(math.Ceil(math.Log2(1.0 / err_rate)))
+	numHashFn := int(math.Ceil(math.Log2(1.0 / opts.Err_rate)))
 
 	//ln22 = ln2^2
 	ln22 := math.Pow(math.Ln2, 2)
 
 	// M
-	bit_width := int((float64(capacity) * math.Abs(math.Log(err_rate)) / ln22))
+	bit_width := int((float64(opts.Capacity) * math.Abs(math.Log(opts.Err_rate)) / ln22))
 
 	//m
 	bits_per_slice := bit_width / numHashFn
@@ -71,14 +68,22 @@ func NewBloom2(err_rate float64, capacity int, database Store) *BloomFilter2 {
 		seeds[i] = int64((i + 1) << 16)
 	}
 
+	var b byte
+	byteSize := int(unsafe.Sizeof(&b))
+
+	// bit_array is indexed per byte, so it needs only bit_width/byteSize entries; shrink after m is derived from the full bit count
+	bit_width /= byteSize
+	bit_width += byteSize // add extra 1 byte to ensure we have a full byte at the end
+
 	return &BloomFilter2{
-		err_rate:  err_rate,
-		capacity:  capacity,
+		err_rate:  opts.Err_rate,
+		capacity:  opts.Capacity,
 		bit_width: bit_width,
-		bit_array: make([]bool, bit_width),
+		bit_array: make([]uint8, bit_width),
 		m:         bits_per_slice,
 		seeds:     seeds,
-		db:        database,
+		db:        opts.Database,
+		byteSize:  byteSize,
 	}
 }
 
@@ -92,7 +97,8 @@ func (bf *BloomFilter2) Add(key, val []byte) {
 	}
 
 	for i := 0; i < len(indices); i++ {
-		bf.bit_array[indices[i]] = true
+		idx, mask := bf.getBitIndexN(indices[i])
+		bf.bit_array[idx] |= mask
 	}
 	bf.count++
 
@@ -105,7 +111,17 @@ func (bf *BloomFilter2) Add(key, val []byte) {
 // Find checks if the key exists in the bloom filter
 func (bf *BloomFilter2) Contains(key []byte) bool {
 	indices := bf.candidates(string(key))
-	return arrEvery(indices, bf.bit_array)
+
+	// the key may be present only if every one of its candidate bits is set
+	for _, pos := range indices {
+		byteIdx, mask := bf.getBitIndexN(pos)
+
+		// a cleared bit under the mask means Add never set it for this key
+		if bf.bit_array[byteIdx]&mask == 0 {
+			return false
+		}
+	}
+	return true
 }
 
 // Get Gets the key from the underlying persistent store
@@ -131,21 +147,18 @@ func (bf *BloomFilter2) hasStore() bool {
 	return bf.db != nil && bf.db.isReady()
 }
 
-// every checks if each index in the indices array has a value of 1 in the bit array
-func arrEvery(indices []uint64, bits []bool) bool {
-	allExists := true
-	for _, idx := range indices {
-		if !bits[idx] {
-			allExists = false
-			return allExists
-		}
-	}
-	return allExists
+// getBitIndexN maps bit position idx to the index of the byte that holds it
+// in bit_array and a one-bit mask selecting it, most significant bit first.
+func (bf *BloomFilter2) getBitIndexN(idx uint64) (uint64, byte) {
+	width := uint64(bf.byteSize) // bits used per entry of bit_array
+	// integer ops only: no int64 round trip through divmod, no math.Pow per lookup
+	mask := byte(1) << (width - 1 - idx%width) // width 8: 128 >> (idx % 8)
+	return idx / width, mask
 }
 
 // candidates uses the hash function to return all index candidates of the given key
 func (bf *BloomFilter2) candidates(key string) []uint64 {
-	var res []uint64
+	res := make([]uint64, 0, len(bf.seeds))
 	for i, seed := range bf.seeds {
 		hash := getHash(key, seed)
 		// each hash produces an index over m for its respective slice.
diff --git a/cmd/main.go b/cmd/main.go
@@ -2,15 +2,19 @@ package main
 
 import (
 	"fmt"
+	"log"
+	"os"
 	"runtime"
 	"time"
 
 	"github.com/dsa0x/sprout"
+	bolt "go.etcd.io/bbolt"
 )
 
 func main() {
-	num := 20_000
-	// main2(num / 10)
+	num := 20_000_000
+	// div := num / 10
+	// main2(num)
 	// return
 	opts := &sprout.BloomOptions{
 		Err_rate: 0.001,
@@ -25,7 +29,11 @@ func main() {
 
 	for i := 0; i < num-1; i++ {
 		bf.Add([]byte(fmt.Sprintf("%d", i)), []byte("bar"))
-		fmt.Println(i+1, bf.Contains([]byte(fmt.Sprintf("%d", i+1))))
+		// if i%div == 0 {
+		// 	time.Sleep(time.Second * 3)
+		// 	fmt.Println(i, "added")
+		// }
+		// fmt.Println(i+1, bf.Contains([]byte(fmt.Sprintf("%d", i+1))))
 	}
 	fmt.Println(bf.Contains([]byte("foo")))
 	fmt.Println(bf.Contains([]byte("bar")))
@@ -47,6 +55,7 @@ func main3(num int) {
 
 	bf := sprout.NewBloom(opts)
 	defer bf.Close()
+	PrintMemUsage()
 
 }
 
@@ -67,6 +76,55 @@ func main2(num int) {
 	fmt.Println("Added", num*10, "elements in", time.Since(start))
 }
 
+// main4 times putting num entries into a bolt bucket inside one write
+// transaction, then writes a copy of the database to storebolt.db via
+// tx.WriteTo. The transaction is rolled back on return, never committed.
+func main4(num int) {
+	db, err := bolt.Open("store.db", 0600, nil)
+	if err != nil {
+		panic(err)
+	}
+	defer db.Close() // deferred first, so it runs after the rollback below
+
+	err = db.Update(func(tx *bolt.Tx) error {
+		_, err := tx.CreateBucketIfNotExists([]byte("store.name"))
+		return err
+	})
+	if err != nil {
+		panic(err)
+	}
+
+	w, err := os.OpenFile("storebolt.db", os.O_RDWR|os.O_CREATE, 0600)
+	if err != nil {
+		panic(err)
+	}
+	defer w.Close()
+	// defer os.Remove("storebolt.db")
+
+	start := time.Now()
+	tx, err := db.Begin(true)
+	if err != nil {
+		panic(err)
+	}
+	defer tx.Rollback()
+	size := tx.Size()
+
+	b := tx.Bucket([]byte("store.name"))
+	for i := 0; i < num; i++ {
+		if err := b.Put([]byte{byte(i)}, []byte("bar")); err != nil {
+			panic(err)
+		}
+	}
+	elapsed := time.Since(start) // taken before the snapshot so only the puts are timed
+
+	// Write the snapshot before returning: a goroutine could still be inside
+	// WriteTo when the deferred Rollback closes the transaction.
+	if _, err := tx.WriteTo(w); err != nil {
+		log.Println("Error writing snapshot to storebolt.db:", err)
+	}
+	fmt.Println("Added", num, "elements in", elapsed, "bytes=", size)
+}
+
 func PrintMemUsage() {
 	var m runtime.MemStats
 	runtime.ReadMemStats(&m)
diff --git a/cmd/main_test.go b/cmd/main_test.go
@@ -10,20 +10,59 @@ import (
 	"go.etcd.io/bbolt"
 )
 
-func Benchmark_NewBloom(b *testing.B) {
+// Benchmark_InitializeBloom creates one filter sized for b.N entries, then
+// closes it and removes the file at opts.Path; no entries are added.
+func Benchmark_InitializeBloom(b *testing.B) {
 	b.ReportAllocs()
 	b.ResetTimer()
 	opts := &sprout.BloomOptions{
-		Err_rate: 0.01,
+		Err_rate: 0.001,
 		Path:     "/tmp/bloom.db",
 		Capacity: b.N,
 	}
 	bf := sprout.NewBloom(opts)
 
+	// nothing runs after this point, so clean up directly rather than deferring
+	bf.Close()
+	os.Remove(opts.Path)
+}
+// Benchmark_NewBloom measures Add on a filter sized for b.N entries.
+func Benchmark_NewBloom(b *testing.B) {
+	opts := &sprout.BloomOptions{
+		Err_rate: 0.001,
+		Path:     "/tmp/bloom.db",
+		Capacity: b.N,
+	}
+	bf := sprout.NewBloom(opts)
+	// register cleanup up front so it still runs if a later call panics
+	defer func() {
+		bf.Close()
+		os.Remove(opts.Path)
+	}()
+	n := 0
+	b.ReportAllocs()
+	b.ResetTimer() // start timing here so filter construction is excluded
 	for i := 0; i < b.N; i++ {
-		bf.Add([]byte{byte(i)}, []byte("bar"))
+		bf.Add([]byte{byte(n)}, []byte("bar"))
+		n++
+	}
+}
+func Benchmark_NewBloomFind(b *testing.B) {
+	b.ReportAllocs()
+	b.ResetTimer()
+	opts := &sprout.BloomOptions{
+		Err_rate: 0.001,
+		Path:     "/tmp/bloom.db",
+		Capacity: b.N,
 	}
+	bf := sprout.NewBloom(opts)
+
 	n := 0
+	for i := 0; i < b.N; i++ {
+		bf.Add([]byte{byte(n)}, []byte("bar"))
+		n++
+	}
+	n = 0
 	for i := 0; i < b.N; i++ {
 		bf.Contains([]byte{byte(n)})
 		n++
@@ -39,13 +78,20 @@ func Benchmark_NewBloom2(b *testing.B) {
 	b.ReportAllocs()
 	b.ResetTimer()
 
-	bf := sprout.NewBloom2(0.001, b.N, nil)
+	opts := &sprout.BloomOptions{
+		Err_rate: 0.001,
+		Path:     "/tmp/bloom.db",
+		Capacity: b.N,
+	}
+	bf := sprout.NewBloom2(opts)
 	defer bf.Close()
 
+	n := 0
 	for i := 0; i < b.N; i++ {
-		bf.Add([]byte{byte(i)}, []byte("bar"))
+		bf.Add([]byte{byte(n)}, []byte("bar"))
+		n++
 	}
-	n := 0
+	n = 0
 	for i := 0; i < b.N; i++ {
 		bf.Contains([]byte{byte(n)})
 		n++
diff --git a/readme.md b/readme.md
@@ -1,13 +1,23 @@
 ### Sprout
 
-A bloom filter is a probabilistic data structure that is used to determine if an element is present in a set. Bloom filters are fast and space efficient. Bloom filters allow for false positives, but mitigate the probability with an expected false positive rate. An error rate of 0.001 implies that the probability of a false positive is 1 in 1000.
+A bloom filter is a probabilistic data structure that is used to determine if an element is present in a set. Bloom filters are fast and space efficient. They allow for false positives, but mitigate the probability with an expected false positive rate. An error rate of 0.001 implies that the probability of a false positive is 1 in 1000.
 
-Sprout implements a bloom filter in Go, while using boltdb and badgerdb as optional in-memory persistent storage. Sprout writes the bloom filter to a memory-mapped file, and reads it from disk when needed.
+To fulfil the false positive rate, bloom filters are initialized with a capacity. The capacity is the number of elements that can be inserted into the bloom filter, and this cannot be changed.
+
+Sprout implements a bloom filter in Go, while using boltdb and badgerdb as optional in-memory persistent storage. Sprout writes the bloom filter to a memory-mapped file.
 
 Sprout also implement a scalable bloom filter described in a paper written by [P. Almeida, C.Baquero, N. Preguiça, D. Hutchison](https://haslab.uminho.pt/cbm/files/dbloom.pdf).
 
 A scalable bloom filter allows you to grow the filter beyond the initial filter capacity, while preserving the desired false positive rate.
 
+### Memory Usage
+
+Bloom filters are space efficient, as they store only a fixed-size bit array rather than the elements themselves. For a filter with a capacity of 20,000,000 and an error rate of 0.001, the storage size is approximately 34MB. That implies approximately 1.78 bytes (~14 bits) per element.
+The number of bits per element follows from the error rate, which also determines the number of hash functions; the total size is that figure multiplied by the capacity.
+
+**Scalable Bloom Filters**
+A scalable bloom filter initialized with a capacity of 2,000,000 and an error rate of 0.001 uses approximately 37.3MB of total storage once it has grown to a capacity of 20,000,000.
+
 ### Installation
 
 ```shell