Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 29 additions & 3 deletions cuckoofilter.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@ import (
"bytes"
"encoding/binary"
"fmt"
"math/rand"
"time"

"github.com/dgryski/go-wyhash"
)

// maxCuckooKickouts is the maximum number of times reinsert
Expand All @@ -18,6 +20,9 @@ type Filter struct {
// Bit mask set to len(buckets) - 1. As len(buckets) is always a power of 2,
// applying this mask mimics the operation x % len(buckets).
bucketIndexMask uint
// rng is a simple pseudo-random number generator that we store locally
// so that we don't have to spend time locking the global RNG.
rng *wyhash.Rng
}

// NewFilter returns a new cuckoofilter suitable for the given number of elements.
Expand All @@ -33,10 +38,12 @@ func NewFilter(numElements uint) *Filter {
numBuckets = 1
}
buckets := make([]bucket, numBuckets)
rng := wyhash.Rng(time.Now().UnixNano())
return &Filter{
buckets: buckets,
count: 0,
bucketIndexMask: uint(len(buckets) - 1),
rng: &rng,
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inline here as &wyhash.Rng(time.Now().UnixNano())

}
}

Expand Down Expand Up @@ -72,7 +79,26 @@ func (cf *Filter) Insert(data []byte) bool {
if cf.insert(fp, i2) {
return true
}
return cf.reinsert(fp, randi(i1, i2))
return cf.reinsert(fp, cf.Coinflip(i1, i2))
}

// Using % isn't *perfectly* uniform, but it really only matters when n is a
// significant fraction of the rng's range. It's more than good enough for our
// purposes since n is on the order of 10^6 and our rng is 63 bits (10^19); this
// means the bias is on the order of 10^-13. For our use case, that's well below
// the noise floor.
func (cf *Filter) Intn(n int) int {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lowercase, no need to make this public

// we need to make sure it's strictly positive, so mask off the sign bit
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Casting to uint would make this more straight-forward.

return int(cf.rng.Next()&0x7FFF_FFFF_FFFF_FFFF) % n
}

// Coinflip returns either i1 or i2 randomly by examining the least significant
// bit of the RNG.
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment nit:

Coinflip returns either i1 or i2 randomly with about equal chance.

The rest is an implementation detail.

func (cf Filter) Coinflip(i1, i2 uint) uint {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lowercase, no need to make this public.

if cf.rng.Next()&0x1 == 0 {
return i1
}
return i2
}

func (cf *Filter) insert(fp fingerprint, i uint) bool {
Expand All @@ -85,7 +111,7 @@ func (cf *Filter) insert(fp fingerprint, i uint) bool {

func (cf *Filter) reinsert(fp fingerprint, i uint) bool {
for k := 0; k < maxCuckooKickouts; k++ {
j := rand.Intn(bucketSize)
j := cf.Intn(bucketSize)
// Swap fingerprint with bucket entry.
cf.buckets[i][j], fp = fp, cf.buckets[i][j]

Expand Down
49 changes: 41 additions & 8 deletions cuckoofilter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,13 @@ package cuckoo
import (
"bufio"
"fmt"
"math"
"math/rand"
"os"
"testing"

"github.com/google/go-cmp/cmp"
)

// optFloatNear considers float64 as equal if the relative delta is small.
// Identical values (including two zeros) are always equal.
var optFloatNear = cmp.Comparer(func(x, y float64) bool {
	// Short-circuit exact matches: for x == y == 0 the relative delta below
	// would be 0/0 = NaN, which compares false against any threshold.
	if x == y {
		return true
	}
	delta := math.Abs(x - y)
	mean := math.Abs(x+y) / 2.0
	return delta/mean < 0.00001
})

func TestInsertion(t *testing.T) {
cf := NewFilter(1000000)
fd, err := os.Open("/usr/share/dict/words")
Expand Down Expand Up @@ -257,3 +249,44 @@ func TestEncodeDecode(t *testing.T) {
t.Errorf("Decode = %v, want %v, encoded = %v", got, cf, encoded)
}
}

// TestFilter_Coinflip checks that Coinflip returns each of its two arguments
// about half of the time.
func TestFilter_Coinflip(t *testing.T) {
	const (
		tries    = 1_000_000
		expected = tries / 2
		// Accept results within 1% of the expected value. For a fair coin the
		// count has a standard deviation of sqrt(tries)/2 = 500, so this is a
		// ten-sigma window; a +/-1000 window would fail roughly one run in
		// twenty purely by chance.
		tolerance = expected / 100
	)
	cf := NewFilter(8)
	yes := 0
	for i := 0; i < tries; i++ {
		if cf.Coinflip(0, 1) == 1 {
			yes++
		}
	}
	if yes < expected-tolerance || yes > expected+tolerance {
		t.Errorf("yes: %d, expected %d +/- %d", yes, expected, tolerance)
	}
}

// TestFilter_Intn checks that Intn(n) stays inside [0, n) and that the value 0
// shows up about as often as a uniform distribution predicts.
func TestFilter_Intn(t *testing.T) {
	const tries = 1_000_000
	for _, n := range []int{10, 100, 1000} {
		t.Run(fmt.Sprintf("n=%d", n), func(t *testing.T) {
			cf := NewFilter(8)
			// Indexing buckets with the result also catches any value outside
			// [0, n): it would panic.
			buckets := make([]int, n)
			for i := 0; i < tries; i++ {
				buckets[cf.Intn(n)]++
			}
			// this is a rectangular distribution, so the expected value is
			// tries / n. We expect the actual value to be within 10% of that.
			// This means that the test will randomly fail occasionally, but it
			// should be rare. Only bucket 0 is checked: testing every bucket
			// would multiply the chance of a spurious failure.
			expected := tries / n
			lo, hi := (expected*90)/100, (expected*110)/100
			if yes := buckets[0]; yes < lo || yes > hi {
				// Report the same bounds that were actually checked.
				t.Errorf("yes: %d, expected between %d and %d", yes, lo, hi)
				t.Log(buckets)
			}
		})
	}
}
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,6 @@ go 1.15

require (
github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165
github.com/dgryski/go-wyhash v0.0.0-20191203203029-c4841ae36371
github.com/google/go-cmp v0.5.9
)
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165 h1:BS21ZUJ/B5X2UVUbczfmdWH7GapPWAhxcMsDnjJTU1E=
github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw=
github.com/dgryski/go-wyhash v0.0.0-20191203203029-c4841ae36371 h1:bz5ApY1kzFBvw3yckuyRBCtqGvprWrKswYK468nm+Gs=
github.com/dgryski/go-wyhash v0.0.0-20191203203029-c4841ae36371/go.mod h1:/ENMIO1SQeJ5YQeUWWpbX8f+bS8INHrrhFjXgEqi4LA=
github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
20 changes: 2 additions & 18 deletions util.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,11 @@ package cuckoo

import (
"encoding/binary"
"math/rand"
"math/bits"

metro "github.com/dgryski/go-metro"
)

// randi picks one of its two arguments at random, using a single draw from
// the global math/rand source: an even draw selects i1, an odd draw selects i2.
func randi(i1, i2 uint) uint {
	choices := [2]uint{i1, i2}
	// Int31 is never negative, so its lowest bit is exactly its parity.
	return choices[rand.Int31()&1]
}

func getAltIndex(fp fingerprint, i uint, bucketIndexMask uint) uint {
b := make([]byte, 2)
binary.LittleEndian.PutUint16(b, uint16(fp))
Expand All @@ -40,13 +32,5 @@ func getIndexAndFingerprint(data []byte, bucketIndexMask uint) (uint, fingerprin
}

// getNextPow2 returns the smallest power of two that is greater than or equal
// to n. It returns 0 when n is 0 and when the result would not fit in the
// return type (n > 1<<63, or a result above the range of a 32-bit uint).
func getNextPow2(n uint64) uint {
	// bits.Len64(n-1) is the number of bits needed to represent n-1, which is
	// the exponent of the next power of two. For n == 0 the subtraction wraps
	// to the maximum uint64, the shift count becomes 64 and the shift yields 0.
	return uint(uint64(1) << bits.Len64(n-1))
}