diff --git a/cuckoofilter.go b/cuckoofilter.go index fef219f..c464d87 100644 --- a/cuckoofilter.go +++ b/cuckoofilter.go @@ -4,7 +4,9 @@ import ( "bytes" "encoding/binary" "fmt" - "math/rand" + "time" + + "github.com/dgryski/go-wyhash" ) // maxCuckooKickouts is the maximum number of times reinsert @@ -18,6 +20,9 @@ type Filter struct { // Bit mask set to len(buckets) - 1. As len(buckets) is always a power of 2, // applying this mask mimics the operation x % len(buckets). bucketIndexMask uint + // rng is a simple pseudo-random number generator that we store locally + // so that we don't have to spend time locking the global RNG. + rng *wyhash.Rng } // NewFilter returns a new cuckoofilter suitable for the given number of elements. @@ -33,10 +38,12 @@ func NewFilter(numElements uint) *Filter { numBuckets = 1 } buckets := make([]bucket, numBuckets) + rng := wyhash.Rng(time.Now().UnixNano()) return &Filter{ buckets: buckets, count: 0, bucketIndexMask: uint(len(buckets) - 1), + rng: &rng, } } @@ -72,7 +79,26 @@ func (cf *Filter) Insert(data []byte) bool { if cf.insert(fp, i2) { return true } - return cf.reinsert(fp, randi(i1, i2)) + return cf.reinsert(fp, cf.Coinflip(i1, i2)) +} + +// Intn returns a pseudo-random int in [0, n); n must be positive. Using % isn't *perfectly* uniform, but it really only matters when n is a +// significant fraction of the rng's range. It's more than good enough for our +// purposes since n is on the order of 10^6 and our rng is 63 bits (10^19); this +// means the bias is on the order of 10^-13. For our use case, that's well below +// the noise floor. +func (cf *Filter) Intn(n int) int { + // mask off the sign bit and reduce modulo n while still in uint64, so the result is non-negative even where int is only 32 bits + return int((cf.rng.Next() & 0x7FFF_FFFF_FFFF_FFFF) % uint64(n)) +} + +// Coinflip returns either i1 or i2 randomly by examining the least significant +// bit of the RNG. 
+func (cf *Filter) Coinflip(i1, i2 uint) uint { + if cf.rng.Next()&0x1 == 0 { + return i1 + } + return i2 } func (cf *Filter) insert(fp fingerprint, i uint) bool { @@ -85,7 +111,7 @@ func (cf *Filter) insert(fp fingerprint, i uint) bool { func (cf *Filter) reinsert(fp fingerprint, i uint) bool { for k := 0; k < maxCuckooKickouts; k++ { - j := rand.Intn(bucketSize) + j := cf.Intn(bucketSize) // Swap fingerprint with bucket entry. cf.buckets[i][j], fp = fp, cf.buckets[i][j] diff --git a/cuckoofilter_test.go b/cuckoofilter_test.go index 08d4ca1..c43b91c 100644 --- a/cuckoofilter_test.go +++ b/cuckoofilter_test.go @@ -3,7 +3,6 @@ package cuckoo import ( "bufio" "fmt" - "math" "math/rand" "os" "testing" @@ -11,13 +10,6 @@ import ( "github.com/google/go-cmp/cmp" ) -// optFloatNear considers float64 as equal if the relative delta is small. -var optFloatNear = cmp.Comparer(func(x, y float64) bool { - delta := math.Abs(x - y) - mean := math.Abs(x+y) / 2.0 - return delta/mean < 0.00001 -}) - func TestInsertion(t *testing.T) { cf := NewFilter(1000000) fd, err := os.Open("/usr/share/dict/words") @@ -257,3 +249,44 @@ func TestEncodeDecode(t *testing.T) { t.Errorf("Decode = %v, want %v, encoded = %v", got, cf, encoded) } } + +func TestFilter_Coinflip(t *testing.T) { + cf := NewFilter(8) + yes := 0 + for i := 0; i < 1000000; i++ { + if cf.Coinflip(0, 1) == 1 { + yes++ + } + } + // See below -- we're checking that we're within 1% of the expected value. + if yes < 495000 || yes > 505000 { + t.Errorf("yes: %d, expected 500000", yes) + } +} + +func TestFilter_Intn(t *testing.T) { + const tries = 1_000_000 + for _, n := range []int{10, 100, 1000} { + t.Run(fmt.Sprintf("n=%d", n), func(t *testing.T) { + cf := NewFilter(8) + yes := 0 + buckets := make([]int, n) + for i := 0; i < tries; i++ { + x := cf.Intn(n) + buckets[x]++ + if x == 0 { + yes++ + } + } + // this is a rectangular distribution, so the expected value is + // tries / n. We expect the actual value to be within 10% of that. 
+ // This means that the test will randomly fail occasionally, but it + // should be rare. + expected := tries / n + if yes < (expected*90)/100 || yes > (expected*110)/100 { + t.Errorf("yes: %d, expected between %d and %d", yes, (expected*90)/100, (expected*110)/100) + t.Log(buckets) + } + }) + } +} diff --git a/go.mod b/go.mod index f261456..95ed942 100644 --- a/go.mod +++ b/go.mod @@ -4,5 +4,6 @@ go 1.15 require ( github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165 + github.com/dgryski/go-wyhash v0.0.0-20191203203029-c4841ae36371 github.com/google/go-cmp v0.5.9 ) diff --git a/go.sum b/go.sum index aabbb23..ab99de0 100644 --- a/go.sum +++ b/go.sum @@ -1,4 +1,6 @@ github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165 h1:BS21ZUJ/B5X2UVUbczfmdWH7GapPWAhxcMsDnjJTU1E= github.com/dgryski/go-metro v0.0.0-20200812162917-85c65e2d0165/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw= +github.com/dgryski/go-wyhash v0.0.0-20191203203029-c4841ae36371 h1:bz5ApY1kzFBvw3yckuyRBCtqGvprWrKswYK468nm+Gs= +github.com/dgryski/go-wyhash v0.0.0-20191203203029-c4841ae36371/go.mod h1:/ENMIO1SQeJ5YQeUWWpbX8f+bS8INHrrhFjXgEqi4LA= github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= diff --git a/util.go b/util.go index 00f6309..e84c414 100644 --- a/util.go +++ b/util.go @@ -2,19 +2,11 @@ package cuckoo import ( "encoding/binary" - "math/rand" + "math/bits" metro "github.com/dgryski/go-metro" ) -// randi returns either i1 or i2 randomly. 
-func randi(i1, i2 uint) uint { - if rand.Int31()%2 == 0 { - return i1 - } - return i2 -} - func getAltIndex(fp fingerprint, i uint, bucketIndexMask uint) uint { b := make([]byte, 2) binary.LittleEndian.PutUint16(b, uint16(fp)) @@ -40,13 +32,5 @@ func getIndexAndFingerprint(data []byte, bucketIndexMask uint) (uint, fingerprin } func getNextPow2(n uint64) uint { - n-- - n |= n >> 1 - n |= n >> 2 - n |= n >> 4 - n |= n >> 8 - n |= n >> 16 - n |= n >> 32 - n++ - return uint(n) + return uint(1 << bits.Len64(n-1)) /* bits.Len64(n-1) is the bit length of n-1, so for n >= 1 this is the smallest power of two >= n; the result is 0 when the shift reaches the width of uint (n == 0 wraps n-1 to all ones, giving a 64-bit shift), same as the removed bit-smearing code */ }