Skip to content

Commit 8d4969c

Browse files
RaduBerinde and lemire
authored and committed
Allow slice reuse when building multiple fuse filters
This commit introduces `BinaryFuseBuilder` which allows reuse of all allocated slices to avoid GC overhead. A single builder can be used with different fingerprint sizes. Fixes #45
1 parent 8e33d15 commit 8d4969c

File tree

4 files changed

+122
-18
lines changed

4 files changed

+122
-18
lines changed

README.md

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,10 @@ type BinaryFuse8 struct {
6464

6565
When constructing the filter, you should ensure that there are not too many duplicate keys for best results.
6666

67-
# Generic (8-bit, 16-bit, 32-bit)
67+
## Generic (8-bit, 16-bit, 32-bit)
6868

6969
By default, we use 8-bit fingerprints which provide a 0.4% false positive rate. Some users might want to reduce
70-
this false positive rate at the expensive of more memory usage. For this purpose, we provide a generic type
70+
this false positive rate at the expense of more memory usage. For this purpose, we provide a generic type
7171
(`NewBinaryFuse[T]`).
7272

7373
```Go
@@ -80,6 +80,18 @@ The 32-bit fingerprints are provided but not recommended. Most users will want t
8080
The Binary Fuse filters use about 9 bits per key in the 8-bit case and 18 bits per key in the 16-bit case,
8181
for sufficiently large sets (hundreds of thousands of keys). There is more per-key memory usage when the set is smaller.
8282

83+
## Memory reuse for repeated builds
84+
85+
When building many filters, memory can be reused (reducing allocation and GC
86+
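overhead) with a `BinaryFuseBuilder`. The `Fingerprints` slice of each returned filter is backed by the builder's buffer and may be overwritten by a later build with the same builder, so clone it if the filter must outlive that build: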
87+
```Go
88+
var builder xorfilter.BinaryFuseBuilder
89+
for {
90+
filter8, _ := xorfilter.BuildBinaryFuse[uint8](&builder, keys)
91+
filter16, _ := xorfilter.BuildBinaryFuse[uint16](&builder, keys)
92+
...
93+
}
94+
```
8395

8496
# Implementations of xor filters in other programming languages
8597

binaryfusefilter.go

Lines changed: 74 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"errors"
55
"math"
66
"math/bits"
7+
"unsafe"
78
)
89

910
type Unsigned interface {
@@ -20,26 +21,57 @@ type BinaryFuse[T Unsigned] struct {
2021
Fingerprints []T
2122
}
2223

23-
// NewBinaryFuse fills the filter with provided keys. For best results,
24-
// the caller should avoid having too many duplicated keys.
24+
// NewBinaryFuse creates a binary fuse filter with provided keys. For best
25+
// results, the caller should avoid having too many duplicated keys.
26+
//
27+
// The function can mutate the given keys slice to remove duplicates.
28+
//
2529
// The function may return an error if the set is empty.
2630
func NewBinaryFuse[T Unsigned](keys []uint64) (*BinaryFuse[T], error) {
31+
var b BinaryFuseBuilder
32+
filter, err := BuildBinaryFuse[T](&b, keys)
33+
if err != nil {
34+
return nil, err
35+
}
36+
return &filter, nil
37+
}
38+
39+
// BinaryFuseBuilder can be used to reuse memory allocations across multiple
40+
// BinaryFuse builds.
41+
type BinaryFuseBuilder struct {
42+
alone reusableBuffer
43+
t2hash reusableBuffer
44+
reverseOrder reusableBuffer
45+
t2count reusableBuffer
46+
reverseH reusableBuffer
47+
startPos reusableBuffer
48+
fingerprints reusableBuffer
49+
}
50+
51+
// BuildBinaryFuse creates a binary fuse filter with provided keys, reusing
52+
// buffers from the BinaryFuseBuilder if possible. For best results, the caller
53+
// should avoid having too many duplicated keys.
54+
//
55+
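// The function can mutate the given keys slice to remove duplicates.
//
// The returned filter's Fingerprints slice is backed by the builder's buffer,
// so a later build with the same builder may overwrite it; clone the slice if
// the filter must remain valid after the builder is reused.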
56+
//
57+
// The function may return an error if the set is empty.
58+
func BuildBinaryFuse[T Unsigned](b *BinaryFuseBuilder, keys []uint64) (BinaryFuse[T], error) {
2759
size := uint32(len(keys))
28-
filter := &BinaryFuse[T]{}
29-
filter.initializeParameters(size)
60+
var filter BinaryFuse[T]
61+
filter.initializeParameters(b, size)
3062
rngcounter := uint64(1)
3163
filter.Seed = splitmix64(&rngcounter)
3264
capacity := uint32(len(filter.Fingerprints))
3365

34-
alone := make([]uint32, capacity)
66+
alone := reuseBuffer[uint32](&b.alone, int(capacity))
3567
// the lowest 2 bits are the h index (0, 1, or 2)
3668
// so we only have 6 bits for counting;
3769
// but that's sufficient
38-
t2count := make([]T, capacity)
39-
reverseH := make([]T, size)
70+
t2count := reuseBuffer[T](&b.t2count, int(capacity))
71+
reverseH := reuseBuffer[T](&b.reverseH, int(size))
4072

41-
t2hash := make([]uint64, capacity)
42-
reverseOrder := make([]uint64, size+1)
73+
t2hash := reuseBuffer[uint64](&b.t2hash, int(capacity))
74+
reverseOrder := reuseBuffer[uint64](&b.reverseOrder, int(size+1))
4375
reverseOrder[size] = 1
4476

4577
// the array h0, h1, h2, h0, h1, h2
@@ -50,16 +82,16 @@ func NewBinaryFuse[T Unsigned](keys []uint64) (*BinaryFuse[T], error) {
5082
for {
5183
iterations += 1
5284
if iterations > MaxIterations {
53-
// The probability of this happening is lower than the
54-
// the cosmic-ray probability (i.e., a cosmic ray corrupts your system).
55-
return nil, errors.New("too many iterations")
85+
// The probability of this happening is lower than the cosmic-ray
86+
// probability (i.e., a cosmic ray corrupts your system).
87+
return BinaryFuse[T]{}, errors.New("too many iterations")
5688
}
5789

5890
blockBits := 1
5991
for (1 << blockBits) < filter.SegmentCount {
6092
blockBits += 1
6193
}
62-
startPos := make([]uint, 1<<blockBits)
94+
startPos := reuseBuffer[uint](&b.startPos, 1<<blockBits)
6395
for i := range startPos {
6496
// important: we do not want i * size to overflow!!!
6597
startPos[i] = uint((uint64(i) * uint64(size)) >> blockBits)
@@ -216,7 +248,7 @@ func NewBinaryFuse[T Unsigned](keys []uint64) (*BinaryFuse[T], error) {
216248
return filter, nil
217249
}
218250

219-
func (filter *BinaryFuse[T]) initializeParameters(size uint32) {
251+
func (filter *BinaryFuse[T]) initializeParameters(b *BinaryFuseBuilder, size uint32) {
220252
arity := uint32(3)
221253
filter.SegmentLength = calculateSegmentLength(arity, size)
222254
if filter.SegmentLength > 262144 {
@@ -238,7 +270,7 @@ func (filter *BinaryFuse[T]) initializeParameters(size uint32) {
238270
}
239271
arrayLength = (filter.SegmentCount + arity - 1) * filter.SegmentLength
240272
filter.SegmentCountLength = filter.SegmentCount * filter.SegmentLength
241-
filter.Fingerprints = make([]T, arrayLength)
273+
filter.Fingerprints = reuseBuffer[T](&b.fingerprints, int(arrayLength))
242274
}
243275

244276
func (filter *BinaryFuse[T]) mod3(x T) T {
@@ -292,3 +324,30 @@ func calculateSizeFactor(arity uint32, size uint32) float64 {
292324
return 2.0
293325
}
294326
}
327+
328+
// reusableBuffer allows reuse of a backing buffer to avoid allocations for
329+
// slices of integers.
330+
type reusableBuffer struct {
331+
buf []uint64
332+
}
333+
334+
type integer interface {
335+
~int | ~int8 | ~int16 | ~int32 | ~int64 | ~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64
336+
}
337+
338+
// reuseBuffer returns a zeroed slice of the given size, reusing the last buffer
339+
// if possible.
340+
func reuseBuffer[T integer](b *reusableBuffer, size int) []T {
341+
const sizeOfUint64 = 8
342+
// Our backing buffer is a []uint64. Figure out how many uint64s we need
343+
// to back a []T of the requested size.
344+
bufSize := int((uintptr(size)*unsafe.Sizeof(T(0)) + sizeOfUint64 - 1) / sizeOfUint64)
345+
if cap(b.buf) >= bufSize {
346+
clear(b.buf[:bufSize])
347+
} else {
348+
// We need to allocate a new buffer. Increase by at least 25% to amortize
349+
// allocations; this is what append() does for large enough slices.
350+
b.buf = make([]uint64, max(bufSize, cap(b.buf)+cap(b.buf)/4))
351+
}
352+
return unsafe.Slice((*T)(unsafe.Pointer(unsafe.SliceData(b.buf))), size)
353+
}

binaryfusefilter_test.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@ package xorfilter
33
import (
44
"fmt"
55
"math/rand/v2"
6+
"slices"
67
"testing"
78

89
"github.com/cespare/xxhash/v2"
910
"github.com/stretchr/testify/assert"
11+
"github.com/stretchr/testify/require"
1012
)
1113

1214
const NUM_KEYS = 1e6
@@ -329,7 +331,37 @@ func TestBinaryFuseN_Issue35(t *testing.T) {
329331
if !e {
330332
panic(i)
331333
}
334+
}
335+
}
336+
}
332337

338+
func TestBinaryFuseBuilder(t *testing.T) {
339+
// Verify that repeated builds with the same builder create the exact same
340+
// filter as using NewBinaryFuse.
341+
var bld BinaryFuseBuilder
342+
for i := 0; i < 100; i++ {
343+
n := 1 + rand.IntN(1<<rand.IntN(20))
344+
keys := make([]uint64, n)
345+
for j := range keys {
346+
keys[j] = rand.Uint64()
347+
}
348+
switch rand.IntN(3) {
349+
case 0:
350+
crossCheckFuseBuilder[uint8](t, &bld, keys)
351+
case 1:
352+
crossCheckFuseBuilder[uint16](t, &bld, keys)
353+
case 2:
354+
crossCheckFuseBuilder[uint32](t, &bld, keys)
333355
}
334356
}
335357
}
358+
359+
func crossCheckFuseBuilder[T Unsigned](t *testing.T, bld *BinaryFuseBuilder, keys []uint64) {
360+
t.Helper()
361+
filter, err := BuildBinaryFuse[T](bld, slices.Clone(keys))
362+
require.NoError(t, err)
363+
expected, err := NewBinaryFuse[T](keys)
364+
require.NoError(t, err)
365+
_ = expected
366+
require.Equal(t, *expected, filter)
367+
}

xorfilter.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,8 @@ func scanCount(Qi []keyindex, setsi []xorset) ([]keyindex, int) {
9999
return Qi, QiSize
100100
}
101101

102-
// The maximum number of iterations allowed before the populate function returns an error
102+
// MaxIterations is the maximum number of iterations allowed before the populate
103+
// function returns an error.
103104
var MaxIterations = 1024
104105

105106
// Populate fills the filter with provided keys. For best results,

0 commit comments

Comments
 (0)