Skip to content

Commit 030bb3c

Browse files
MB-65473: Refactor and Optimize Pre-Filtered Vector Search (#2169)
- Refactor pre-filtered vector search to enhance performance and reduce memory footprint. - Replace the current bitmap-based approach for calculating segment local document numbers with a more direct method, where the local document numbers are mapped directly to the segment ID during the execution of the eligible collector. - Requires: - blevesearch/bleve_index_api#63 - blevesearch/bleve_index_api#66 - blevesearch/zapx#317 - blevesearch/go-faiss#41 - blevesearch/faiss#49 --------- Co-authored-by: Abhinav Dangeti <abhinav@couchbase.com>
1 parent fef722d commit 030bb3c

File tree

12 files changed

+145
-225
lines changed

12 files changed

+145
-225
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ require (
77
github.com/bits-and-blooms/bitset v1.12.0
88
github.com/blevesearch/bleve_index_api v1.1.12
99
github.com/blevesearch/geo v0.1.20
10-
github.com/blevesearch/go-faiss v1.0.24
10+
github.com/blevesearch/go-faiss v1.0.25
1111
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475
1212
github.com/blevesearch/go-porterstemmer v1.0.3
1313
github.com/blevesearch/goleveldb v1.0.1

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ github.com/blevesearch/bleve_index_api v1.1.12 h1:P4bw9/G/5rulOF7SJ9l4FsDoo7UFJ+
66
github.com/blevesearch/bleve_index_api v1.1.12/go.mod h1:PbcwjIcRmjhGbkS/lJCpfgVSMROV6TRubGGAODaK1W8=
77
github.com/blevesearch/geo v0.1.20 h1:paaSpu2Ewh/tn5DKn/FB5SzvH0EWupxHEIwbCk/QPqM=
88
github.com/blevesearch/geo v0.1.20/go.mod h1:DVG2QjwHNMFmjo+ZgzrIq2sfCh6rIHzy9d9d0B59I6w=
9-
github.com/blevesearch/go-faiss v1.0.24 h1:K79IvKjoKHdi7FdiXEsAhxpMuns0x4fM0BO93bW5jLI=
10-
github.com/blevesearch/go-faiss v1.0.24/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
9+
github.com/blevesearch/go-faiss v1.0.25 h1:lel1rkOUGbT1CJ0YgzKwC7k+XH0XVBHnCVWahdCXk4U=
10+
github.com/blevesearch/go-faiss v1.0.25/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
1111
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:kDy+zgJFJJoJYBvdfBSiZYBbdsUL0XcjHYWezpQBGPA=
1212
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475/go.mod h1:9eJDeqxJ3E7WnLebQUlPD7ZjSce7AnDb9vjGmMCbD0A=
1313
github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo=

index/scorch/optimize_knn.go

Lines changed: 8 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ import (
2323
"sync"
2424
"sync/atomic"
2525

26-
"github.com/RoaringBitmap/roaring"
2726
"github.com/blevesearch/bleve/v2/search"
2827
index "github.com/blevesearch/bleve_index_api"
2928
segment_api "github.com/blevesearch/scorch_segment_api/v2"
@@ -65,27 +64,6 @@ func (o *OptimizeVR) Finish() error {
6564
var errorsM sync.Mutex
6665
var errors []error
6766

68-
var snapshotGlobalDocNums map[int]*roaring.Bitmap
69-
var eligibleDocIDsMap map[string]map[int]*roaring.Bitmap
70-
fields := make([]string, len(o.vrs))
71-
for field := range o.vrs {
72-
fields = append(fields, field)
73-
}
74-
75-
if o.requiresFiltering {
76-
snapshotGlobalDocNums = o.snapshot.globalDocNums()
77-
eligibleDocIDsMap = make(map[string]map[int]*roaring.Bitmap)
78-
for _, field := range fields {
79-
vrs := o.vrs[field]
80-
eligibleDocIDsMap[field] = make(map[int]*roaring.Bitmap)
81-
for index, vr := range vrs {
82-
if vr.eligibleDocIDs != nil && len(vr.eligibleDocIDs) > 0 {
83-
eligibleDocIDsMap[field][index] = vr.getEligibleDocIDs()
84-
}
85-
}
86-
}
87-
}
88-
8967
defer o.invokeSearcherEndCallback()
9068

9169
wg := sync.WaitGroup{}
@@ -100,8 +78,7 @@ func (o *OptimizeVR) Finish() error {
10078
<-semaphore // Release the semaphore slot
10179
wg.Done()
10280
}()
103-
for _, field := range fields {
104-
vrs := o.vrs[field]
81+
for field, vrs := range o.vrs {
10582
vecIndex, err := segment.InterpretVectorIndex(field,
10683
o.requiresFiltering, origSeg.deleted)
10784
if err != nil {
@@ -114,34 +91,19 @@ func (o *OptimizeVR) Finish() error {
11491
// update the vector index size as a meta value in the segment snapshot
11592
vectorIndexSize := vecIndex.Size()
11693
origSeg.cachedMeta.updateMeta(field, vectorIndexSize)
117-
for vrIdx, vr := range vrs {
94+
for _, vr := range vrs {
11895
var pl segment_api.VecPostingsList
11996
var err error
12097

12198
// for each VR, populate postings list and iterators
12299
// by passing the obtained vector index and getting similar vectors.
123100

124-
// Only applies to filtered kNN.
125-
if vr.eligibleDocIDs != nil && len(vr.eligibleDocIDs) > 0 {
126-
eligibleVectorInternalIDs := eligibleDocIDsMap[field][vrIdx]
127-
eligibleVectorInternalIDsClone := eligibleVectorInternalIDs.Clone()
128-
if snapshotGlobalDocNums != nil {
129-
// Only the eligible documents belonging to this segment
130-
// will get filtered out.
131-
// There is no way to determine which doc belongs to which segment
132-
eligibleVectorInternalIDsClone.And(snapshotGlobalDocNums[index])
133-
}
134-
135-
eligibleLocalDocNums := make([]uint64, 0)
136-
// get the (segment-)local document numbers
137-
for _, docNum := range eligibleVectorInternalIDsClone.ToArray() {
138-
localDocNum := o.snapshot.localDocNumFromGlobal(index,
139-
uint64(docNum))
140-
eligibleLocalDocNums = append(eligibleLocalDocNums, localDocNum)
141-
}
142-
101+
// check if the vector reader is configured to use a pre-filter
102+
// to filter out ineligible documents before performing
103+
// kNN search.
104+
if vr.eligibleSelector != nil {
143105
pl, err = vecIndex.SearchWithFilter(vr.vector, vr.k,
144-
eligibleLocalDocNums, vr.searchParams)
106+
vr.eligibleSelector.SegmentEligibleDocs(index), vr.searchParams)
145107
} else {
146108
pl, err = vecIndex.Search(vr.vector, vr.k, vr.searchParams)
147109
}
@@ -195,7 +157,7 @@ func (s *IndexSnapshotVectorReader) VectorOptimize(ctx context.Context,
195157
}
196158
o.ctx = ctx
197159
if !o.requiresFiltering {
198-
o.requiresFiltering = len(s.eligibleDocIDs) > 0
160+
o.requiresFiltering = s.eligibleSelector != nil
199161
}
200162

201163
if o.snapshot != s.snapshot {

index/scorch/snapshot_index.go

Lines changed: 10 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -473,31 +473,7 @@ func (is *IndexSnapshot) segmentIndexAndLocalDocNumFromGlobal(docNum uint64) (in
473473
return is.offsets[x] > docNum
474474
}) - 1
475475

476-
localDocNum := is.localDocNumFromGlobal(segmentIndex, docNum)
477-
return int(segmentIndex), localDocNum
478-
}
479-
480-
// This function returns the local docnum, given the segment index and global docnum
481-
func (is *IndexSnapshot) localDocNumFromGlobal(segmentIndex int, docNum uint64) uint64 {
482-
return docNum - is.offsets[segmentIndex]
483-
}
484-
485-
// Function to return a mapping of the segment index to the live global doc nums
486-
// in the segment of the specified index snapshot.
487-
func (is *IndexSnapshot) globalDocNums() map[int]*roaring.Bitmap {
488-
if len(is.segment) == 0 {
489-
return nil
490-
}
491-
492-
segmentIndexGlobalDocNums := make(map[int]*roaring.Bitmap)
493-
494-
for i := range is.segment {
495-
segmentIndexGlobalDocNums[i] = roaring.NewBitmap()
496-
for _, localDocNum := range is.segment[i].DocNumbersLive().ToArray() {
497-
segmentIndexGlobalDocNums[i].Add(localDocNum + uint32(is.offsets[i]))
498-
}
499-
}
500-
return segmentIndexGlobalDocNums
476+
return int(segmentIndex), docNum - is.offsets[segmentIndex]
501477
}
502478

503479
func (is *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) {
@@ -518,6 +494,15 @@ func (is *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) {
518494
return string(v), nil
519495
}
520496

497+
func (is *IndexSnapshot) segmentIndexAndLocalDocNum(id index.IndexInternalID) (int, uint64, error) {
498+
docNum, err := docInternalToNumber(id)
499+
if err != nil {
500+
return 0, 0, err
501+
}
502+
segIdx, localDocNum := is.segmentIndexAndLocalDocNumFromGlobal(docNum)
503+
return segIdx, localDocNum, nil
504+
}
505+
521506
func (is *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err error) {
522507
// FIXME could be done more efficiently directly, but reusing for simplicity
523508
tfr, err := is.TermFieldReader(nil, []byte(id), "_id", false, false, false)

index/scorch/snapshot_index_vr.go

Lines changed: 4 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ import (
2424
"fmt"
2525
"reflect"
2626

27-
"github.com/RoaringBitmap/roaring"
2827
"github.com/blevesearch/bleve/v2/size"
2928
index "github.com/blevesearch/bleve_index_api"
3029
segment_api "github.com/blevesearch/scorch_segment_api/v2"
@@ -51,32 +50,8 @@ type IndexSnapshotVectorReader struct {
5150
currID index.IndexInternalID
5251
ctx context.Context
5352

54-
searchParams json.RawMessage
55-
56-
// The following fields are only applicable for vector readers which will
57-
// process pre-filtered kNN queries.
58-
eligibleDocIDs []index.IndexInternalID
59-
}
60-
61-
// Function to convert the internal IDs of the eligible documents to a type suitable
62-
// for addition to a bitmap.
63-
// Useful to have the eligible doc IDs in a bitmap to leverage the fast intersection
64-
// (AND) operations. Eg. finding the eligible doc IDs present in a segment.
65-
func (i *IndexSnapshotVectorReader) getEligibleDocIDs() *roaring.Bitmap {
66-
res := roaring.NewBitmap()
67-
if len(i.eligibleDocIDs) > 0 {
68-
internalDocIDs := make([]uint32, 0, len(i.eligibleDocIDs))
69-
// converts the doc IDs to uint32 and returns
70-
for _, eligibleDocInternalID := range i.eligibleDocIDs {
71-
internalDocID, err := docInternalToNumber(index.IndexInternalID(eligibleDocInternalID))
72-
if err != nil {
73-
continue
74-
}
75-
internalDocIDs = append(internalDocIDs, uint32(internalDocID))
76-
}
77-
res.AddMany(internalDocIDs)
78-
}
79-
return res
53+
searchParams json.RawMessage
54+
eligibleSelector index.EligibleDocumentSelector
8055
}
8156

8257
func (i *IndexSnapshotVectorReader) Size() int {
@@ -134,17 +109,8 @@ func (i *IndexSnapshotVectorReader) Advance(ID index.IndexInternalID,
134109
preAlloced *index.VectorDoc) (*index.VectorDoc, error) {
135110

136111
if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 {
137-
var i2 index.VectorReader
138-
var err error
139-
140-
if len(i.eligibleDocIDs) > 0 {
141-
i2, err = i.snapshot.VectorReaderWithFilter(i.ctx, i.vector, i.field,
142-
i.k, i.searchParams, i.eligibleDocIDs)
143-
} else {
144-
i2, err = i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k,
145-
i.searchParams)
146-
}
147-
112+
i2, err := i.snapshot.VectorReader(i.ctx, i.vector, i.field, i.k,
113+
i.searchParams, i.eligibleSelector)
148114
if err != nil {
149115
return nil, err
150116
}

index/scorch/snapshot_vector_index.go

Lines changed: 37 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -20,21 +20,23 @@ package scorch
2020
import (
2121
"context"
2222
"encoding/json"
23+
"fmt"
2324

2425
index "github.com/blevesearch/bleve_index_api"
2526
segment_api "github.com/blevesearch/scorch_segment_api/v2"
2627
)
2728

2829
func (is *IndexSnapshot) VectorReader(ctx context.Context, vector []float32,
29-
field string, k int64, searchParams json.RawMessage) (
30+
field string, k int64, searchParams json.RawMessage,
31+
eligibleSelector index.EligibleDocumentSelector) (
3032
index.VectorReader, error) {
31-
3233
rv := &IndexSnapshotVectorReader{
33-
vector: vector,
34-
field: field,
35-
k: k,
36-
snapshot: is,
37-
searchParams: searchParams,
34+
vector: vector,
35+
field: field,
36+
k: k,
37+
snapshot: is,
38+
searchParams: searchParams,
39+
eligibleSelector: eligibleSelector,
3840
}
3941

4042
if rv.postings == nil {
@@ -43,34 +45,41 @@ func (is *IndexSnapshot) VectorReader(ctx context.Context, vector []float32,
4345
if rv.iterators == nil {
4446
rv.iterators = make([]segment_api.VecPostingsIterator, len(is.segment))
4547
}
46-
4748
// initialize postings and iterators within the OptimizeVR's Finish()
48-
4949
return rv, nil
5050
}
5151

52-
func (is *IndexSnapshot) VectorReaderWithFilter(ctx context.Context, vector []float32,
53-
field string, k int64, searchParams json.RawMessage,
54-
filterIDs []index.IndexInternalID) (
55-
index.VectorReader, error) {
52+
// eligibleDocumentSelector is used to filter out documents that are eligible for
53+
// the KNN search from a pre-filter query.
54+
type eligibleDocumentSelector struct {
55+
// segment ID -> segment local doc nums
56+
eligibleDocNums map[int][]uint64
57+
is *IndexSnapshot
58+
}
5659

57-
rv := &IndexSnapshotVectorReader{
58-
vector: vector,
59-
field: field,
60-
k: k,
61-
snapshot: is,
62-
searchParams: searchParams,
63-
eligibleDocIDs: filterIDs,
64-
}
60+
// SegmentEligibleDocs returns the list of eligible local doc numbers for the given segment.
61+
func (eds *eligibleDocumentSelector) SegmentEligibleDocs(segmentID int) []uint64 {
62+
return eds.eligibleDocNums[segmentID]
63+
}
6564

66-
if rv.postings == nil {
67-
rv.postings = make([]segment_api.VecPostingsList, len(is.segment))
65+
// AddEligibleDocumentMatch adds a document match to the list of eligible documents.
66+
func (eds *eligibleDocumentSelector) AddEligibleDocumentMatch(id index.IndexInternalID) error {
67+
if eds.is == nil {
68+
return fmt.Errorf("eligibleDocumentSelector is not initialized with IndexSnapshot")
6869
}
69-
if rv.iterators == nil {
70-
rv.iterators = make([]segment_api.VecPostingsIterator, len(is.segment))
70+
// Get the segment number and the local doc number for this document.
71+
segIdx, docNum, err := eds.is.segmentIndexAndLocalDocNum(id)
72+
if err != nil {
73+
return err
7174
}
75+
// Add the local doc number to the list of eligible doc numbers for this segment.
76+
eds.eligibleDocNums[segIdx] = append(eds.eligibleDocNums[segIdx], docNum)
77+
return nil
78+
}
7279

73-
// initialize postings and iterators within the OptimizeVR's Finish()
74-
75-
return rv, nil
80+
func (is *IndexSnapshot) NewEligibleDocumentSelector() index.EligibleDocumentSelector {
81+
return &eligibleDocumentSelector{
82+
eligibleDocNums: map[int][]uint64{},
83+
is: is,
84+
}
7685
}

search.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -589,3 +589,13 @@ func (r *SearchRequest) SortFunc() func(data sort.Interface) {
589589

590590
return sort.Sort
591591
}
592+
593+
func isMatchNoneQuery(q query.Query) bool {
594+
_, ok := q.(*query.MatchNoneQuery)
595+
return ok
596+
}
597+
598+
func isMatchAllQuery(q query.Query) bool {
599+
_, ok := q.(*query.MatchAllQuery)
600+
return ok
601+
}

0 commit comments

Comments
 (0)