Skip to content

Commit 96e7fbd

Browse files
MB-69881: Improved APIs and perf optimizations for vector search (#2270)
- Use a `bitset` to track eligible documents instead of a slice of `N uint64s`, reducing memory usage from `8N bytes` to `N/8 bytes` per segment (up to `64×` reduction) and improving cache locality. - Pass an iterator over eligible documents that iterates the bitset directly, allowing direct translation into a bitset of eligible vector IDs in the storage layer and eliminating the need for a separate slice intermediary. - Fix garbage creation in the `UnadornedPostingsIterator`, which previously allocated a temporary struct per `Next()` call to wrap a doc number and satisfy the `Postings` interface; the iterator now returns a single reusable struct (one-time allocation), consistent with how the `PostingsIterator` works in the storage layer. - Avoid unnecessary `BytesRead` statistics computation when executing searches in no-scoring mode, removing redundant work as a micro-optimization. --------- Co-authored-by: Abhinav Dangeti <[email protected]>
1 parent 11c2008 commit 96e7fbd

File tree

14 files changed

+156
-63
lines changed

14 files changed

+156
-63
lines changed

analysis/analyzer/custom/custom.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ func convertInterfaceSliceToStringSlice(interfaceSlice []interface{}, objType st
140140
if ok {
141141
stringSlice[i] = stringObj
142142
} else {
143-
return nil, fmt.Errorf(objType + " name must be a string")
143+
return nil, fmt.Errorf("%s name must be a string", objType)
144144
}
145145
}
146146

analysis/datetime/iso/iso.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ func letterCounter(layout string, idx int) int {
118118
}
119119

120120
func invalidFormatError(character byte, count int) error {
121-
return fmt.Errorf("invalid format string, unknown format specifier: " + strings.Repeat(string(character), count))
121+
return fmt.Errorf("invalid format string, unknown format specifier: %s", strings.Repeat(string(character), count))
122122
}
123123

124124
func parseISOString(layout string) (string, error) {
@@ -146,7 +146,7 @@ func parseISOString(layout string) (string, error) {
146146
// second text literal delimiter
147147
if idx == len(layout) {
148148
// text literal delimiter not found error
149-
return "", fmt.Errorf("invalid format string, expected text literal delimiter: " + string(textLiteralDelimiter))
149+
return "", fmt.Errorf("invalid format string, expected text literal delimiter: %s", string(textLiteralDelimiter))
150150
}
151151
// increment idx to skip the second text literal delimiter
152152
idx++

cmd/bleve/cmd/registry.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,12 +71,12 @@ var registryCmd = &cobra.Command{
7171
func printType(label string, types, instances []string) {
7272
sort.Strings(types)
7373
sort.Strings(instances)
74-
fmt.Printf(label + " Types:\n")
74+
fmt.Printf("%s Types:\n", label)
7575
for _, name := range types {
7676
fmt.Printf("\t%s\n", name)
7777
}
7878
fmt.Println()
79-
fmt.Printf(label + " Instances:\n")
79+
fmt.Printf("%s Instances:\n", label)
8080
for _, name := range instances {
8181
fmt.Printf("\t%s\n", name)
8282
}

go.mod

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,18 @@
11
module github.com/blevesearch/bleve/v2
22

3-
go 1.23
4-
5-
toolchain go1.23.9
3+
go 1.24
64

75
require (
86
github.com/RoaringBitmap/roaring/v2 v2.4.5
97
github.com/bits-and-blooms/bitset v1.22.0
10-
github.com/blevesearch/bleve_index_api v1.2.9-0.20250929185838-e1be6a8cc229
8+
github.com/blevesearch/bleve_index_api v1.2.12-0.20260109154621-f19a6d6af728
119
github.com/blevesearch/geo v0.2.4
12-
github.com/blevesearch/go-faiss v1.0.25
10+
github.com/blevesearch/go-faiss v1.0.27
1311
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475
1412
github.com/blevesearch/go-porterstemmer v1.0.3
1513
github.com/blevesearch/goleveldb v1.0.1
1614
github.com/blevesearch/gtreap v0.1.1
17-
github.com/blevesearch/scorch_segment_api/v2 v2.3.10
15+
github.com/blevesearch/scorch_segment_api/v2 v2.3.14-0.20260109154938-b56b54c737df
1816
github.com/blevesearch/segment v0.9.1
1917
github.com/blevesearch/snowball v0.6.1
2018
github.com/blevesearch/snowballstem v0.9.0
@@ -26,7 +24,8 @@ require (
2624
github.com/blevesearch/zapx/v13 v13.4.2
2725
github.com/blevesearch/zapx/v14 v14.4.2
2826
github.com/blevesearch/zapx/v15 v15.4.2
29-
github.com/blevesearch/zapx/v16 v16.2.5-0.20251215174251-3f2bc83c91c0
27+
github.com/blevesearch/zapx/v16 v16.2.8
28+
github.com/blevesearch/zapx/v17 v17.0.0-20260112205515-7d8cac80436c
3029
github.com/couchbase/moss v0.2.0
3130
github.com/golang/protobuf v1.3.2
3231
github.com/spf13/cobra v1.8.1

go.sum

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,12 @@ github.com/RoaringBitmap/roaring/v2 v2.4.5/go.mod h1:FiJcsfkGje/nZBZgCu0ZxCPOKD/
33
github.com/bits-and-blooms/bitset v1.12.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
44
github.com/bits-and-blooms/bitset v1.22.0 h1:Tquv9S8+SGaS3EhyA+up3FXzmkhxPGjQQCkcs2uw7w4=
55
github.com/bits-and-blooms/bitset v1.22.0/go.mod h1:7hO7Gc7Pp1vODcmWvKMRA9BNmbv6a/7QIWpPxHddWR8=
6-
github.com/blevesearch/bleve_index_api v1.2.9-0.20250929185838-e1be6a8cc229 h1:q0hzsKqukWjLO5MEahNWP994XvbY1B2ZSzuM/Vfhx/A=
7-
github.com/blevesearch/bleve_index_api v1.2.9-0.20250929185838-e1be6a8cc229/go.mod h1:rKQDl4u51uwafZxFrPD1R7xFOwKnzZW7s/LSeK4lgo0=
6+
github.com/blevesearch/bleve_index_api v1.2.12-0.20260109154621-f19a6d6af728 h1:qFnvr+SqVOCbhMl5sVynhuwVkv1yrc7Vhrn8lVdw1nU=
7+
github.com/blevesearch/bleve_index_api v1.2.12-0.20260109154621-f19a6d6af728/go.mod h1:xvd48t5XMeeioWQ5/jZvgLrV98flT2rdvEJ3l/ki4Ko=
88
github.com/blevesearch/geo v0.2.4 h1:ECIGQhw+QALCZaDcogRTNSJYQXRtC8/m8IKiA706cqk=
99
github.com/blevesearch/geo v0.2.4/go.mod h1:K56Q33AzXt2YExVHGObtmRSFYZKYGv0JEN5mdacJJR8=
10-
github.com/blevesearch/go-faiss v1.0.25 h1:lel1rkOUGbT1CJ0YgzKwC7k+XH0XVBHnCVWahdCXk4U=
11-
github.com/blevesearch/go-faiss v1.0.25/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
10+
github.com/blevesearch/go-faiss v1.0.27 h1:7cBImYDDQ82WJd5RUZ1ie6zXztCsC73W94ZzwOjkatk=
11+
github.com/blevesearch/go-faiss v1.0.27/go.mod h1:OMGQwOaRRYxrmeNdMrXJPvVx8gBnvE5RYrr0BahNnkk=
1212
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:kDy+zgJFJJoJYBvdfBSiZYBbdsUL0XcjHYWezpQBGPA=
1313
github.com/blevesearch/go-metrics v0.0.0-20201227073835-cf1acfcdf475/go.mod h1:9eJDeqxJ3E7WnLebQUlPD7ZjSce7AnDb9vjGmMCbD0A=
1414
github.com/blevesearch/go-porterstemmer v1.0.3 h1:GtmsqID0aZdCSNiY8SkuPJ12pD4jI+DdXTAn4YRcHCo=
@@ -20,8 +20,8 @@ github.com/blevesearch/gtreap v0.1.1/go.mod h1:QaQyDRAT51sotthUWAH4Sj08awFSSWzgY
2020
github.com/blevesearch/mmap-go v1.0.2/go.mod h1:ol2qBqYaOUsGdm7aRMRrYGgPvnwLe6Y+7LMvAB5IbSA=
2121
github.com/blevesearch/mmap-go v1.0.4 h1:OVhDhT5B/M1HNPpYPBKIEJaD0F3Si+CrEKULGCDPWmc=
2222
github.com/blevesearch/mmap-go v1.0.4/go.mod h1:EWmEAOmdAS9z/pi/+Toxu99DnsbhG1TIxUoRmJw/pSs=
23-
github.com/blevesearch/scorch_segment_api/v2 v2.3.10 h1:Yqk0XD1mE0fDZAJXTjawJ8If/85JxnLd8v5vG/jWE/s=
24-
github.com/blevesearch/scorch_segment_api/v2 v2.3.10/go.mod h1:Z3e6ChN3qyN35yaQpl00MfI5s8AxUJbpTR/DL8QOQ+8=
23+
github.com/blevesearch/scorch_segment_api/v2 v2.3.14-0.20260109154938-b56b54c737df h1:gBuVkzZLUpGJGnCBRgY0ruZVjppD7WaQLeHZei7QQnU=
24+
github.com/blevesearch/scorch_segment_api/v2 v2.3.14-0.20260109154938-b56b54c737df/go.mod h1:f8fXitmMpzgNziIMqUlpTrfPxVVDN8at9k7POEohvJU=
2525
github.com/blevesearch/segment v0.9.1 h1:+dThDy+Lvgj5JMxhmOVlgFfkUtZV2kw49xax4+jTfSU=
2626
github.com/blevesearch/segment v0.9.1/go.mod h1:zN21iLm7+GnBHWTao9I+Au/7MBiL8pPFtJBJTsk6kQw=
2727
github.com/blevesearch/snowball v0.6.1 h1:cDYjn/NCH+wwt2UdehaLpr2e4BwLIjN4V/TdLsL+B5A=
@@ -44,8 +44,10 @@ github.com/blevesearch/zapx/v14 v14.4.2 h1:2SGHakVKd+TrtEqpfeq8X+So5PShQ5nW6GNxT
4444
github.com/blevesearch/zapx/v14 v14.4.2/go.mod h1:rz0XNb/OZSMjNorufDGSpFpjoFKhXmppH9Hi7a877D8=
4545
github.com/blevesearch/zapx/v15 v15.4.2 h1:sWxpDE0QQOTjyxYbAVjt3+0ieu8NCE0fDRaFxEsp31k=
4646
github.com/blevesearch/zapx/v15 v15.4.2/go.mod h1:1pssev/59FsuWcgSnTa0OeEpOzmhtmr/0/11H0Z8+Nw=
47-
github.com/blevesearch/zapx/v16 v16.2.5-0.20251215174251-3f2bc83c91c0 h1:HZssgO3JqQFBTrrBTb5LWkfGlOhUdPzUjsPHQoKEjhg=
48-
github.com/blevesearch/zapx/v16 v16.2.5-0.20251215174251-3f2bc83c91c0/go.mod h1:Rti/REtuuMmzwsI8/C/qIzRaEoSK/wiFYw5e5ctUKKs=
47+
github.com/blevesearch/zapx/v16 v16.2.8 h1:SlnzF0YGtSlrsOE3oE7EgEX6BIepGpeqxs1IjMbHLQI=
48+
github.com/blevesearch/zapx/v16 v16.2.8/go.mod h1:murSoCJPCk25MqURrcJaBQ1RekuqSCSfMjXH4rHyA14=
49+
github.com/blevesearch/zapx/v17 v17.0.0-20260112205515-7d8cac80436c h1:OfYh0noLbJmt6k2tqYlnSU3zMZEJbFfbSClSGG59A/M=
50+
github.com/blevesearch/zapx/v17 v17.0.0-20260112205515-7d8cac80436c/go.mod h1:ybWwo00MGrNJuFDnl9smEBVUCZmNANf0+E/QVBmfBTs=
4951
github.com/couchbase/ghistogram v0.1.0 h1:b95QcQTCzjTUocDXp/uMgSNQi8oj1tGwnJ4bODWZnps=
5052
github.com/couchbase/ghistogram v0.1.0/go.mod h1:s1Jhy76zqfEecpNWJfWUiKZookAFaiGOEoyzgHt9i7k=
5153
github.com/couchbase/moss v0.2.0 h1:VCYrMzFwEryyhRSeI+/b3tRBSeTpi/8gn5Kf6dxqn+o=

index/scorch/optimize.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,5 +395,7 @@ func (i *IndexSnapshot) unadornedTermFieldReader(
395395
recycle: false,
396396
// signal downstream that this is a special unadorned termFieldReader
397397
unadorned: true,
398+
// unadorned TFRs do not require bytes read tracking
399+
updateBytesRead: false,
398400
}
399401
}

index/scorch/optimize_knn.go

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,6 @@ type OptimizeVR struct {
3434
totalCost uint64
3535
// maps field to vector readers
3636
vrs map[string][]*IndexSnapshotVectorReader
37-
// if at least one of the vector readers requires filtered kNN.
38-
requiresFiltering bool
3937
}
4038

4139
// This setting _MUST_ only be changed during init and not after.
@@ -79,8 +77,7 @@ func (o *OptimizeVR) Finish() error {
7977
wg.Done()
8078
}()
8179
for field, vrs := range o.vrs {
82-
vecIndex, err := segment.InterpretVectorIndex(field,
83-
o.requiresFiltering, origSeg.deleted)
80+
vecIndex, err := segment.InterpretVectorIndex(field, origSeg.deleted)
8481
if err != nil {
8582
errorsM.Lock()
8683
errors = append(errors, err)
@@ -103,7 +100,7 @@ func (o *OptimizeVR) Finish() error {
103100
// kNN search.
104101
if vr.eligibleSelector != nil {
105102
pl, err = vecIndex.SearchWithFilter(vr.vector, vr.k,
106-
vr.eligibleSelector.SegmentEligibleDocs(index), vr.searchParams)
103+
vr.eligibleSelector.SegmentEligibleDocuments(index), vr.searchParams)
107104
} else {
108105
pl, err = vecIndex.Search(vr.vector, vr.k, vr.searchParams)
109106
}
@@ -157,9 +154,6 @@ func (s *IndexSnapshotVectorReader) VectorOptimize(ctx context.Context,
157154
return octx, nil
158155
}
159156
o.ctx = ctx
160-
if !o.requiresFiltering {
161-
o.requiresFiltering = s.eligibleSelector != nil
162-
}
163157

164158
if o.snapshot != s.snapshot {
165159
o.invokeSearcherEndCallback()

index/scorch/segment_plugin.go

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import (
2828
zapv14 "github.com/blevesearch/zapx/v14"
2929
zapv15 "github.com/blevesearch/zapx/v15"
3030
zapv16 "github.com/blevesearch/zapx/v16"
31+
zapv17 "github.com/blevesearch/zapx/v17"
3132
)
3233

3334
// SegmentPlugin represents the essential functions required by a package to plug in
@@ -73,7 +74,8 @@ var defaultSegmentPlugin SegmentPlugin
7374

7475
func init() {
7576
ResetSegmentPlugins()
76-
RegisterSegmentPlugin(&zapv16.ZapPlugin{}, true)
77+
RegisterSegmentPlugin(&zapv17.ZapPlugin{}, true)
78+
RegisterSegmentPlugin(&zapv16.ZapPlugin{}, false)
7779
RegisterSegmentPlugin(&zapv15.ZapPlugin{}, false)
7880
RegisterSegmentPlugin(&zapv14.ZapPlugin{}, false)
7981
RegisterSegmentPlugin(&zapv13.ZapPlugin{}, false)

index/scorch/snapshot_index.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -671,6 +671,8 @@ func (is *IndexSnapshot) TermFieldReader(ctx context.Context, term []byte, field
671671
rv.incrementBytesRead(bytesRead - prevBytesReadItr)
672672
}
673673
}
674+
// ONLY update the bytes read value beyond this point for this TFR if scoring is enabled
675+
rv.updateBytesRead = rv.includeFreq || rv.includeNorm || rv.includeTermVectors
674676
atomic.AddUint64(&is.parent.stats.TotTermSearchersStarted, uint64(1))
675677
return rv, nil
676678
}

index/scorch/snapshot_index_tfr.go

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,10 @@ type IndexSnapshotTermFieldReader struct {
5151
bytesRead uint64
5252
ctx context.Context
5353
unadorned bool
54+
// flag to indicate whether to increment our bytesRead
55+
// value after creation of the TFR while iterating our postings
56+
// lists
57+
updateBytesRead bool
5458
}
5559

5660
func (i *IndexSnapshotTermFieldReader) incrementBytesRead(val uint64) {
@@ -83,10 +87,15 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in
8387
if rv == nil {
8488
rv = &index.TermFieldDoc{}
8589
}
90+
var prevBytesRead uint64
8691
// find the next hit
8792
for i.segmentOffset < len(i.iterators) {
88-
prevBytesRead := i.iterators[i.segmentOffset].BytesRead()
89-
next, err := i.iterators[i.segmentOffset].Next()
93+
// get our current postings iterator
94+
curItr := i.iterators[i.segmentOffset]
95+
if i.updateBytesRead {
96+
prevBytesRead = curItr.BytesRead()
97+
}
98+
next, err := curItr.Next()
9099
if err != nil {
91100
return nil, err
92101
}
@@ -99,13 +108,15 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in
99108

100109
i.currID = rv.ID
101110
i.currPosting = next
102-
// postingsIterators is maintain the bytesRead stat in a cumulative fashion.
103-
// this is because there are chances of having a series of loadChunk calls,
104-
// and they have to be added together before sending the bytesRead at this point
105-
// upstream.
106-
bytesRead := i.iterators[i.segmentOffset].BytesRead()
107-
if bytesRead > prevBytesRead {
108-
i.incrementBytesRead(bytesRead - prevBytesRead)
111+
if i.updateBytesRead {
112+
// postingsIterators maintains the bytesRead stat in a cumulative fashion.
113+
// this is because there are chances of having a series of loadChunk calls,
114+
// and they have to be added together before sending the bytesRead at this point
115+
// upstream.
116+
bytesRead := curItr.BytesRead()
117+
if bytesRead > prevBytesRead {
118+
i.incrementBytesRead(bytesRead - prevBytesRead)
119+
}
109120
}
110121
return rv, nil
111122
}

0 commit comments

Comments
 (0)