Skip to content

Commit f68cdd4

Browse files
committed
LUCENE-10375: Write merged vectors to file before building graph (#601)
When merging segments together, the `KnnVectorsWriter` creates a `VectorValues` instance with a merged view of all the segments' vectors. This merged instance is used when constructing the new HNSW graph. Graph building needs random access, and the merged VectorValues support this by mapping from merged ordinals to segments and segment ordinals. This mapping can add significant overhead when building the graph. This change updates the HNSW merging logic to first write the combined segment vectors to a file, then use that the file to build the graph. This helps speed up segment merging, and also lets us simplify `VectorValuesMerger`, which provides the merged view of vector values.
1 parent af3a0bc commit f68cdd4

File tree

7 files changed

+259
-226
lines changed

7 files changed

+259
-226
lines changed

lucene/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,9 @@ Optimizations
128128
* LUCENE-10379: Count directly into the dense values array in FastTaxonomyFacetCounts#countAll.
129129
(Guo Feng, Greg Miller)
130130

131+
* LUCENE-10375: Speed up HNSW vectors merge by first writing combined vector
132+
data to a file. (Julie Tibshirani, Adrien Grand)
133+
131134
Changes in runtime behavior
132135
---------------------
133136

lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java

Lines changed: 62 additions & 177 deletions
Original file line numberDiff line numberDiff line change
@@ -17,19 +17,13 @@
1717

1818
package org.apache.lucene.codecs;
1919

20-
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
21-
2220
import java.io.Closeable;
2321
import java.io.IOException;
2422
import java.util.ArrayList;
25-
import java.util.Arrays;
2623
import java.util.List;
2724
import org.apache.lucene.index.DocIDMerger;
2825
import org.apache.lucene.index.FieldInfo;
2926
import org.apache.lucene.index.MergeState;
30-
import org.apache.lucene.index.RandomAccessVectorValues;
31-
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
32-
import org.apache.lucene.index.VectorSimilarityFunction;
3327
import org.apache.lucene.index.VectorValues;
3428
import org.apache.lucene.search.TopDocs;
3529
import org.apache.lucene.util.Bits;
@@ -48,7 +42,11 @@ public abstract void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectors
4842
/** Called once at the end before close */
4943
public abstract void finish() throws IOException;
5044

51-
/** Merge the vector values from multiple segments, for all fields */
45+
/**
46+
* Merges the segment vectors for all fields. This default implementation delegates to {@link
47+
* #writeField}, passing a {@link KnnVectorsReader} that combines the vector values and ignores
48+
* deleted documents.
49+
*/
5250
public void merge(MergeState mergeState) throws IOException {
5351
for (int i = 0; i < mergeState.fieldInfos.length; i++) {
5452
KnnVectorsReader reader = mergeState.knnVectorsReaders[i];
@@ -57,163 +55,106 @@ public void merge(MergeState mergeState) throws IOException {
5755
reader.checkIntegrity();
5856
}
5957
}
58+
6059
for (FieldInfo fieldInfo : mergeState.mergeFieldInfos) {
6160
if (fieldInfo.hasVectorValues()) {
62-
mergeVectors(fieldInfo, mergeState);
63-
}
64-
}
65-
finish();
66-
}
61+
if (mergeState.infoStream.isEnabled("VV")) {
62+
mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
63+
}
6764

68-
private void mergeVectors(FieldInfo mergeFieldInfo, final MergeState mergeState)
69-
throws IOException {
70-
if (mergeState.infoStream.isEnabled("VV")) {
71-
mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
72-
}
73-
// Create a new VectorValues by iterating over the sub vectors, mapping the resulting
74-
// docids using docMaps in the mergeState.
75-
writeField(
76-
mergeFieldInfo,
77-
new KnnVectorsReader() {
78-
@Override
79-
public long ramBytesUsed() {
80-
return 0;
81-
}
65+
writeField(
66+
fieldInfo,
67+
new KnnVectorsReader() {
68+
@Override
69+
public long ramBytesUsed() {
70+
return 0;
71+
}
8272

83-
@Override
84-
public void close() throws IOException {
85-
throw new UnsupportedOperationException();
86-
}
73+
@Override
74+
public void close() {
75+
throw new UnsupportedOperationException();
76+
}
8777

88-
@Override
89-
public void checkIntegrity() throws IOException {
90-
throw new UnsupportedOperationException();
91-
}
78+
@Override
79+
public void checkIntegrity() {
80+
throw new UnsupportedOperationException();
81+
}
9282

93-
@Override
94-
public VectorValues getVectorValues(String field) throws IOException {
95-
List<VectorValuesSub> subs = new ArrayList<>();
96-
int dimension = -1;
97-
VectorSimilarityFunction similarityFunction = null;
98-
int nonEmptySegmentIndex = 0;
99-
for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
100-
KnnVectorsReader knnVectorsReader = mergeState.knnVectorsReaders[i];
101-
if (knnVectorsReader != null) {
102-
if (mergeFieldInfo != null && mergeFieldInfo.hasVectorValues()) {
103-
int segmentDimension = mergeFieldInfo.getVectorDimension();
104-
VectorSimilarityFunction segmentSimilarityFunction =
105-
mergeFieldInfo.getVectorSimilarityFunction();
106-
if (dimension == -1) {
107-
dimension = segmentDimension;
108-
similarityFunction = mergeFieldInfo.getVectorSimilarityFunction();
109-
} else if (dimension != segmentDimension) {
110-
throw new IllegalStateException(
111-
"Varying dimensions for vector-valued field "
112-
+ mergeFieldInfo.name
113-
+ ": "
114-
+ dimension
115-
+ "!="
116-
+ segmentDimension);
117-
} else if (similarityFunction != segmentSimilarityFunction) {
118-
throw new IllegalStateException(
119-
"Varying similarity functions for vector-valued field "
120-
+ mergeFieldInfo.name
121-
+ ": "
122-
+ similarityFunction
123-
+ "!="
124-
+ segmentSimilarityFunction);
125-
}
126-
VectorValues values = knnVectorsReader.getVectorValues(mergeFieldInfo.name);
127-
if (values != null) {
128-
subs.add(
129-
new VectorValuesSub(nonEmptySegmentIndex++, mergeState.docMaps[i], values));
130-
}
131-
}
83+
@Override
84+
public VectorValues getVectorValues(String field) throws IOException {
85+
return MergedVectorValues.mergeVectorValues(fieldInfo, mergeState);
13286
}
133-
}
134-
return new VectorValuesMerger(subs, mergeState);
135-
}
13687

137-
@Override
138-
public TopDocs search(String field, float[] target, int k, Bits acceptDocs)
139-
throws IOException {
140-
throw new UnsupportedOperationException();
141-
}
142-
});
88+
@Override
89+
public TopDocs search(String field, float[] target, int k, Bits acceptDocs) {
90+
throw new UnsupportedOperationException();
91+
}
92+
});
14393

144-
if (mergeState.infoStream.isEnabled("VV")) {
145-
mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
94+
if (mergeState.infoStream.isEnabled("VV")) {
95+
mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
96+
}
97+
}
14698
}
99+
finish();
147100
}
148101

149102
/** Tracks state of one sub-reader that we are merging */
150103
private static class VectorValuesSub extends DocIDMerger.Sub {
151104

152105
final VectorValues values;
153-
final int segmentIndex;
154-
int count;
155106

156-
VectorValuesSub(int segmentIndex, MergeState.DocMap docMap, VectorValues values) {
107+
VectorValuesSub(MergeState.DocMap docMap, VectorValues values) {
157108
super(docMap);
158109
this.values = values;
159-
this.segmentIndex = segmentIndex;
160110
assert values.docID() == -1;
161111
}
162112

163113
@Override
164114
public int nextDoc() throws IOException {
165-
int docId = values.nextDoc();
166-
if (docId != NO_MORE_DOCS) {
167-
// Note: this does count deleted docs since they are present in the to-be-merged segment
168-
++count;
169-
}
170-
return docId;
115+
return values.nextDoc();
171116
}
172117
}
173118

174-
/**
175-
* View over multiple VectorValues supporting iterator-style access via DocIdMerger. Maintains a
176-
* reverse ordinal mapping for documents having values in order to support random access by dense
177-
* ordinal.
178-
*/
179-
private static class VectorValuesMerger extends VectorValues
180-
implements RandomAccessVectorValuesProducer {
119+
/** View over multiple VectorValues supporting iterator-style access via DocIdMerger. */
120+
public static class MergedVectorValues extends VectorValues {
181121
private final List<VectorValuesSub> subs;
182122
private final DocIDMerger<VectorValuesSub> docIdMerger;
183-
private final int[] ordBase;
184123
private final int cost;
185-
private int size;
124+
private final int size;
186125

187126
private int docId;
188127
private VectorValuesSub current;
189-
/* For each doc with a vector, record its ord in the segments being merged. This enables random
190-
* access into the unmerged segments using the ords from the merged segment.
191-
*/
192-
private int[] ordMap;
193-
private int ord;
194128

195-
VectorValuesMerger(List<VectorValuesSub> subs, MergeState mergeState) throws IOException {
129+
/** Returns a merged view over all the segment's {@link VectorValues}. */
130+
public static MergedVectorValues mergeVectorValues(FieldInfo fieldInfo, MergeState mergeState)
131+
throws IOException {
132+
assert fieldInfo != null && fieldInfo.hasVectorValues();
133+
134+
List<VectorValuesSub> subs = new ArrayList<>();
135+
for (int i = 0; i < mergeState.knnVectorsReaders.length; i++) {
136+
KnnVectorsReader knnVectorsReader = mergeState.knnVectorsReaders[i];
137+
if (knnVectorsReader != null) {
138+
VectorValues values = knnVectorsReader.getVectorValues(fieldInfo.name);
139+
if (values != null) {
140+
subs.add(new VectorValuesSub(mergeState.docMaps[i], values));
141+
}
142+
}
143+
}
144+
return new MergedVectorValues(subs, mergeState);
145+
}
146+
147+
private MergedVectorValues(List<VectorValuesSub> subs, MergeState mergeState)
148+
throws IOException {
196149
this.subs = subs;
197150
docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
198151
int totalCost = 0, totalSize = 0;
199152
for (VectorValuesSub sub : subs) {
200153
totalCost += sub.values.cost();
201154
totalSize += sub.values.size();
202155
}
203-
/* This size includes deleted docs, but when we iterate over docs here (nextDoc())
204-
* we skip deleted docs. So we sneakily update this size once we observe that iteration is complete.
205-
* That way by the time we are asked to do random access for graph building, we have a correct size.
206-
*/
207156
cost = totalCost;
208157
size = totalSize;
209-
ordMap = new int[size];
210-
ordBase = new int[subs.size()];
211-
int lastBase = 0;
212-
for (int k = 0; k < subs.size(); k++) {
213-
int size = subs.get(k).values.size();
214-
ordBase[k] = lastBase;
215-
lastBase += size;
216-
}
217158
docId = -1;
218159
}
219160

@@ -227,12 +168,8 @@ public int nextDoc() throws IOException {
227168
current = docIdMerger.next();
228169
if (current == null) {
229170
docId = NO_MORE_DOCS;
230-
/* update the size to reflect the number of *non-deleted* documents seen so we can support
231-
* random access. */
232-
size = ord;
233171
} else {
234172
docId = current.mappedDocID;
235-
ordMap[ord++] = ordBase[current.segmentIndex] + current.count - 1;
236173
}
237174
return docId;
238175
}
@@ -247,11 +184,6 @@ public BytesRef binaryValue() throws IOException {
247184
return current.values.binaryValue();
248185
}
249186

250-
@Override
251-
public RandomAccessVectorValues randomAccess() {
252-
return new MergerRandomAccess();
253-
}
254-
255187
@Override
256188
public int advance(int target) {
257189
throw new UnsupportedOperationException();
@@ -271,52 +203,5 @@ public long cost() {
271203
public int dimension() {
272204
return subs.get(0).values.dimension();
273205
}
274-
275-
class MergerRandomAccess implements RandomAccessVectorValues {
276-
277-
private final List<RandomAccessVectorValues> raSubs;
278-
279-
MergerRandomAccess() {
280-
raSubs = new ArrayList<>(subs.size());
281-
for (VectorValuesSub sub : subs) {
282-
if (sub.values instanceof RandomAccessVectorValuesProducer) {
283-
raSubs.add(((RandomAccessVectorValuesProducer) sub.values).randomAccess());
284-
} else {
285-
throw new IllegalStateException(
286-
"Cannot merge VectorValues without support for random access");
287-
}
288-
}
289-
}
290-
291-
@Override
292-
public int size() {
293-
return size;
294-
}
295-
296-
@Override
297-
public int dimension() {
298-
return VectorValuesMerger.this.dimension();
299-
}
300-
301-
@Override
302-
public float[] vectorValue(int target) throws IOException {
303-
int unmappedOrd = ordMap[target];
304-
int segmentOrd = Arrays.binarySearch(ordBase, unmappedOrd);
305-
if (segmentOrd < 0) {
306-
// get the index of the greatest lower bound
307-
segmentOrd = -2 - segmentOrd;
308-
}
309-
while (segmentOrd < ordBase.length - 1 && ordBase[segmentOrd + 1] == ordBase[segmentOrd]) {
310-
// forward over empty segments which will share the same ordBase
311-
segmentOrd++;
312-
}
313-
return raSubs.get(segmentOrd).vectorValue(unmappedOrd - ordBase[segmentOrd]);
314-
}
315-
316-
@Override
317-
public BytesRef binaryValue(int targetOrd) throws IOException {
318-
throw new UnsupportedOperationException();
319-
}
320-
}
321206
}
322207
}

0 commit comments

Comments
 (0)