Skip to content

Commit b1b3812

Browse files
committed
LUCENE-10375: Write vectors to file in flush (#617)
In a previous commit, we updated HNSW merge to first write the combined segment vectors to a file, then use that file to build the graph. This commit applies the same strategy to flush, which lets us use the same logic for flush and merge.
1 parent 77ee2a7 commit b1b3812

File tree

4 files changed

+5
-81
lines changed

4 files changed

+5
-81
lines changed

lucene/core/src/java/org/apache/lucene/codecs/KnnVectorsWriter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ public int nextDoc() throws IOException {
117117
}
118118

119119
/** View over multiple VectorValues supporting iterator-style access via DocIdMerger. */
120-
public static class MergedVectorValues extends VectorValues {
120+
private static class MergedVectorValues extends VectorValues {
121121
private final List<VectorValuesSub> subs;
122122
private final DocIDMerger<VectorValuesSub> docIdMerger;
123123
private final int cost;
@@ -127,7 +127,7 @@ public static class MergedVectorValues extends VectorValues {
127127
private VectorValuesSub current;
128128

129129
/** Returns a merged view over all the segment's {@link VectorValues}. */
130-
public static MergedVectorValues mergeVectorValues(FieldInfo fieldInfo, MergeState mergeState)
130+
static MergedVectorValues mergeVectorValues(FieldInfo fieldInfo, MergeState mergeState)
131131
throws IOException {
132132
assert fieldInfo != null && fieldInfo.hasVectorValues();
133133

lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,7 @@ int size() {
354354
}
355355

356356
/** Read the vector values from the index input. This supports both iterated and random access. */
357-
public static class OffHeapVectorValues extends VectorValues
357+
static class OffHeapVectorValues extends VectorValues
358358
implements RandomAccessVectorValues, RandomAccessVectorValuesProducer {
359359

360360
final int dimension;

lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90HnswVectorsWriter.java

Lines changed: 2 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
import org.apache.lucene.codecs.KnnVectorsWriter;
2727
import org.apache.lucene.index.FieldInfo;
2828
import org.apache.lucene.index.IndexFileNames;
29-
import org.apache.lucene.index.MergeState;
3029
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
3130
import org.apache.lucene.index.SegmentWriteState;
3231
import org.apache.lucene.index.VectorSimilarityFunction;
@@ -114,79 +113,16 @@ public final class Lucene90HnswVectorsWriter extends KnnVectorsWriter {
114113
public void writeField(FieldInfo fieldInfo, KnnVectorsReader knnVectorsReader)
115114
throws IOException {
116115
long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
117-
118116
VectorValues vectors = knnVectorsReader.getVectorValues(fieldInfo.name);
119-
// TODO - use a better data structure; a bitset? DocsWithFieldSet is package-private in org.apache.lucene.index
120-
int[] docIds = writeVectorData(vectorData, vectors);
121-
assert vectors.size() == docIds.length;
122-
123-
long[] offsets = new long[docIds.length];
124-
long vectorIndexOffset = vectorIndex.getFilePointer();
125-
if (vectors instanceof RandomAccessVectorValuesProducer) {
126-
writeGraph(
127-
vectorIndex,
128-
(RandomAccessVectorValuesProducer) vectors,
129-
fieldInfo.getVectorSimilarityFunction(),
130-
vectorIndexOffset,
131-
offsets,
132-
maxConn,
133-
beamWidth);
134-
} else {
135-
throw new IllegalArgumentException(
136-
"Indexing an HNSW graph requires a random access vector values, got " + vectors);
137-
}
138-
139-
long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
140-
long vectorIndexLength = vectorIndex.getFilePointer() - vectorIndexOffset;
141-
writeMeta(
142-
fieldInfo,
143-
vectorDataOffset,
144-
vectorDataLength,
145-
vectorIndexOffset,
146-
vectorIndexLength,
147-
docIds);
148-
writeGraphOffsets(meta, offsets);
149-
}
150-
151-
@Override
152-
public void merge(MergeState mergeState) throws IOException {
153-
for (int i = 0; i < mergeState.fieldInfos.length; i++) {
154-
KnnVectorsReader reader = mergeState.knnVectorsReaders[i];
155-
assert reader != null || mergeState.fieldInfos[i].hasVectorValues() == false;
156-
if (reader != null) {
157-
reader.checkIntegrity();
158-
}
159-
}
160-
161-
for (FieldInfo fieldInfo : mergeState.mergeFieldInfos) {
162-
if (fieldInfo.hasVectorValues()) {
163-
if (mergeState.infoStream.isEnabled("VV")) {
164-
mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
165-
}
166-
mergeField(fieldInfo, mergeState);
167-
if (mergeState.infoStream.isEnabled("VV")) {
168-
mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
169-
}
170-
}
171-
}
172-
finish();
173-
}
174117

175-
private void mergeField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
176-
if (mergeState.infoStream.isEnabled("VV")) {
177-
mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
178-
}
179-
180-
long vectorDataOffset = vectorData.alignFilePointer(Float.BYTES);
181-
182-
VectorValues vectors = MergedVectorValues.mergeVectorValues(fieldInfo, mergeState);
183118
IndexOutput tempVectorData =
184119
segmentWriteState.directory.createTempOutput(
185120
vectorData.getName(), "temp", segmentWriteState.context);
186121
IndexInput vectorDataInput = null;
187122
boolean success = false;
188123
try {
189-
// write the merged vector data to a temporary file
124+
// write the vector data to a temporary file
125+
// TODO - use a better data structure; a bitset? DocsWithFieldSet is package-private in org.apache.lucene.index
190126
int[] docIds = writeVectorData(tempVectorData, vectors);
191127
CodecUtil.writeFooter(tempVectorData);
192128
IOUtils.close(tempVectorData);
@@ -235,10 +171,6 @@ private void mergeField(FieldInfo fieldInfo, MergeState mergeState) throws IOExc
235171
segmentWriteState.directory, tempVectorData.getName());
236172
}
237173
}
238-
239-
if (mergeState.infoStream.isEnabled("VV")) {
240-
mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
241-
}
242174
}
243175

244176
/**

lucene/test-framework/src/java/org/apache/lucene/tests/index/BaseKnnVectorsFormatTestCase.java

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,6 @@
3737
import org.apache.lucene.index.IndexWriterConfig;
3838
import org.apache.lucene.index.LeafReader;
3939
import org.apache.lucene.index.LeafReaderContext;
40-
import org.apache.lucene.index.RandomAccessVectorValues;
41-
import org.apache.lucene.index.RandomAccessVectorValuesProducer;
4240
import org.apache.lucene.index.Term;
4341
import org.apache.lucene.index.VectorSimilarityFunction;
4442
import org.apache.lucene.index.VectorValues;
@@ -693,12 +691,6 @@ public void testSortedIndex() throws Exception {
693691
assertEquals("4", leaf.document(vectorValues.nextDoc()).get("id"));
694692
assertEquals(0, vectorValues.vectorValue()[0], 0);
695693
assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
696-
697-
RandomAccessVectorValues ra =
698-
((RandomAccessVectorValuesProducer) vectorValues).randomAccess();
699-
assertEquals(-1f, ra.vectorValue(0)[0], 0);
700-
assertEquals(1f, ra.vectorValue(1)[0], 0);
701-
assertEquals(0f, ra.vectorValue(2)[0], 0);
702694
}
703695
}
704696
}

0 commit comments

Comments
 (0)