Skip to content

Commit a47ba33

Browse files
authored
Refactoring HNSW to use a new internal FlatVectorFormat (#12729)
Currently the HNSW codec does too many things: it not only indexes vectors, but also stores them and determines how to store them given the vector type. This PR extracts the vector storage into a new format, `Lucene99FlatVectorsFormat`, and adds a new base class called `FlatVectorsFormat`. This allows for some additional helper functions that let an indexing codec (like HNSW) take advantage of the flat formats. Additionally, this PR refactors the new `Lucene99ScalarQuantizedVectorsFormat` to be a `FlatVectorsFormat`. Now, `Lucene99HnswVectorsFormat` is constructed with a `Lucene99FlatVectorsFormat`, and there is a new `Lucene99HnswScalarQuantizedVectorsFormat` that uses `Lucene99ScalarQuantizedVectorsFormat`.
1 parent c28d174 commit a47ba33

33 files changed

+2222
-1008
lines changed

lucene/CHANGES.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,10 @@ New Features
184184
* GITHUB#12660: HNSW graphs can now be merged using multiple threads. Configurable in Lucene99HnswVectorsFormat.
185185
(Patrick Zhai)
186186

187+
* GITHUB#12729: Add new Lucene99FlatVectorsFormat for writing vectors in a flat format and refactor
188+
Lucene99ScalarQuantizedVectorsFormat & Lucene99HnswVectorsFormat to reuse the flat formats.
189+
Additionally, this allows flat formats to be pluggable independently of HNSW. (Ben Trent)
190+
187191
Improvements
188192
---------------------
189193
* GITHUB#12523: TaskExecutor waits for all tasks to complete before returning when Exceptions

lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene92/OffHeapFloatVectorValues.java

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -80,8 +80,6 @@ static OffHeapFloatVectorValues load(
8080
}
8181
}
8282

83-
abstract Bits getAcceptOrds(Bits acceptDocs);
84-
8583
static class DenseOffHeapVectorValues extends OffHeapFloatVectorValues {
8684

8785
private int doc = -1;
@@ -120,7 +118,7 @@ public RandomAccessVectorValues<float[]> copy() throws IOException {
120118
}
121119

122120
@Override
123-
Bits getAcceptOrds(Bits acceptDocs) {
121+
public Bits getAcceptOrds(Bits acceptDocs) {
124122
return acceptDocs;
125123
}
126124
}
@@ -184,7 +182,7 @@ public int ordToDoc(int ord) {
184182
}
185183

186184
@Override
187-
Bits getAcceptOrds(Bits acceptDocs) {
185+
public Bits getAcceptOrds(Bits acceptDocs) {
188186
if (acceptDocs == null) {
189187
return null;
190188
}
@@ -256,7 +254,7 @@ public int ordToDoc(int ord) {
256254
}
257255

258256
@Override
259-
Bits getAcceptOrds(Bits acceptDocs) {
257+
public Bits getAcceptOrds(Bits acceptDocs) {
260258
return null;
261259
}
262260
}

lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapByteVectorValues.java

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,6 @@ static OffHeapByteVectorValues load(
8989
}
9090
}
9191

92-
abstract Bits getAcceptOrds(Bits acceptDocs);
93-
9492
static class DenseOffHeapVectorValues extends OffHeapByteVectorValues {
9593

9694
private int doc = -1;
@@ -129,7 +127,7 @@ public RandomAccessVectorValues<byte[]> copy() throws IOException {
129127
}
130128

131129
@Override
132-
Bits getAcceptOrds(Bits acceptDocs) {
130+
public Bits getAcceptOrds(Bits acceptDocs) {
133131
return acceptDocs;
134132
}
135133
}
@@ -196,7 +194,7 @@ public int ordToDoc(int ord) {
196194
}
197195

198196
@Override
199-
Bits getAcceptOrds(Bits acceptDocs) {
197+
public Bits getAcceptOrds(Bits acceptDocs) {
200198
if (acceptDocs == null) {
201199
return null;
202200
}
@@ -268,7 +266,7 @@ public int ordToDoc(int ord) {
268266
}
269267

270268
@Override
271-
Bits getAcceptOrds(Bits acceptDocs) {
269+
public Bits getAcceptOrds(Bits acceptDocs) {
272270
return null;
273271
}
274272
}

lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene94/OffHeapFloatVectorValues.java

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,6 @@ static OffHeapFloatVectorValues load(
8686
}
8787
}
8888

89-
abstract Bits getAcceptOrds(Bits acceptDocs);
90-
9189
static class DenseOffHeapVectorValues extends OffHeapFloatVectorValues {
9290

9391
private int doc = -1;
@@ -126,7 +124,7 @@ public RandomAccessVectorValues<float[]> copy() throws IOException {
126124
}
127125

128126
@Override
129-
Bits getAcceptOrds(Bits acceptDocs) {
127+
public Bits getAcceptOrds(Bits acceptDocs) {
130128
return acceptDocs;
131129
}
132130
}
@@ -193,7 +191,7 @@ public int ordToDoc(int ord) {
193191
}
194192

195193
@Override
196-
Bits getAcceptOrds(Bits acceptDocs) {
194+
public Bits getAcceptOrds(Bits acceptDocs) {
197195
if (acceptDocs == null) {
198196
return null;
199197
}
@@ -265,7 +263,7 @@ public int ordToDoc(int ord) {
265263
}
266264

267265
@Override
268-
Bits getAcceptOrds(Bits acceptDocs) {
266+
public Bits getAcceptOrds(Bits acceptDocs) {
269267
return null;
270268
}
271269
}

lucene/core/src/java/module-info.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
*/
1717

1818
import org.apache.lucene.codecs.lucene99.Lucene99Codec;
19-
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
2019

2120
/** Lucene Core. */
2221
@SuppressWarnings("module") // the test framework is compiled after the core...
@@ -70,7 +69,8 @@
7069
provides org.apache.lucene.codecs.DocValuesFormat with
7170
org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
7271
provides org.apache.lucene.codecs.KnnVectorsFormat with
73-
Lucene99HnswVectorsFormat;
72+
org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat,
73+
org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
7474
provides org.apache.lucene.codecs.PostingsFormat with
7575
org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat;
7676
provides org.apache.lucene.index.SortFieldProvider with
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.lucene.codecs;
19+
20+
/**
21+
* Vectors' writer for a field
22+
*
23+
* @param <T> an array type; the type of vectors to be written
24+
* @lucene.experimental
25+
*/
26+
public abstract class FlatFieldVectorsWriter<T> extends KnnFieldVectorsWriter<T> {
27+
28+
/**
29+
* The delegate to write to, can be null. When non-null, all vectors seen should be written to the
30+
* delegate along with being written to the flat vectors.
31+
*/
32+
protected final KnnFieldVectorsWriter<T> indexingDelegate;
33+
34+
/**
35+
* Sole constructor that expects some indexingDelegate. All vectors seen should be written to the
36+
* delegate along with being written to the flat vectors.
37+
*
38+
* @param indexingDelegate the delegate to write to, can be null
39+
*/
40+
protected FlatFieldVectorsWriter(KnnFieldVectorsWriter<T> indexingDelegate) {
41+
this.indexingDelegate = indexingDelegate;
42+
}
43+
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.lucene.codecs;
19+
20+
import java.io.IOException;
21+
import org.apache.lucene.index.SegmentReadState;
22+
import org.apache.lucene.index.SegmentWriteState;
23+
24+
/**
25+
* Encodes/decodes per-document vectors
26+
*
27+
* @lucene.experimental
28+
*/
29+
public abstract class FlatVectorsFormat {
30+
31+
/** Sole constructor */
32+
protected FlatVectorsFormat() {}
33+
34+
/** Returns a {@link FlatVectorsWriter} to write the vectors to the index. */
35+
public abstract FlatVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException;
36+
37+
/** Returns a {@link FlatVectorsReader} to read the vectors from the index. */
38+
public abstract FlatVectorsReader fieldsReader(SegmentReadState state) throws IOException;
39+
}
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.lucene.codecs;
19+
20+
import java.io.Closeable;
21+
import java.io.IOException;
22+
import org.apache.lucene.index.ByteVectorValues;
23+
import org.apache.lucene.index.FieldInfo;
24+
import org.apache.lucene.index.FloatVectorValues;
25+
import org.apache.lucene.util.Accountable;
26+
import org.apache.lucene.util.hnsw.RandomVectorScorer;
27+
28+
/**
29+
* Reads vectors from an index. When searching this reader, it iterates every vector in the index
30+
* and scores them
31+
*
32+
* <p>This class is useful when:
33+
*
34+
* <ul>
35+
* <li>the number of vectors is small
36+
* <li>when used alongside some additional indexing structure that can be used to better search
37+
* the vectors (like HNSW).
38+
* </ul>
39+
*
40+
* @lucene.experimental
41+
*/
42+
public abstract class FlatVectorsReader implements Closeable, Accountable {
43+
44+
/** Sole constructor */
45+
protected FlatVectorsReader() {}
46+
47+
/**
48+
* Returns a {@link RandomVectorScorer} for the given field and target vector.
49+
*
50+
* @param field the field to search
51+
* @param target the target vector
52+
* @return a {@link RandomVectorScorer} for the given field and target vector.
53+
* @throws IOException if an I/O error occurs when reading from the index.
54+
*/
55+
public abstract RandomVectorScorer getRandomVectorScorer(String field, float[] target)
56+
throws IOException;
57+
58+
/**
59+
* Returns a {@link RandomVectorScorer} for the given field and target vector.
60+
*
61+
* @param field the field to search
62+
* @param target the target vector
63+
* @return a {@link RandomVectorScorer} for the given field and target vector.
64+
* @throws IOException if an I/O error occurs when reading from the index.
65+
*/
66+
public abstract RandomVectorScorer getRandomVectorScorer(String field, byte[] target)
67+
throws IOException;
68+
69+
/**
70+
* Checks consistency of this reader.
71+
*
72+
* <p>Note that this may be costly in terms of I/O, e.g. may involve computing a checksum value
73+
* against large data files.
74+
*
75+
* @lucene.internal
76+
*/
77+
public abstract void checkIntegrity() throws IOException;
78+
79+
/**
80+
* Returns the {@link FloatVectorValues} for the given {@code field}. The behavior is undefined if
81+
* the given field doesn't have KNN vectors enabled on its {@link FieldInfo}. The return value is
82+
* never {@code null}.
83+
*/
84+
public abstract FloatVectorValues getFloatVectorValues(String field) throws IOException;
85+
86+
/**
87+
* Returns the {@link ByteVectorValues} for the given {@code field}. The behavior is undefined if
88+
* the given field doesn't have KNN vectors enabled on its {@link FieldInfo}. The return value is
89+
* never {@code null}.
90+
*/
91+
public abstract ByteVectorValues getByteVectorValues(String field) throws IOException;
92+
}
Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.lucene.codecs;
19+
20+
import java.io.Closeable;
21+
import java.io.IOException;
22+
import org.apache.lucene.index.FieldInfo;
23+
import org.apache.lucene.index.MergeState;
24+
import org.apache.lucene.index.Sorter;
25+
import org.apache.lucene.util.Accountable;
26+
import org.apache.lucene.util.IOUtils;
27+
import org.apache.lucene.util.hnsw.CloseableRandomVectorScorerSupplier;
28+
29+
/**
30+
* Vectors' writer for a field that allows additional indexing logic to be implemented by the caller
31+
*
32+
* @lucene.experimental
33+
*/
34+
public abstract class FlatVectorsWriter implements Accountable, Closeable {
35+
36+
/** Sole constructor */
37+
protected FlatVectorsWriter() {}
38+
39+
/**
40+
* Add a new field for indexing, allowing the user to provide a writer that the flat vectors
41+
* writer can delegate to if additional indexing logic is required.
42+
*
43+
* @param fieldInfo fieldInfo of the field to add
44+
* @param indexWriter the writer to delegate to, can be null
45+
* @return a writer for the field
46+
* @throws IOException if an I/O error occurs when adding the field
47+
*/
48+
public abstract FlatFieldVectorsWriter<?> addField(
49+
FieldInfo fieldInfo, KnnFieldVectorsWriter<?> indexWriter) throws IOException;
50+
51+
/**
52+
* Write the field for merging, providing a scorer over the newly merged flat vectors. This way
53+
* any additional merging logic can be implemented by the user of this class.
54+
*
55+
* @param fieldInfo fieldInfo of the field to merge
56+
* @param mergeState mergeState of the segments to merge
57+
* @return a scorer over the newly merged flat vectors, which should be closed as it holds a
58+
* temporary file handle to read over the newly merged vectors
59+
* @throws IOException if an I/O error occurs when merging
60+
*/
61+
public abstract CloseableRandomVectorScorerSupplier mergeOneFieldToIndex(
62+
FieldInfo fieldInfo, MergeState mergeState) throws IOException;
63+
64+
/** Write field for merging */
65+
public void mergeOneField(FieldInfo fieldInfo, MergeState mergeState) throws IOException {
66+
IOUtils.close(mergeOneFieldToIndex(fieldInfo, mergeState));
67+
}
68+
69+
/** Called once at the end before close */
70+
public abstract void finish() throws IOException;
71+
72+
/** Flush all buffered data to disk. */
73+
public abstract void flush(int maxDoc, Sorter.DocMap sortMap) throws IOException;
74+
}

lucene/core/src/java/org/apache/lucene/codecs/lucene95/OffHeapByteVectorValues.java

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,8 +94,6 @@ public static OffHeapByteVectorValues load(
9494
}
9595
}
9696

97-
public abstract Bits getAcceptOrds(Bits acceptDocs);
98-
9997
/**
10098
* Dense vector values that are stored off-heap. This is the most common case when every doc has a
10199
* vector.

0 commit comments

Comments
 (0)