Skip to content

Commit 6f42ab3

Browse files
committed
Moves the 102 legacy binary formats to backwards codecs (#15353)
Users should utilize the new 104 scalar quantization format with the asymmetric quantization encoding. When migrating to the new format, the main change is that instead of using `new Lucene102HnswBinaryQuantizedVectorsFormat()`, users should use `new Lucene104HnswScalarQuantizedVectorsFormat(ScalarEncoding.SINGLE_BIT_QUERY_NIBBLE, DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, DEFAULT_NUM_MERGE_WORKER, null, HNSW_GRAPH_THRESHOLD)`.
1 parent cb5ab36 commit 6f42ab3

20 files changed

+346
-563
lines changed

lucene/CHANGES.txt

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,11 @@ New Features
3535
`Lucene104HnswScalarQuantizedVectorsFormat` replaces the now legacy `Lucene99HnswScalarQuantizedVectorsFormat`
3636
(Trevor McCulloch)
3737

38-
* GITHUB#15271: Extend `Lucene104ScalarQuantizedVectorsFormat` and `Lucene104HnswScalarQuantizedVectorsFormat` to
38+
* GITHUB#15271, GITHUB#15353: Extend `Lucene104ScalarQuantizedVectorsFormat` and `Lucene104HnswScalarQuantizedVectorsFormat` to
3939
allow asymmetric quantization. The initially supported bits are single bit with 4 bit queries. This is a replacement
40-
for the now legacy `Lucene102HnswBinaryQuantizedVectorsFormat` and `Lucene102BinaryQuantizedVectorsFormat`.
40+
for the now legacy `Lucene102HnswBinaryQuantizedVectorsFormat` and `Lucene102BinaryQuantizedVectorsFormat`. To use
41+
the new format for asymmetric binary quantization, use
42+
`Lucene104HnswScalarQuantizedVectorsFormat(ScalarEncoding.SINGLE_BIT_QUERY_NIBBLE, int, int)`
4143
(Ben Trent)
4244

4345
Improvements

lucene/backward-codecs/src/java/module-info.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
exports org.apache.lucene.backward_codecs.lucene912;
4141
exports org.apache.lucene.backward_codecs.lucene100;
4242
exports org.apache.lucene.backward_codecs.lucene101;
43+
exports org.apache.lucene.backward_codecs.lucene102;
4344
exports org.apache.lucene.backward_codecs.lucene103;
4445
exports org.apache.lucene.backward_codecs.packed;
4546
exports org.apache.lucene.backward_codecs.store;
@@ -61,7 +62,9 @@
6162
org.apache.lucene.backward_codecs.lucene94.Lucene94HnswVectorsFormat,
6263
org.apache.lucene.backward_codecs.lucene95.Lucene95HnswVectorsFormat,
6364
org.apache.lucene.backward_codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat,
64-
org.apache.lucene.backward_codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
65+
org.apache.lucene.backward_codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat,
66+
org.apache.lucene.backward_codecs.lucene102.Lucene102BinaryQuantizedVectorsFormat,
67+
org.apache.lucene.backward_codecs.lucene102.Lucene102HnswBinaryQuantizedVectorsFormat;
6568
provides org.apache.lucene.codecs.Codec with
6669
org.apache.lucene.backward_codecs.lucene80.Lucene80Codec,
6770
org.apache.lucene.backward_codecs.lucene84.Lucene84Codec,

lucene/core/src/java/org/apache/lucene/codecs/lucene102/BinarizedByteVectorValues.java renamed to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene102/BinarizedByteVectorValues.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* See the License for the specific language governing permissions and
1515
* limitations under the License.
1616
*/
17-
package org.apache.lucene.codecs.lucene102;
17+
package org.apache.lucene.backward_codecs.lucene102;
1818

1919
import static org.apache.lucene.util.quantization.OptimizedScalarQuantizer.discretize;
2020

@@ -51,15 +51,15 @@ abstract class BinarizedByteVectorValues extends ByteVectorValues {
5151
* @return the corrective terms
5252
* @throws IOException if an I/O error occurs
5353
*/
54-
public abstract OptimizedScalarQuantizer.QuantizationResult getCorrectiveTerms(int vectorOrd)
54+
abstract OptimizedScalarQuantizer.QuantizationResult getCorrectiveTerms(int vectorOrd)
5555
throws IOException;
5656

5757
/**
5858
* @return the quantizer used to quantize the vectors
5959
*/
60-
public abstract OptimizedScalarQuantizer getQuantizer();
60+
abstract OptimizedScalarQuantizer getQuantizer();
6161

62-
public abstract float[] getCentroid() throws IOException;
62+
abstract float[] getCentroid() throws IOException;
6363

6464
int discretizedDimensions() {
6565
return discretize(dimension(), 64);
@@ -71,7 +71,7 @@ int discretizedDimensions() {
7171
* @param query the query vector
7272
* @return a {@link VectorScorer} instance or null
7373
*/
74-
public abstract VectorScorer scorer(float[] query) throws IOException;
74+
abstract VectorScorer scorer(float[] query) throws IOException;
7575

7676
@Override
7777
public abstract BinarizedByteVectorValues copy() throws IOException;
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.backward_codecs.lucene102;
18+
19+
import static org.apache.lucene.backward_codecs.lucene102.Lucene102BinaryQuantizedVectorsFormat.QUERY_BITS;
20+
import static org.apache.lucene.index.VectorSimilarityFunction.COSINE;
21+
import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN;
22+
import static org.apache.lucene.index.VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT;
23+
import static org.apache.lucene.util.quantization.OptimizedScalarQuantizer.transposeHalfByte;
24+
25+
import java.io.IOException;
26+
import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
27+
import org.apache.lucene.index.KnnVectorValues;
28+
import org.apache.lucene.index.VectorSimilarityFunction;
29+
import org.apache.lucene.util.ArrayUtil;
30+
import org.apache.lucene.util.VectorUtil;
31+
import org.apache.lucene.util.hnsw.RandomVectorScorer;
32+
import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
33+
import org.apache.lucene.util.quantization.OptimizedScalarQuantizer;
34+
import org.apache.lucene.util.quantization.OptimizedScalarQuantizer.QuantizationResult;
35+
36+
/** Vector scorer over binarized vector values */
37+
public class Lucene102BinaryFlatVectorsScorer implements FlatVectorsScorer {
38+
/** The delegate scorer for non-quantized vectors */
39+
protected final FlatVectorsScorer nonQuantizedDelegate;
40+
41+
/** Scaling factor for 4-bit quantization */
42+
protected static final float FOUR_BIT_SCALE = 1f / ((1 << 4) - 1);
43+
44+
/**
45+
* Construct a new scorer
46+
*
47+
* @param nonQuantizedDelegate the delegate scorer for non-quantized vectors
48+
*/
49+
public Lucene102BinaryFlatVectorsScorer(FlatVectorsScorer nonQuantizedDelegate) {
50+
this.nonQuantizedDelegate = nonQuantizedDelegate;
51+
}
52+
53+
@Override
54+
public RandomVectorScorerSupplier getRandomVectorScorerSupplier(
55+
VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues)
56+
throws IOException {
57+
throw new UnsupportedOperationException("Old codecs may only be used for reading");
58+
}
59+
60+
@Override
61+
public RandomVectorScorer getRandomVectorScorer(
62+
VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target)
63+
throws IOException {
64+
if (vectorValues instanceof BinarizedByteVectorValues binarizedVectors) {
65+
OptimizedScalarQuantizer quantizer = binarizedVectors.getQuantizer();
66+
float[] centroid = binarizedVectors.getCentroid();
67+
// We make a copy as the quantization process mutates the input
68+
float[] copy = ArrayUtil.copyOfSubArray(target, 0, target.length);
69+
if (similarityFunction == COSINE) {
70+
VectorUtil.l2normalize(copy);
71+
}
72+
target = copy;
73+
byte[] initial = new byte[target.length];
74+
byte[] quantized = new byte[QUERY_BITS * binarizedVectors.discretizedDimensions() / 8];
75+
QuantizationResult queryCorrections =
76+
quantizer.scalarQuantize(target, initial, (byte) 4, centroid);
77+
transposeHalfByte(initial, quantized);
78+
return new RandomVectorScorer.AbstractRandomVectorScorer(binarizedVectors) {
79+
@Override
80+
public float score(int node) throws IOException {
81+
return quantizedScore(
82+
quantized, queryCorrections, binarizedVectors, node, similarityFunction);
83+
}
84+
};
85+
}
86+
return nonQuantizedDelegate.getRandomVectorScorer(similarityFunction, vectorValues, target);
87+
}
88+
89+
@Override
90+
public RandomVectorScorer getRandomVectorScorer(
91+
VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target)
92+
throws IOException {
93+
return nonQuantizedDelegate.getRandomVectorScorer(similarityFunction, vectorValues, target);
94+
}
95+
96+
@Override
97+
public String toString() {
98+
return "Lucene102BinaryFlatVectorsScorer(nonQuantizedDelegate=" + nonQuantizedDelegate + ")";
99+
}
100+
101+
static float quantizedScore(
102+
byte[] quantizedQuery,
103+
QuantizationResult queryCorrections,
104+
BinarizedByteVectorValues targetVectors,
105+
int targetOrd,
106+
VectorSimilarityFunction similarityFunction)
107+
throws IOException {
108+
byte[] binaryCode = targetVectors.vectorValue(targetOrd);
109+
float qcDist = VectorUtil.int4BitDotProduct(quantizedQuery, binaryCode);
110+
QuantizationResult indexCorrections = targetVectors.getCorrectiveTerms(targetOrd);
111+
float x1 = indexCorrections.quantizedComponentSum();
112+
float ax = indexCorrections.lowerInterval();
113+
// Here we assume `lx` is simply bit vectors, so the scaling isn't necessary
114+
float lx = indexCorrections.upperInterval() - ax;
115+
float ay = queryCorrections.lowerInterval();
116+
float ly = (queryCorrections.upperInterval() - ay) * FOUR_BIT_SCALE;
117+
float y1 = queryCorrections.quantizedComponentSum();
118+
float score =
119+
ax * ay * targetVectors.dimension() + ay * lx * x1 + ax * ly * y1 + lx * ly * qcDist;
120+
// For euclidean, we need to invert the score and apply the additional correction, which is
121+
// assumed to be the squared l2norm of the centroid centered vectors.
122+
if (similarityFunction == EUCLIDEAN) {
123+
score =
124+
queryCorrections.additionalCorrection()
125+
+ indexCorrections.additionalCorrection()
126+
- 2 * score;
127+
return Math.max(1 / (1f + score), 0);
128+
} else {
129+
// For cosine and max inner product, we need to apply the additional correction, which is
130+
// assumed to be the non-centered dot-product between the vector and the centroid
131+
score +=
132+
queryCorrections.additionalCorrection()
133+
+ indexCorrections.additionalCorrection()
134+
- targetVectors.getCentroidDP();
135+
if (similarityFunction == MAXIMUM_INNER_PRODUCT) {
136+
return VectorUtil.scaleMaxInnerProductScore(score);
137+
}
138+
return Math.max((1f + score) / 2f, 0);
139+
}
140+
}
141+
}

lucene/core/src/java/org/apache/lucene/codecs/lucene102/Lucene102BinaryQuantizedVectorsFormat.java renamed to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryQuantizedVectorsFormat.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* See the License for the specific language governing permissions and
1515
* limitations under the License.
1616
*/
17-
package org.apache.lucene.codecs.lucene102;
17+
package org.apache.lucene.backward_codecs.lucene102;
1818

1919
import java.io.IOException;
2020
import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil;
@@ -89,11 +89,11 @@
8989
*/
9090
public class Lucene102BinaryQuantizedVectorsFormat extends FlatVectorsFormat {
9191

92-
public static final byte QUERY_BITS = 4;
93-
public static final byte INDEX_BITS = 1;
92+
static final byte QUERY_BITS = 4;
93+
static final byte INDEX_BITS = 1;
9494

95-
public static final String BINARIZED_VECTOR_COMPONENT = "BVEC";
96-
public static final String NAME = "Lucene102BinaryQuantizedVectorsFormat";
95+
static final String BINARIZED_VECTOR_COMPONENT = "BVEC";
96+
static final String NAME = "Lucene102BinaryQuantizedVectorsFormat";
9797

9898
static final int VERSION_START = 0;
9999
static final int VERSION_CURRENT = VERSION_START;
@@ -103,7 +103,8 @@ public class Lucene102BinaryQuantizedVectorsFormat extends FlatVectorsFormat {
103103
static final String VECTOR_DATA_EXTENSION = "veb";
104104
static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16;
105105

106-
private static final FlatVectorsFormat rawVectorFormat =
106+
/** The raw (unquantized) vector format used to read the original vectors. */
107+
protected static final FlatVectorsFormat rawVectorFormat =
107108
new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getLucene99FlatVectorsScorer());
108109

109110
private static final Lucene102BinaryFlatVectorsScorer scorer =
@@ -116,8 +117,7 @@ public Lucene102BinaryQuantizedVectorsFormat() {
116117

117118
@Override
118119
public FlatVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
119-
return new Lucene102BinaryQuantizedVectorsWriter(
120-
scorer, rawVectorFormat.fieldsWriter(state), state);
120+
throw new UnsupportedOperationException("Old codecs may only be used for reading");
121121
}
122122

123123
@Override

lucene/core/src/java/org/apache/lucene/codecs/lucene102/Lucene102BinaryQuantizedVectorsReader.java renamed to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryQuantizedVectorsReader.java

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@
1414
* See the License for the specific language governing permissions and
1515
* limitations under the License.
1616
*/
17-
package org.apache.lucene.codecs.lucene102;
17+
package org.apache.lucene.backward_codecs.lucene102;
1818

19-
import static org.apache.lucene.codecs.lucene102.Lucene102BinaryQuantizedVectorsFormat.VECTOR_DATA_EXTENSION;
19+
import static org.apache.lucene.backward_codecs.lucene102.Lucene102BinaryQuantizedVectorsFormat.VECTOR_DATA_EXTENSION;
2020
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readSimilarityFunction;
2121
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding;
2222
import static org.apache.lucene.util.quantization.OptimizedScalarQuantizer.discretize;
@@ -55,7 +55,7 @@
5555
import org.apache.lucene.util.quantization.OptimizedScalarQuantizer;
5656

5757
/** Reader for binary quantized vectors in the Lucene 10.2 format. */
58-
class Lucene102BinaryQuantizedVectorsReader extends FlatVectorsReader {
58+
public class Lucene102BinaryQuantizedVectorsReader extends FlatVectorsReader {
5959

6060
private static final long SHALLOW_SIZE =
6161
RamUsageEstimator.shallowSizeOfInstance(Lucene102BinaryQuantizedVectorsReader.class);
@@ -65,7 +65,15 @@ class Lucene102BinaryQuantizedVectorsReader extends FlatVectorsReader {
6565
private final FlatVectorsReader rawVectorsReader;
6666
private final Lucene102BinaryFlatVectorsScorer vectorScorer;
6767

68-
Lucene102BinaryQuantizedVectorsReader(
68+
/**
69+
* Creates a new reader for binary quantized vectors.
70+
*
71+
* @param state the segment read state
72+
* @param rawVectorsReader the reader for the raw (non-quantized) vectors
73+
* @param vectorsScorer the scorer for binary quantized vectors
74+
* @throws IOException if an I/O error occurs
75+
*/
76+
public Lucene102BinaryQuantizedVectorsReader(
6977
SegmentReadState state,
7078
FlatVectorsReader rawVectorsReader,
7179
Lucene102BinaryFlatVectorsScorer vectorsScorer)
@@ -101,7 +109,7 @@ class Lucene102BinaryQuantizedVectorsReader extends FlatVectorsReader {
101109
openDataInput(
102110
state,
103111
versionMeta,
104-
Lucene102BinaryQuantizedVectorsFormat.VECTOR_DATA_EXTENSION,
112+
VECTOR_DATA_EXTENSION,
105113
Lucene102BinaryQuantizedVectorsFormat.VECTOR_DATA_CODEC_NAME,
106114
// Quantized vectors are accessed randomly from their node ID stored in the HNSW
107115
// graph.
@@ -277,7 +285,7 @@ public Map<String, Long> getOffHeapByteSize(FieldInfo fieldInfo) {
277285
return KnnVectorsReader.mergeOffHeapByteSizeMaps(raw, quant);
278286
}
279287

280-
public float[] getCentroid(String field) {
288+
float[] getCentroid(String field) {
281289
FieldEntry fieldEntry = fields.get(field);
282290
if (fieldEntry != null) {
283291
return fieldEntry.centroid;

lucene/core/src/java/org/apache/lucene/codecs/lucene102/Lucene102HnswBinaryQuantizedVectorsFormat.java renamed to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene102/Lucene102HnswBinaryQuantizedVectorsFormat.java

Lines changed: 11 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
* See the License for the specific language governing permissions and
1515
* limitations under the License.
1616
*/
17-
package org.apache.lucene.codecs.lucene102;
17+
package org.apache.lucene.backward_codecs.lucene102;
1818

1919
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
2020
import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN;
@@ -30,7 +30,6 @@
3030
import org.apache.lucene.codecs.hnsw.FlatVectorsFormat;
3131
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
3232
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader;
33-
import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsWriter;
3433
import org.apache.lucene.index.SegmentReadState;
3534
import org.apache.lucene.index.SegmentWriteState;
3635
import org.apache.lucene.search.TaskExecutor;
@@ -42,43 +41,36 @@
4241
*/
4342
public class Lucene102HnswBinaryQuantizedVectorsFormat extends KnnVectorsFormat {
4443

45-
public static final String NAME = "Lucene102HnswBinaryQuantizedVectorsFormat";
44+
static final String NAME = "Lucene102HnswBinaryQuantizedVectorsFormat";
4645

4746
/**
4847
* Controls how many of the nearest neighbor candidates are connected to the new node. Defaults to
4948
* {@link Lucene99HnswVectorsFormat#DEFAULT_MAX_CONN}. See {@link HnswGraph} for more details.
5049
*/
51-
private final int maxConn;
50+
protected final int maxConn;
5251

5352
/**
5453
* The number of candidate neighbors to track while searching the graph for each newly inserted
5554
* node. Defaults to {@link Lucene99HnswVectorsFormat#DEFAULT_BEAM_WIDTH}. See {@link HnswGraph}
5655
* for details.
5756
*/
58-
private final int beamWidth;
57+
protected final int beamWidth;
5958

6059
/** The format for storing, reading, merging vectors on disk */
61-
private static final FlatVectorsFormat flatVectorsFormat =
60+
protected static final FlatVectorsFormat flatVectorsFormat =
6261
new Lucene102BinaryQuantizedVectorsFormat();
6362

64-
private final int numMergeWorkers;
65-
private final TaskExecutor mergeExec;
63+
/** Number of workers (threads) that will be used when doing merge. */
64+
protected final int numMergeWorkers;
65+
66+
/** The {@link TaskExecutor} that will be used to do merge. */
67+
protected final TaskExecutor mergeExec;
6668

6769
/** Constructs a format using default graph construction parameters */
6870
public Lucene102HnswBinaryQuantizedVectorsFormat() {
6971
this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, DEFAULT_NUM_MERGE_WORKER, null);
7072
}
7173

72-
/**
73-
* Constructs a format using the given graph construction parameters.
74-
*
75-
* @param maxConn the maximum number of connections to a node in the HNSW graph
76-
* @param beamWidth the size of the queue maintained during graph construction.
77-
*/
78-
public Lucene102HnswBinaryQuantizedVectorsFormat(int maxConn, int beamWidth) {
79-
this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, null);
80-
}
81-
8274
/**
8375
* Constructs a format using the given graph construction parameters and scalar quantization.
8476
*
@@ -122,14 +114,7 @@ public Lucene102HnswBinaryQuantizedVectorsFormat(
122114

123115
@Override
124116
public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
125-
return new Lucene99HnswVectorsWriter(
126-
state,
127-
maxConn,
128-
beamWidth,
129-
flatVectorsFormat.fieldsWriter(state),
130-
numMergeWorkers,
131-
mergeExec,
132-
0);
117+
throw new UnsupportedOperationException("Old codecs may only be used for reading");
133118
}
134119

135120
@Override

0 commit comments

Comments
 (0)