Skip to content

Commit 7f454a6

Browse files
liangkaiwenkliang78alessandrobenedetti
authored
[SOLR-17812] Add support for BinaryQuantizedDenseVectorField (#3468)
* added binary quantisation + documentation and tests --------- Co-authored-by: kliang78 <[email protected]> Co-authored-by: Alessandro Benedetti <[email protected]>
1 parent 4f02148 commit 7f454a6

File tree

5 files changed

+152
-0
lines changed

5 files changed

+152
-0
lines changed

solr/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ New Features
2121

2222
* SOLR-17780: Add support for scalar quantized dense vectors (Kevin Liang via Alessandro Benedetti)
2323

24+
* SOLR-17812: Add support for binary quantized dense vectors (Kevin Liang via Alessandro Benedetti)
25+
2426
* SOLR-17023: Use Modern NLP Models from Apache OpenNLP with Solr (Jeff Zemerick, Eric Pugh)
2527

2628
* SOLR-17814: Add support for PatienceKnnVectorQuery. (Ilaria Petreti via Alessandro Benedetti)
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.solr.schema;
18+
19+
import org.apache.lucene.codecs.KnnVectorsFormat;
20+
import org.apache.lucene.codecs.lucene102.Lucene102HnswBinaryQuantizedVectorsFormat;
21+
22+
/**
 * A {@link DenseVectorField} whose KNN vectors format is Lucene's
 * {@link Lucene102HnswBinaryQuantizedVectorsFormat}.
 *
 * <p>This class overrides only {@link #buildKnnVectorsFormat()}; everything else is inherited
 * unchanged from {@link DenseVectorField}.
 */
public class BinaryQuantizedDenseVectorField extends DenseVectorField {
23+
24+
/**
 * Builds the KNN vectors format used for fields of this type.
 *
 * @return a new {@link Lucene102HnswBinaryQuantizedVectorsFormat} constructed from the values
 *     returned by {@code getHnswMaxConn()} and {@code getHnswBeamWidth()}
 */
@Override
25+
public KnnVectorsFormat buildKnnVectorsFormat() {
26+
return new Lucene102HnswBinaryQuantizedVectorsFormat(getHnswMaxConn(), getHnswBeamWidth());
27+
}
28+
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
<?xml version="1.0" ?>
2+
<!--
3+
Licensed to the Apache Software Foundation (ASF) under one or more
4+
contributor license agreements. See the NOTICE file distributed with
5+
this work for additional information regarding copyright ownership.
6+
The ASF licenses this file to You under the Apache License, Version 2.0
7+
(the "License"); you may not use this file except in compliance with
8+
the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing, software
13+
distributed under the License is distributed on an "AS IS" BASIS,
14+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
See the License for the specific language governing permissions and
16+
limitations under the License.
17+
-->
18+
19+
<!-- Test schema file for the BinaryQuantizedDenseVectorField type -->
20+
21+
<schema name="schema-densevector-bbq" version="1.0">
22+
<fieldType name="string" class="solr.StrField" multiValued="true"/>
23+
<fieldType name="plong" class="solr.LongPointField" useDocValuesAsStored="false"/>
24+
25+
<!-- Binary Bit Quantized vectors -->
26+
<fieldType name="knn_vector_binary_quantized" class="solr.BinaryQuantizedDenseVectorField" vectorDimension="4"/>
27+
28+
<field name="v_bq" type="knn_vector_binary_quantized" indexed="true" stored="true" />
29+
30+
<field name="string_field" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
31+
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
32+
<field name="_version_" type="plong" indexed="true" stored="true" multiValued="false" />
33+
<field name="_text_" type="text_general" indexed="true" stored="false" multiValued="true"/>
34+
<copyField source="*" dest="_text_"/>
35+
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100" multiValued="true">
36+
<analyzer type="index">
37+
<tokenizer class="solr.StandardTokenizerFactory"/>
38+
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
39+
<filter class="solr.LowerCaseFilterFactory"/>
40+
</analyzer>
41+
<analyzer type="query">
42+
<tokenizer class="solr.StandardTokenizerFactory"/>
43+
<filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"/>
44+
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
45+
<filter class="solr.LowerCaseFilterFactory"/>
46+
</analyzer>
47+
</fieldType>
48+
49+
<uniqueKey>id</uniqueKey>
50+
</schema>
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.solr.schema;
18+
19+
import org.apache.solr.core.AbstractBadConfigTestBase;
20+
import org.junit.Test;
21+
22+
/**
 * Schema-loading test for {@link BinaryQuantizedDenseVectorField}.
 *
 * <p>Verifies that a schema declaring a field of this type loads and exposes the configured
 * dimension and indexed/stored flags.
 */
public class BinaryQuantizedDenseVectorFieldTest extends AbstractBadConfigTestBase {
23+
/**
 * Loads a core with {@code solrconfig-basic.xml} and {@code schema-densevector-bq.xml}, then
 * checks that the {@code v_bq} field exists, is backed by a
 * {@link BinaryQuantizedDenseVectorField} of dimension 4, and is both indexed and stored.
 */
@Test
24+
public void fieldDefinition_correctConfiguration_shouldLoadSchemaField() throws Exception {
25+
try {
26+
initCore("solrconfig-basic.xml", "schema-densevector-bq.xml");
27+
IndexSchema schema = h.getCore().getLatestSchema();
28+
29+
SchemaField vector = schema.getField("v_bq");
30+
assertNotNull(vector);
31+
32+
// The cast doubles as a type check: a ClassCastException here fails the test.
BinaryQuantizedDenseVectorField type = (BinaryQuantizedDenseVectorField) vector.getType();
33+
assertEquals(4, type.getDimension());
34+
assertTrue(vector.indexed());
35+
assertTrue(vector.stored());
36+
} finally {
37+
// Always remove the core, even when an assertion above fails.
deleteCore();
38+
}
39+
}
40+
41+
// There are no major interface differences between BinaryQuantizedDenseVectorField and
42+
// DenseVectorField,
43+
// so we can rely on the DenseVectorField tests for validation cases.
44+
//
45+
// As for behavior, there are no externally visible state differences. Internal implementation
46+
// is tested at Lucene level.
47+
}

solr/solr-ref-guide/modules/query-guide/pages/dense-vector-search.adoc

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,31 @@ preserved when `stored` is true.
316316
+
317317
Accepted values: `BOOLEAN`
318318

319+
=== BinaryQuantizedDenseVectorField
320+
321+
Binary quantization is a quantization technique that extends scalar quantization and is even more aggressive in its compression:
322+
it can reduce the in-memory representation of each vector dimension from a 32-bit float down to a single bit.
323+
This is done by normalizing each dimension of a vector relative to a centroid (mid-point pre-calculated against all vectors in the index)
324+
with the stored bit representing whether the actual value is "above" or "below" the centroid's value. A further "corrective factor" is also computed
325+
and stored to help compensate for the accuracy lost in the estimated distance. At query time, asymmetric quantization is applied to the query
326+
vector (reducing its dimension values down to 4 bits each), while still allowing comparison with the stored binary quantized vector via bit arithmetic.
327+
328+
This implementation combines LVQ, proposed in https://arxiv.org/abs/2304.04759[Similarity Search in the Blink of an Eye With Compressed Indices]
329+
by Cecilia Aguerrebere et al., previous work on globally optimized scalar quantization in Apache Lucene, and ideas from
330+
https://arxiv.org/abs/1908.10396[Accelerating Large-Scale Inference with Anisotropic Vector Quantization] by Ruiqi Guo et al.
331+
332+
This vector type is best suited to data sets consisting of large numbers of high-dimensional vectors.
333+
334+
Here is how a BinaryQuantizedDenseVectorField can be defined in the schema:
335+
336+
[source,xml]
337+
<fieldType name="binary_quantized_vector" class="solr.BinaryQuantizedDenseVectorField" vectorDimension="4"/>
338+
<field name="vector" type="binary_quantized_vector" indexed="true" stored="true"/>
339+
340+
BinaryQuantizedDenseVectorField accepts the same parameters as `DenseVectorField` with the only notable exception being
341+
`similarityFunction`. Binary quantization uses its own distance calculation and therefore neither requires nor uses the `similarityFunction`
342+
parameter.
343+
319344
== Query Time
320345

321346
Apache Solr provides three query parsers that work with dense vector fields, that each support different ways of matching documents based on vector similarity: The `knn` query parser, the `vectorSimilarity` query parser and the `knn_text_to_vector` query parser.

0 commit comments

Comments
 (0)