apache
diff --git a/‎lucene/CHANGES.txt‎
Lines changed: 4 additions & 2 deletions b/‎lucene/CHANGES.txt‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎lucene/backward-codecs/src/java/module-info.java‎
Lines changed: 4 additions & 1 deletion b/‎lucene/backward-codecs/src/java/module-info.java‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎lucene/core/src/java/org/apache/lucene/codecs/lucene102/BinarizedByteVectorValues.java‎ renamed to ‎lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene102/BinarizedByteVectorValues.java‎
Lines changed: 5 additions & 5 deletions b/‎lucene/core/src/java/org/apache/lucene/codecs/lucene102/BinarizedByteVectorValues.java‎ renamed to ‎lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene102/BinarizedByteVectorValues.java‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryFlatVectorsScorer.java‎
Lines changed: 141 additions & 0 deletions b/‎lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryFlatVectorsScorer.java‎
Lines changed: 141 additions & 0 deletions
diff --git a/‎lucene/core/src/java/org/apache/lucene/codecs/lucene102/Lucene102BinaryQuantizedVectorsFormat.java‎ renamed to ‎lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryQuantizedVectorsFormat.java‎
Lines changed: 8 additions & 8 deletions b/‎lucene/core/src/java/org/apache/lucene/codecs/lucene102/Lucene102BinaryQuantizedVectorsFormat.java‎ renamed to ‎lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryQuantizedVectorsFormat.java‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎lucene/core/src/java/org/apache/lucene/codecs/lucene102/Lucene102BinaryQuantizedVectorsReader.java‎ renamed to ‎lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryQuantizedVectorsReader.java‎
Lines changed: 14 additions & 6 deletions b/‎lucene/core/src/java/org/apache/lucene/codecs/lucene102/Lucene102BinaryQuantizedVectorsReader.java‎ renamed to ‎lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene102/Lucene102BinaryQuantizedVectorsReader.java‎
Lines changed: 14 additions & 6 deletions
diff --git a/‎lucene/core/src/java/org/apache/lucene/codecs/lucene102/Lucene102HnswBinaryQuantizedVectorsFormat.java‎ renamed to ‎lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene102/Lucene102HnswBinaryQuantizedVectorsFormat.java‎
Lines changed: 11 additions & 26 deletions b/‎lucene/core/src/java/org/apache/lucene/codecs/lucene102/Lucene102HnswBinaryQuantizedVectorsFormat.java‎ renamed to ‎lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene102/Lucene102HnswBinaryQuantizedVectorsFormat.java‎
Lines changed: 11 additions & 26 deletions
@@ -35,9 +35,11 @@ New Features
   `Lucene104HnswScalarQuantizedVectorsFormat` replaces the now legacy `Lucene99HnswScalarQuantizedVectorsFormat`
    (Trevor McCulloch)
 
- * GITHUB#15271: Extend `Lucene104ScalarQuantizedVectorsFormat` and `Lucene104HnswScalarQuantizedVectorsFormat` to
+ * GITHUB#15271, GITHUB#15353: Extend `Lucene104ScalarQuantizedVectorsFormat` and `Lucene104HnswScalarQuantizedVectorsFormat` to
    allow asymmetric quantization. The initially supported bits are single bit with 4 bit queries. This is a replacement
-   for the now legacy `Lucene102HnswBinaryQuantizedVectorsFormat` and `Lucene102BinaryQuantizedVectorsFormat`.
+   for the now legacy `Lucene102HnswBinaryQuantizedVectorsFormat` and `Lucene102BinaryQuantizedVectorsFormat`. To use
+   the new format for asymmetric binary quantization, use
+   `Lucene104HnswScalarQuantizedVectorsFormat(ScalarEncoding.SINGLE_BIT_QUERY_NIBBLE, int, int)`.
     (Ben Trent)
 
 Improvements
 
@@ -40,6 +40,7 @@
   exports org.apache.lucene.backward_codecs.lucene912;
   exports org.apache.lucene.backward_codecs.lucene100;
   exports org.apache.lucene.backward_codecs.lucene101;
+  exports org.apache.lucene.backward_codecs.lucene102;
   exports org.apache.lucene.backward_codecs.lucene103;
   exports org.apache.lucene.backward_codecs.packed;
   exports org.apache.lucene.backward_codecs.store;
@@ -61,7 +62,9 @@
       org.apache.lucene.backward_codecs.lucene94.Lucene94HnswVectorsFormat,
       org.apache.lucene.backward_codecs.lucene95.Lucene95HnswVectorsFormat,
       org.apache.lucene.backward_codecs.lucene99.Lucene99ScalarQuantizedVectorsFormat,
-      org.apache.lucene.backward_codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
+      org.apache.lucene.backward_codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat,
+      org.apache.lucene.backward_codecs.lucene102.Lucene102BinaryQuantizedVectorsFormat,
+      org.apache.lucene.backward_codecs.lucene102.Lucene102HnswBinaryQuantizedVectorsFormat;
   provides org.apache.lucene.codecs.Codec with
       org.apache.lucene.backward_codecs.lucene80.Lucene80Codec,
       org.apache.lucene.backward_codecs.lucene84.Lucene84Codec,
 
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene102;
+package org.apache.lucene.backward_codecs.lucene102;
 
 import static org.apache.lucene.util.quantization.OptimizedScalarQuantizer.discretize;
 
@@ -51,15 +51,15 @@ abstract class BinarizedByteVectorValues extends ByteVectorValues {
    * @return the corrective terms
    * @throws IOException if an I/O error occurs
    */
-  public abstract OptimizedScalarQuantizer.QuantizationResult getCorrectiveTerms(int vectorOrd)
+  abstract OptimizedScalarQuantizer.QuantizationResult getCorrectiveTerms(int vectorOrd)
       throws IOException;
 
   /**
    * @return the quantizer used to quantize the vectors
    */
-  public abstract OptimizedScalarQuantizer getQuantizer();
+  abstract OptimizedScalarQuantizer getQuantizer();
 
-  public abstract float[] getCentroid() throws IOException;
+  abstract float[] getCentroid() throws IOException;
 
   int discretizedDimensions() {
     return discretize(dimension(), 64);
@@ -71,7 +71,7 @@ int discretizedDimensions() {
    * @param query the query vector
    * @return a {@link VectorScorer} instance or null
    */
-  public abstract VectorScorer scorer(float[] query) throws IOException;
+  abstract VectorScorer scorer(float[] query) throws IOException;
 
   @Override
   public abstract BinarizedByteVectorValues copy() throws IOException;
 
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.backward_codecs.lucene102;
+
+import static org.apache.lucene.backward_codecs.lucene102.Lucene102BinaryQuantizedVectorsFormat.QUERY_BITS;
+import static org.apache.lucene.index.VectorSimilarityFunction.COSINE;
+import static org.apache.lucene.index.VectorSimilarityFunction.EUCLIDEAN;
+import static org.apache.lucene.index.VectorSimilarityFunction.MAXIMUM_INNER_PRODUCT;
+import static org.apache.lucene.util.quantization.OptimizedScalarQuantizer.transposeHalfByte;
+
+import java.io.IOException;
+import org.apache.lucene.codecs.hnsw.FlatVectorsScorer;
+import org.apache.lucene.index.KnnVectorValues;
+import org.apache.lucene.index.VectorSimilarityFunction;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.VectorUtil;
+import org.apache.lucene.util.hnsw.RandomVectorScorer;
+import org.apache.lucene.util.hnsw.RandomVectorScorerSupplier;
+import org.apache.lucene.util.quantization.OptimizedScalarQuantizer;
+import org.apache.lucene.util.quantization.OptimizedScalarQuantizer.QuantizationResult;
+
+/** Vector scorer over binarized vector values; creating scorer suppliers is not supported. */
+public class Lucene102BinaryFlatVectorsScorer implements FlatVectorsScorer {
+  /** The delegate scorer used whenever the vector values are not binarized */
+  protected final FlatVectorsScorer nonQuantizedDelegate;
+
+  /** Scaling factor for 4-bit quantization: 1 / (2^4 - 1), i.e. 1/15 */
+  protected static final float FOUR_BIT_SCALE = 1f / ((1 << 4) - 1);
+
+  /**
+   * Construct a new scorer
+   *
+   * @param nonQuantizedDelegate the delegate scorer for non-quantized vectors
+   */
+  public Lucene102BinaryFlatVectorsScorer(FlatVectorsScorer nonQuantizedDelegate) {
+    this.nonQuantizedDelegate = nonQuantizedDelegate;
+  }
+
+  @Override
+  public RandomVectorScorerSupplier getRandomVectorScorerSupplier(
+      VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues)
+      throws IOException {
+    throw new UnsupportedOperationException("Old codecs may only be used for reading");
+  }
+
+  @Override
+  public RandomVectorScorer getRandomVectorScorer(
+      VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, float[] target)
+      throws IOException {
+    if (vectorValues instanceof BinarizedByteVectorValues binarizedVectors) {
+      OptimizedScalarQuantizer quantizer = binarizedVectors.getQuantizer();
+      float[] centroid = binarizedVectors.getCentroid();
+      // Copy first so that normalizing and quantizing below never modify the caller's array
+      float[] copy = ArrayUtil.copyOfSubArray(target, 0, target.length);
+      if (similarityFunction == COSINE) {
+        VectorUtil.l2normalize(copy);
+      }
+      target = copy;
+      byte[] initial = new byte[target.length];
+      byte[] quantized = new byte[QUERY_BITS * binarizedVectors.discretizedDimensions() / 8];
+      QuantizationResult queryCorrections =
+          quantizer.scalarQuantize(target, initial, (byte) 4, centroid);
+      transposeHalfByte(initial, quantized);
+      return new RandomVectorScorer.AbstractRandomVectorScorer(binarizedVectors) {
+        @Override
+        public float score(int node) throws IOException {
+          return quantizedScore(
+              quantized, queryCorrections, binarizedVectors, node, similarityFunction);
+        }
+      };
+    }
+    return nonQuantizedDelegate.getRandomVectorScorer(similarityFunction, vectorValues, target);
+  }
+
+  @Override
+  public RandomVectorScorer getRandomVectorScorer(
+      VectorSimilarityFunction similarityFunction, KnnVectorValues vectorValues, byte[] target)
+      throws IOException {
+    return nonQuantizedDelegate.getRandomVectorScorer(similarityFunction, vectorValues, target);
+  }
+
+  @Override
+  public String toString() {
+    return "Lucene102BinaryFlatVectorsScorer(nonQuantizedDelegate=" + nonQuantizedDelegate + ")";
+  }
+
+  static float quantizedScore(
+      byte[] quantizedQuery,
+      QuantizationResult queryCorrections,
+      BinarizedByteVectorValues targetVectors,
+      int targetOrd,
+      VectorSimilarityFunction similarityFunction)
+      throws IOException {
+    byte[] binaryCode = targetVectors.vectorValue(targetOrd);
+    float qcDist = VectorUtil.int4BitDotProduct(quantizedQuery, binaryCode);
+    QuantizationResult indexCorrections = targetVectors.getCorrectiveTerms(targetOrd);
+    float x1 = indexCorrections.quantizedComponentSum();
+    float ax = indexCorrections.lowerInterval();
+    // Index vectors are assumed to be single-bit, so `lx` needs no scaling (unlike `ly` below)
+    float lx = indexCorrections.upperInterval() - ax;
+    float ay = queryCorrections.lowerInterval();
+    float ly = (queryCorrections.upperInterval() - ay) * FOUR_BIT_SCALE;
+    float y1 = queryCorrections.quantizedComponentSum();
+    float score =
+        ax * ay * targetVectors.dimension() + ay * lx * x1 + ax * ly * y1 + lx * ly * qcDist;
+    // For euclidean, we need to invert the score and apply the additional correction, which is
+    // assumed to be the squared l2norm of the centroid-centered vectors.
+    if (similarityFunction == EUCLIDEAN) {
+      score =
+          queryCorrections.additionalCorrection()
+              + indexCorrections.additionalCorrection()
+              - 2 * score;
+      return Math.max(1 / (1f + score), 0);
+    } else {
+      // For cosine and max inner product, we need to apply the additional correction, which is
+      // assumed to be the non-centered dot-product between the vector and the centroid
+      score +=
+          queryCorrections.additionalCorrection()
+              + indexCorrections.additionalCorrection()
+              - targetVectors.getCentroidDP();
+      if (similarityFunction == MAXIMUM_INNER_PRODUCT) {
+        return VectorUtil.scaleMaxInnerProductScore(score);
+      }
+      return Math.max((1f + score) / 2f, 0);
+    }
+  }
+}
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene102;
+package org.apache.lucene.backward_codecs.lucene102;
 
 import java.io.IOException;
 import org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil;
@@ -89,11 +89,11 @@
  */
 public class Lucene102BinaryQuantizedVectorsFormat extends FlatVectorsFormat {
 
-  public static final byte QUERY_BITS = 4;
-  public static final byte INDEX_BITS = 1;
+  static final byte QUERY_BITS = 4;
+  static final byte INDEX_BITS = 1;
 
-  public static final String BINARIZED_VECTOR_COMPONENT = "BVEC";
-  public static final String NAME = "Lucene102BinaryQuantizedVectorsFormat";
+  static final String BINARIZED_VECTOR_COMPONENT = "BVEC";
+  static final String NAME = "Lucene102BinaryQuantizedVectorsFormat";
 
   static final int VERSION_START = 0;
   static final int VERSION_CURRENT = VERSION_START;
@@ -103,7 +103,8 @@ public class Lucene102BinaryQuantizedVectorsFormat extends FlatVectorsFormat {
   static final String VECTOR_DATA_EXTENSION = "veb";
   static final int DIRECT_MONOTONIC_BLOCK_SHIFT = 16;
 
-  private static final FlatVectorsFormat rawVectorFormat =
+  /** The raw (unquantized) vector format used to read the original vectors. */
+  protected static final FlatVectorsFormat rawVectorFormat =
       new Lucene99FlatVectorsFormat(FlatVectorScorerUtil.getLucene99FlatVectorsScorer());
 
   private static final Lucene102BinaryFlatVectorsScorer scorer =
@@ -116,8 +117,7 @@ public Lucene102BinaryQuantizedVectorsFormat() {
 
   @Override
   public FlatVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
-    return new Lucene102BinaryQuantizedVectorsWriter(
-        scorer, rawVectorFormat.fieldsWriter(state), state);
+    throw new UnsupportedOperationException("Old codecs may only be used for reading");
   }
 
   @Override
 
@@ -14,9 +14,9 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene102;
+package org.apache.lucene.backward_codecs.lucene102;
 
-import static org.apache.lucene.codecs.lucene102.Lucene102BinaryQuantizedVectorsFormat.VECTOR_DATA_EXTENSION;
+import static org.apache.lucene.backward_codecs.lucene102.Lucene102BinaryQuantizedVectorsFormat.VECTOR_DATA_EXTENSION;
 import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readSimilarityFunction;
 import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader.readVectorEncoding;
 import static org.apache.lucene.util.quantization.OptimizedScalarQuantizer.discretize;
@@ -55,7 +55,7 @@
 import org.apache.lucene.util.quantization.OptimizedScalarQuantizer;
 
 /** Reader for binary quantized vectors in the Lucene 10.2 format. */
-class Lucene102BinaryQuantizedVectorsReader extends FlatVectorsReader {
+public class Lucene102BinaryQuantizedVectorsReader extends FlatVectorsReader {
 
   private static final long SHALLOW_SIZE =
       RamUsageEstimator.shallowSizeOfInstance(Lucene102BinaryQuantizedVectorsReader.class);
@@ -65,7 +65,15 @@ class Lucene102BinaryQuantizedVectorsReader extends FlatVectorsReader {
   private final FlatVectorsReader rawVectorsReader;
   private final Lucene102BinaryFlatVectorsScorer vectorScorer;
 
-  Lucene102BinaryQuantizedVectorsReader(
+  /**
+   * Creates a new reader for binary quantized vectors.
+   *
+   * @param state the segment read state
+   * @param rawVectorsReader the reader for the raw (non-quantized) vectors
+   * @param vectorsScorer the scorer for binary quantized vectors
+   * @throws IOException if an I/O error occurs
+   */
+  public Lucene102BinaryQuantizedVectorsReader(
       SegmentReadState state,
       FlatVectorsReader rawVectorsReader,
       Lucene102BinaryFlatVectorsScorer vectorsScorer)
@@ -101,7 +109,7 @@ class Lucene102BinaryQuantizedVectorsReader extends FlatVectorsReader {
           openDataInput(
               state,
               versionMeta,
-              Lucene102BinaryQuantizedVectorsFormat.VECTOR_DATA_EXTENSION,
+              VECTOR_DATA_EXTENSION,
               Lucene102BinaryQuantizedVectorsFormat.VECTOR_DATA_CODEC_NAME,
               // Quantized vectors are accessed randomly from their node ID stored in the HNSW
               // graph.
@@ -277,7 +285,7 @@ public Map<String, Long> getOffHeapByteSize(FieldInfo fieldInfo) {
     return KnnVectorsReader.mergeOffHeapByteSizeMaps(raw, quant);
   }
 
-  public float[] getCentroid(String field) {
+  float[] getCentroid(String field) {
     FieldEntry fieldEntry = fields.get(field);
     if (fieldEntry != null) {
       return fieldEntry.centroid;
 
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-package org.apache.lucene.codecs.lucene102;
+package org.apache.lucene.backward_codecs.lucene102;
 
 import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_BEAM_WIDTH;
 import static org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat.DEFAULT_MAX_CONN;
@@ -30,7 +30,6 @@
 import org.apache.lucene.codecs.hnsw.FlatVectorsFormat;
 import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
 import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader;
-import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsWriter;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.search.TaskExecutor;
@@ -42,43 +41,36 @@
  */
 public class Lucene102HnswBinaryQuantizedVectorsFormat extends KnnVectorsFormat {
 
-  public static final String NAME = "Lucene102HnswBinaryQuantizedVectorsFormat";
+  static final String NAME = "Lucene102HnswBinaryQuantizedVectorsFormat";
 
   /**
    * Controls how many of the nearest neighbor candidates are connected to the new node. Defaults to
    * {@link Lucene99HnswVectorsFormat#DEFAULT_MAX_CONN}. See {@link HnswGraph} for more details.
    */
-  private final int maxConn;
+  protected final int maxConn;
 
   /**
    * The number of candidate neighbors to track while searching the graph for each newly inserted
    * node. Defaults to {@link Lucene99HnswVectorsFormat#DEFAULT_BEAM_WIDTH}. See {@link HnswGraph}
    * for details.
    */
-  private final int beamWidth;
+  protected final int beamWidth;
 
   /** The format for storing, reading, merging vectors on disk */
-  private static final FlatVectorsFormat flatVectorsFormat =
+  protected static final FlatVectorsFormat flatVectorsFormat =
       new Lucene102BinaryQuantizedVectorsFormat();
 
-  private final int numMergeWorkers;
-  private final TaskExecutor mergeExec;
+  /** Number of workers (threads) that will be used when doing merge. */
+  protected final int numMergeWorkers;
+
+  /** The {@link TaskExecutor} that will be used to do merge. */
+  protected final TaskExecutor mergeExec;
 
   /** Constructs a format using default graph construction parameters */
   public Lucene102HnswBinaryQuantizedVectorsFormat() {
     this(DEFAULT_MAX_CONN, DEFAULT_BEAM_WIDTH, DEFAULT_NUM_MERGE_WORKER, null);
   }
 
-  /**
-   * Constructs a format using the given graph construction parameters.
-   *
-   * @param maxConn the maximum number of connections to a node in the HNSW graph
-   * @param beamWidth the size of the queue maintained during graph construction.
-   */
-  public Lucene102HnswBinaryQuantizedVectorsFormat(int maxConn, int beamWidth) {
-    this(maxConn, beamWidth, DEFAULT_NUM_MERGE_WORKER, null);
-  }
-
   /**
    * Constructs a format using the given graph construction parameters and scalar quantization.
    *
@@ -122,14 +114,7 @@ public Lucene102HnswBinaryQuantizedVectorsFormat(
 
   @Override
   public KnnVectorsWriter fieldsWriter(SegmentWriteState state) throws IOException {
-    return new Lucene99HnswVectorsWriter(
-        state,
-        maxConn,
-        beamWidth,
-        flatVectorsFormat.fieldsWriter(state),
-        numMergeWorkers,
-        mergeExec,
-        0);
+    throw new UnsupportedOperationException("Old codecs may only be used for reading");
   }
 
   @Override