@@ -309,46 +309,100 @@ private float squareDistanceBody(float[] a, float[] b, int limit) {
   // We also support 128 bit vectors, going 32 bits at a time.
   // This is slower but still faster than not vectorizing at all.
 
+  private interface ByteVectorLoader {
+    int length();
+
+    ByteVector load(VectorSpecies<Byte> species, int index);
+
+    byte tail(int index);
+  }
+
+  private record ArrayLoader(byte[] arr) implements ByteVectorLoader {
+    @Override
+    public int length() {
+      return arr.length;
+    }
+
+    @Override
+    public ByteVector load(VectorSpecies<Byte> species, int index) {
+      assert index + species.length() <= length();
+      return ByteVector.fromArray(species, arr, index);
+    }
+
+    @Override
+    public byte tail(int index) {
+      assert index <= length();
+      return arr[index];
+    }
+  }
+
+  private record MemorySegmentLoader(MemorySegment segment) implements ByteVectorLoader {
+    @Override
+    public int length() {
+      return Math.toIntExact(segment.byteSize());
+    }
+
+    @Override
+    public ByteVector load(VectorSpecies<Byte> species, int index) {
+      assert index + species.length() <= length();
+      return ByteVector.fromMemorySegment(species, segment, index, LITTLE_ENDIAN);
+    }
+
+    @Override
+    public byte tail(int index) {
+      assert index <= length();
+      return segment.get(JAVA_BYTE, index);
+    }
+  }
+
   @Override
   public int dotProduct(byte[] a, byte[] b) {
-    return dotProduct(MemorySegment.ofArray(a), MemorySegment.ofArray(b));
+    return dotProductBody(new ArrayLoader(a), new ArrayLoader(b));
+  }
+
+  public static int dotProduct(byte[] a, MemorySegment b) {
+    return dotProductBody(new ArrayLoader(a), new MemorySegmentLoader(b));
   }
 
   public static int dotProduct(MemorySegment a, MemorySegment b) {
-    assert a.byteSize() == b.byteSize();
+    return dotProductBody(new MemorySegmentLoader(a), new MemorySegmentLoader(b));
+  }
+
+  private static int dotProductBody(ByteVectorLoader a, ByteVectorLoader b) {
+    assert a.length() == b.length();
     int i = 0;
     int res = 0;
 
     // only vectorize if we'll at least enter the loop a single time, and we have at least 128-bit
     // vectors (256-bit on intel to dodge performance landmines)
-    if (a.byteSize() >= 16 && PanamaVectorConstants.HAS_FAST_INTEGER_VECTORS) {
+    if (a.length() >= 16 && PanamaVectorConstants.HAS_FAST_INTEGER_VECTORS) {
       // compute vectorized dot product consistent with VPDPBUSD instruction
       if (VECTOR_BITSIZE >= 512) {
-        i += BYTE_SPECIES.loopBound(a.byteSize());
+        i += BYTE_SPECIES.loopBound(a.length());
         res += dotProductBody512(a, b, i);
       } else if (VECTOR_BITSIZE == 256) {
-        i += BYTE_SPECIES.loopBound(a.byteSize());
+        i += BYTE_SPECIES.loopBound(a.length());
         res += dotProductBody256(a, b, i);
       } else {
         // tricky: we don't have SPECIES_32, so we workaround with "overlapping read"
-        i += ByteVector.SPECIES_64.loopBound(a.byteSize() - ByteVector.SPECIES_64.length());
+        i += ByteVector.SPECIES_64.loopBound(a.length() - ByteVector.SPECIES_64.length());
         res += dotProductBody128(a, b, i);
       }
     }
 
     // scalar tail
-    for (; i < a.byteSize(); i++) {
-      res += b.get(JAVA_BYTE, i) * a.get(JAVA_BYTE, i);
+    for (; i < a.length(); i++) {
+      res += a.tail(i) * b.tail(i);
     }
     return res;
   }
 
   /** vectorized dot product body (512 bit vectors) */
-  private static int dotProductBody512(MemorySegment a, MemorySegment b, int limit) {
+  private static int dotProductBody512(ByteVectorLoader a, ByteVectorLoader b, int limit) {
     IntVector acc = IntVector.zero(INT_SPECIES);
     for (int i = 0; i < limit; i += BYTE_SPECIES.length()) {
-      ByteVector va8 = ByteVector.fromMemorySegment(BYTE_SPECIES, a, i, LITTLE_ENDIAN);
-      ByteVector vb8 = ByteVector.fromMemorySegment(BYTE_SPECIES, b, i, LITTLE_ENDIAN);
+      ByteVector va8 = a.load(BYTE_SPECIES, i);
+      ByteVector vb8 = b.load(BYTE_SPECIES, i);
 
       // 16-bit multiply: avoid AVX-512 heavy multiply on zmm
       Vector<Short> va16 = va8.convertShape(B2S, SHORT_SPECIES, 0);
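
Illustrative usage of the new ByteVectorLoader indirection (a sketch, not part of this change): it lets a single dotProductBody serve heap arrays, off-heap MemorySegments, and the mixed case. The snippet assumes access to the static overloads introduced above and wraps a heap array via MemorySegment.ofArray as a stand-in for an off-heap stored vector:

    byte[] query  = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
    byte[] stored = {16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1};
    MemorySegment segment = MemorySegment.ofArray(stored);        // stand-in for an off-heap vector
    int mixed = dotProduct(query, segment);                        // ArrayLoader + MemorySegmentLoader
    int both  = dotProduct(MemorySegment.ofArray(query), segment); // MemorySegmentLoader on both sides
    assert mixed == both; // every overload funnels into the same dotProductBody
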
@@ -364,11 +418,11 @@ private static int dotProductBody512(MemorySegment a, MemorySegment b, int limit
   }
 
   /** vectorized dot product body (256 bit vectors) */
-  private static int dotProductBody256(MemorySegment a, MemorySegment b, int limit) {
+  private static int dotProductBody256(ByteVectorLoader a, ByteVectorLoader b, int limit) {
     IntVector acc = IntVector.zero(IntVector.SPECIES_256);
     for (int i = 0; i < limit; i += ByteVector.SPECIES_64.length()) {
-      ByteVector va8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, a, i, LITTLE_ENDIAN);
-      ByteVector vb8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, b, i, LITTLE_ENDIAN);
+      ByteVector va8 = a.load(ByteVector.SPECIES_64, i);
+      ByteVector vb8 = b.load(ByteVector.SPECIES_64, i);
 
       // 32-bit multiply and add into accumulator
       Vector<Integer> va32 = va8.convertShape(B2I, IntVector.SPECIES_256, 0);
@@ -380,13 +434,13 @@ private static int dotProductBody256(MemorySegment a, MemorySegment b, int limit
   }
 
   /** vectorized dot product body (128 bit vectors) */
-  private static int dotProductBody128(MemorySegment a, MemorySegment b, int limit) {
+  private static int dotProductBody128(ByteVectorLoader a, ByteVectorLoader b, int limit) {
     IntVector acc = IntVector.zero(IntVector.SPECIES_128);
     // 4 bytes at a time (re-loading half the vector each time!)
     for (int i = 0; i < limit; i += ByteVector.SPECIES_64.length() >> 1) {
       // load 8 bytes
-      ByteVector va8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, a, i, LITTLE_ENDIAN);
-      ByteVector vb8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, b, i, LITTLE_ENDIAN);
+      ByteVector va8 = a.load(ByteVector.SPECIES_64, i);
+      ByteVector vb8 = b.load(ByteVector.SPECIES_64, i);
 
       // process first "half" only: 16-bit multiply
       Vector<Short> va16 = va8.convert(B2S, 0);
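
A worked example of the "overlapping read" bound that feeds this 128-bit body (a sketch using an assumed 22-byte input, not a value from the patch):

    int lanes = ByteVector.SPECIES_64.length();              // 8
    int bound = ByteVector.SPECIES_64.loopBound(22 - lanes); // loopBound(14) == 8

With limit == 8 the loop above runs at i = 0 and i = 4 (the step is lanes >> 1 == 4), so the widest 8-byte load touches bytes 4..11, safely inside the 22-byte vector, and the scalar tail covers indices 8..21.
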
@@ -578,28 +632,36 @@ private int int4DotProductBody128(byte[] a, byte[] b, int limit) {
 
   @Override
   public float cosine(byte[] a, byte[] b) {
-    return cosine(MemorySegment.ofArray(a), MemorySegment.ofArray(b));
+    return cosineBody(new ArrayLoader(a), new ArrayLoader(b));
   }
 
   public static float cosine(MemorySegment a, MemorySegment b) {
+    return cosineBody(new MemorySegmentLoader(a), new MemorySegmentLoader(b));
+  }
+
+  public static float cosine(byte[] a, MemorySegment b) {
+    return cosineBody(new ArrayLoader(a), new MemorySegmentLoader(b));
+  }
+
+  private static float cosineBody(ByteVectorLoader a, ByteVectorLoader b) {
     int i = 0;
     int sum = 0;
     int norm1 = 0;
     int norm2 = 0;
 
     // only vectorize if we'll at least enter the loop a single time, and we have at least 128-bit
     // vectors (256-bit on intel to dodge performance landmines)
-    if (a.byteSize() >= 16 && PanamaVectorConstants.HAS_FAST_INTEGER_VECTORS) {
+    if (a.length() >= 16 && PanamaVectorConstants.HAS_FAST_INTEGER_VECTORS) {
       final float[] ret;
       if (VECTOR_BITSIZE >= 512) {
-        i += BYTE_SPECIES.loopBound((int) a.byteSize());
+        i += BYTE_SPECIES.loopBound(a.length());
         ret = cosineBody512(a, b, i);
       } else if (VECTOR_BITSIZE == 256) {
-        i += BYTE_SPECIES.loopBound((int) a.byteSize());
+        i += BYTE_SPECIES.loopBound(a.length());
         ret = cosineBody256(a, b, i);
       } else {
         // tricky: we don't have SPECIES_32, so we workaround with "overlapping read"
-        i += ByteVector.SPECIES_64.loopBound(a.byteSize() - ByteVector.SPECIES_64.length());
+        i += ByteVector.SPECIES_64.loopBound(a.length() - ByteVector.SPECIES_64.length());
         ret = cosineBody128(a, b, i);
       }
       sum += ret[0];
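
Each vectorized cosine body returns its partial results as a float[] of {sum, norm1, norm2}; the lines that follow (and the method's return, which fall outside these hunks) presumably fold them into the usual cosine formula. A sketch of that final step, assuming the surrounding code is untouched by this patch:

    return (float) (sum / Math.sqrt((double) norm1 * (double) norm2));
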
@@ -608,9 +670,9 @@ public static float cosine(MemorySegment a, MemorySegment b) {
     }
 
     // scalar tail
-    for (; i < a.byteSize(); i++) {
-      byte elem1 = a.get(JAVA_BYTE, i);
-      byte elem2 = b.get(JAVA_BYTE, i);
+    for (; i < a.length(); i++) {
+      byte elem1 = a.tail(i);
+      byte elem2 = b.tail(i);
       sum += elem1 * elem2;
       norm1 += elem1 * elem1;
       norm2 += elem2 * elem2;
@@ -619,13 +681,13 @@ public static float cosine(MemorySegment a, MemorySegment b) {
   }
 
   /** vectorized cosine body (512 bit vectors) */
-  private static float[] cosineBody512(MemorySegment a, MemorySegment b, int limit) {
+  private static float[] cosineBody512(ByteVectorLoader a, ByteVectorLoader b, int limit) {
     IntVector accSum = IntVector.zero(INT_SPECIES);
     IntVector accNorm1 = IntVector.zero(INT_SPECIES);
     IntVector accNorm2 = IntVector.zero(INT_SPECIES);
     for (int i = 0; i < limit; i += BYTE_SPECIES.length()) {
-      ByteVector va8 = ByteVector.fromMemorySegment(BYTE_SPECIES, a, i, LITTLE_ENDIAN);
-      ByteVector vb8 = ByteVector.fromMemorySegment(BYTE_SPECIES, b, i, LITTLE_ENDIAN);
+      ByteVector va8 = a.load(BYTE_SPECIES, i);
+      ByteVector vb8 = b.load(BYTE_SPECIES, i);
 
       // 16-bit multiply: avoid AVX-512 heavy multiply on zmm
       Vector<Short> va16 = va8.convertShape(B2S, SHORT_SPECIES, 0);
@@ -649,13 +711,13 @@ private static float[] cosineBody512(MemorySegment a, MemorySegment b, int limit
   }
 
   /** vectorized cosine body (256 bit vectors) */
-  private static float[] cosineBody256(MemorySegment a, MemorySegment b, int limit) {
+  private static float[] cosineBody256(ByteVectorLoader a, ByteVectorLoader b, int limit) {
     IntVector accSum = IntVector.zero(IntVector.SPECIES_256);
     IntVector accNorm1 = IntVector.zero(IntVector.SPECIES_256);
     IntVector accNorm2 = IntVector.zero(IntVector.SPECIES_256);
     for (int i = 0; i < limit; i += ByteVector.SPECIES_64.length()) {
-      ByteVector va8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, a, i, LITTLE_ENDIAN);
-      ByteVector vb8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, b, i, LITTLE_ENDIAN);
+      ByteVector va8 = a.load(ByteVector.SPECIES_64, i);
+      ByteVector vb8 = b.load(ByteVector.SPECIES_64, i);
 
       // 16-bit multiply, and add into accumulators
       Vector<Integer> va32 = va8.convertShape(B2I, IntVector.SPECIES_256, 0);
@@ -674,13 +736,13 @@ private static float[] cosineBody256(MemorySegment a, MemorySegment b, int limit
   }
 
   /** vectorized cosine body (128 bit vectors) */
-  private static float[] cosineBody128(MemorySegment a, MemorySegment b, int limit) {
+  private static float[] cosineBody128(ByteVectorLoader a, ByteVectorLoader b, int limit) {
     IntVector accSum = IntVector.zero(IntVector.SPECIES_128);
     IntVector accNorm1 = IntVector.zero(IntVector.SPECIES_128);
     IntVector accNorm2 = IntVector.zero(IntVector.SPECIES_128);
     for (int i = 0; i < limit; i += ByteVector.SPECIES_64.length() >> 1) {
-      ByteVector va8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, a, i, LITTLE_ENDIAN);
-      ByteVector vb8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, b, i, LITTLE_ENDIAN);
+      ByteVector va8 = a.load(ByteVector.SPECIES_64, i);
+      ByteVector vb8 = b.load(ByteVector.SPECIES_64, i);
 
       // process first half only: 16-bit multiply
       Vector<Short> va16 = va8.convert(B2S, 0);
@@ -702,40 +764,48 @@ private static float[] cosineBody128(MemorySegment a, MemorySegment b, int limit
 
   @Override
   public int squareDistance(byte[] a, byte[] b) {
-    return squareDistance(MemorySegment.ofArray(a), MemorySegment.ofArray(b));
+    return squareDistanceBody(new ArrayLoader(a), new ArrayLoader(b));
   }
 
   public static int squareDistance(MemorySegment a, MemorySegment b) {
-    assert a.byteSize() == b.byteSize();
+    return squareDistanceBody(new MemorySegmentLoader(a), new MemorySegmentLoader(b));
+  }
+
+  public static int squareDistance(byte[] a, MemorySegment b) {
+    return squareDistanceBody(new ArrayLoader(a), new MemorySegmentLoader(b));
+  }
+
+  private static int squareDistanceBody(ByteVectorLoader a, ByteVectorLoader b) {
+    assert a.length() == b.length();
     int i = 0;
     int res = 0;
 
     // only vectorize if we'll at least enter the loop a single time, and we have at least 128-bit
     // vectors (256-bit on intel to dodge performance landmines)
-    if (a.byteSize() >= 16 && PanamaVectorConstants.HAS_FAST_INTEGER_VECTORS) {
+    if (a.length() >= 16 && PanamaVectorConstants.HAS_FAST_INTEGER_VECTORS) {
       if (VECTOR_BITSIZE >= 256) {
-        i += BYTE_SPECIES.loopBound((int) a.byteSize());
+        i += BYTE_SPECIES.loopBound(a.length());
         res += squareDistanceBody256(a, b, i);
       } else {
-        i += ByteVector.SPECIES_64.loopBound((int) a.byteSize());
+        i += ByteVector.SPECIES_64.loopBound(a.length());
         res += squareDistanceBody128(a, b, i);
       }
     }
 
     // scalar tail
-    for (; i < a.byteSize(); i++) {
-      int diff = a.get(JAVA_BYTE, i) - b.get(JAVA_BYTE, i);
+    for (; i < a.length(); i++) {
+      int diff = a.tail(i) - b.tail(i);
       res += diff * diff;
     }
     return res;
   }
 
   /** vectorized square distance body (256+ bit vectors) */
-  private static int squareDistanceBody256(MemorySegment a, MemorySegment b, int limit) {
+  private static int squareDistanceBody256(ByteVectorLoader a, ByteVectorLoader b, int limit) {
     IntVector acc = IntVector.zero(INT_SPECIES);
     for (int i = 0; i < limit; i += BYTE_SPECIES.length()) {
-      ByteVector va8 = ByteVector.fromMemorySegment(BYTE_SPECIES, a, i, LITTLE_ENDIAN);
-      ByteVector vb8 = ByteVector.fromMemorySegment(BYTE_SPECIES, b, i, LITTLE_ENDIAN);
+      ByteVector va8 = a.load(BYTE_SPECIES, i);
+      ByteVector vb8 = b.load(BYTE_SPECIES, i);
 
       // 32-bit sub, multiply, and add into accumulators
       // TODO: uses AVX-512 heavy multiply on zmm, should we just use 256-bit vectors on AVX-512?
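
Because the byte[]/byte[], MemorySegment/MemorySegment, and mixed overloads of squareDistance all funnel into squareDistanceBody, they should agree exactly, including on lengths that exercise the scalar tail. A hypothetical consistency check (not part of this change; randomBytes is an assumed test helper returning a random byte[] of the given length):

    byte[] a = randomBytes(19); // 19 is not a lane multiple, so the tail() path runs too
    byte[] b = randomBytes(19);
    int viaSegments = squareDistance(MemorySegment.ofArray(a), MemorySegment.ofArray(b));
    int viaMixed    = squareDistance(a, MemorySegment.ofArray(b));
    assert viaSegments == viaMixed;
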
@@ -749,14 +819,14 @@ private static int squareDistanceBody256(MemorySegment a, MemorySegment b, int l
   }
 
   /** vectorized square distance body (128 bit vectors) */
-  private static int squareDistanceBody128(MemorySegment a, MemorySegment b, int limit) {
+  private static int squareDistanceBody128(ByteVectorLoader a, ByteVectorLoader b, int limit) {
     // 128-bit implementation, which must "split up" vectors due to widening conversions
     // it doesn't help to do the overlapping read trick, due to 32-bit multiply in the formula
     IntVector acc1 = IntVector.zero(IntVector.SPECIES_128);
     IntVector acc2 = IntVector.zero(IntVector.SPECIES_128);
     for (int i = 0; i < limit; i += ByteVector.SPECIES_64.length()) {
-      ByteVector va8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, a, i, LITTLE_ENDIAN);
-      ByteVector vb8 = ByteVector.fromMemorySegment(ByteVector.SPECIES_64, b, i, LITTLE_ENDIAN);
+      ByteVector va8 = a.load(ByteVector.SPECIES_64, i);
+      ByteVector vb8 = b.load(ByteVector.SPECIES_64, i);
 
       // 16-bit sub
       Vector<Short> va16 = va8.convertShape(B2S, ShortVector.SPECIES_128, 0);