Vectorize filterCompetitiveHits (#14896)

HUSTERGS · gesong.samuel · jpountz · web-flow · commit 251e7df0f584 · 2025-07-17T09:08:10.000+02:00
Co-authored-by: gesong.samuel <gesong.samuel@bytedance.com>
Co-authored-by: Adrien Grand <jpountz@gmail.com>
diff --git a/gradle/validation/forbidden-apis/non-standard/incubator-vector.txt b/gradle/validation/forbidden-apis/non-standard/incubator-vector.txt
@@ -7,7 +7,7 @@ jdk.incubator.vector.FloatVector#fma(**)
 jdk.incubator.vector.DoubleVector#fma(**)
 jdk.incubator.vector.VectorOperators#FMA
 
-@defaultMessage Potentially slow on some CPUs, please check the CPU has feature: Unsupported on NEON
+@defaultMessage Potentially slow on some CPUs, please check Constants.HAS_FAST_COMPRESS_MASK_CAST: needs SVE (aarch64) or AVX2 (x86-64) support
 jdk.incubator.vector.ByteVector#compress(**)
 jdk.incubator.vector.IntVector#compress(**)
 jdk.incubator.vector.ShortVector#compress(**)
@@ -18,3 +18,4 @@ jdk.incubator.vector.IntVector#expand(**)
 jdk.incubator.vector.ShortVector#expand(**)
 jdk.incubator.vector.LongVector#expand(**)
 jdk.incubator.vector.VectorOperators#EXPAND_BITS
+jdk.incubator.vector.VectorMask#cast(**)
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -212,6 +212,8 @@ Optimizations
 
 * GITHUB#14936: Don't do the filtered knn path when the provided filter is a MatchAllDocsQuery. (Ben Trent)
 
+* GITHUB#14896: Vectorize filterCompetitiveHits (Ge Song, Adrien Grand)
+
 Changes in Runtime Behavior
 ---------------------
 * GITHUB#14823: Decrease TieredMergePolicy's default number of segments per
diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/CompetitiveBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/CompetitiveBenchmark.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.benchmark.jmh;
+
+import java.util.Arrays;
+import java.util.SplittableRandom;
+import java.util.concurrent.TimeUnit;
+import java.util.function.IntSupplier;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.VectorUtil;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+/** JMH benchmark comparing ways to filter (doc, score) pairs in place by a minimum score. */
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.MILLISECONDS)
+@State(Scope.Benchmark)
+@Warmup(iterations = 3, time = 1)
+@Measurement(iterations = 5, time = 1)
+@Fork(
+    value = 1,
+    jvmArgsAppend = {
+      "-Xmx1g",
+      "-Xms1g",
+      "-XX:+AlwaysPreTouch",
+      "--add-modules",
+      "jdk.incubator.vector"
+    })
+public class CompetitiveBenchmark {
+  private final SplittableRandom R = new SplittableRandom(0);
+
+  @Param("128")
+  int size;
+
+  double[] scores;
+  int[] docs;
+
+  // Scores generated by nextDouble() lie in the range [0, 1), so this parameter can be tuned to
+  // see how performance changes depending on how selective the filter is.
+  @Param({"0", "0.2", "0.4", "0.5", "0.8"})
+  double minScoreInclusive;
+
+  @Setup(Level.Trial)
+  public void setUpTrial() {
+    scores = new double[size];
+    docs = new int[size];
+  }
+
+  @Setup(Level.Invocation) // the benchmarks filter in place, so refill before every call
+  public void setUpInvocation() {
+    for (int i = 0; i < size; i++) {
+      docs[i] = R.nextInt(Integer.MAX_VALUE);
+      scores[i] = R.nextDouble();
+    }
+  }
+
+  @Benchmark
+  public int baseline() {
+    int newSize = 0;
+    for (int i = 0; i < size; ++i) {
+      if (scores[i] >= minScoreInclusive) {
+        docs[newSize] = docs[i];
+        scores[newSize] = scores[i];
+        newSize++;
+      }
+    }
+    return newSize;
+  }
+
+  @Benchmark
+  public int branchlessCandidate() {
+    int newSize = 0;
+    for (int i = 0; i < size; ++i) {
+      int inc = scores[i] >= minScoreInclusive ? 1 : 0;
+      docs[newSize] = docs[i]; // always stored; only retained if newSize advances below
+      scores[newSize] = scores[i];
+      newSize += inc;
+    }
+    return newSize;
+  }
+
+  // This is an attempt to get the update of newSize compiled to a cmov (conditional move),
+  // see https://github.com/apache/lucene/pull/14906
+  @Benchmark
+  public int branchlessCandidateCmov() {
+    int newSize = 0;
+    for (int i = 0; i < size; ++i) {
+      int doc = docs[i];
+      double score = scores[i];
+      docs[newSize] = doc;
+      scores[newSize] = score;
+      if (score >= minScoreInclusive) {
+        newSize++;
+      }
+    }
+    return newSize;
+  }
+
+  @Benchmark
+  public int vectorizedCandidate() {
+    return VectorUtil.filterByScore(docs, scores, minScoreInclusive, size);
+  }
+
+  /** Sanity check: every candidate must keep the same docs and scores as the baseline. */
+  public static void main(String[] args) {
+    CompetitiveBenchmark baseline = new CompetitiveBenchmark();
+    baseline.size = 128;
+    baseline.minScoreInclusive = 0.5; // the default (0) would keep every hit, testing nothing
+    baseline.setUpTrial();
+    baseline.setUpInvocation();
+    int[] inputDocs = baseline.docs.clone();
+    double[] inputScores = baseline.scores.clone();
+    int baselineSize = baseline.baseline();
+    CompetitiveBenchmark candidate = new CompetitiveBenchmark();
+    candidate.size = baseline.size;
+    candidate.minScoreInclusive = baseline.minScoreInclusive;
+
+    for (IntSupplier s :
+        new IntSupplier[] {
+          candidate::branchlessCandidate,
+          candidate::vectorizedCandidate,
+          candidate::branchlessCandidateCmov
+        }) {
+      candidate.docs = inputDocs.clone(); // filtering is in place: restart from the raw input
+      candidate.scores = inputScores.clone();
+      int candidateSize = s.getAsInt();
+      if (baselineSize != candidateSize) {
+        throw new IllegalArgumentException("incorrect size");
+      }
+      if (Arrays.equals(baseline.docs, 0, baselineSize, candidate.docs, 0, candidateSize)
+          == false) {
+        throw new IllegalArgumentException(
+            "incorrect docs,"
+                + "\nbaseline: "
+                + Arrays.toString(ArrayUtil.copyOfSubArray(baseline.docs, 0, baselineSize))
+                + "\ncandidate: "
+                + Arrays.toString(ArrayUtil.copyOfSubArray(candidate.docs, 0, candidateSize)));
+      }
+      if (Arrays.equals(baseline.scores, 0, baselineSize, candidate.scores, 0, candidateSize)
+          == false) {
+        throw new IllegalArgumentException("incorrect scores");
+      }
+    }
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java
@@ -309,4 +309,20 @@ private float quantizeFloat(float v, byte[] dest, int destIndex) {
       return minQuantile * (v - minQuantile / 2.0F) + (dx - dxq) * dxq;
     }
   }
+
+  @Override
+  public int filterByScore(
+      int[] docBuffer, double[] scoreBuffer, double minScoreInclusive, int upTo) {
+    int newSize = 0; // next write slot, and the number of pairs kept so far
+    for (int i = 0; i < upTo; ++i) {
+      int doc = docBuffer[i];
+      double score = scoreBuffer[i];
+      docBuffer[newSize] = doc; // stored unconditionally; kept only if newSize advances below
+      scoreBuffer[newSize] = score;
+      if (score >= minScoreInclusive) { // false for NaN, so NaN scores are dropped
+        newSize++;
+      }
+    }
+    return newSize;
+  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java
@@ -100,4 +100,18 @@ float recalculateScalarQuantizationOffset(
       float alpha,
       float minQuantile,
       float maxQuantile);
+
+  /**
+   * Filters {@code docBuffer} and {@code scoreBuffer} in place. Entries at the same index form a
+   * pair; pairs whose score is not greater than or equal to {@code minScoreInclusive} are removed
+   * and the remaining pairs are moved to the front of both arrays.
+   *
+   * @param docBuffer buffer of docs (or of any other values paired with {@code scoreBuffer})
+   * @param scoreBuffer buffer of scores to compare against {@code minScoreInclusive}
+   * @param minScoreInclusive minimum score a pair needs in order to be kept
+   * @param upTo exclusive end index: only pairs at indices below {@code upTo} are considered
+   * @return the number of pairs kept; they are stored at indices {@code 0} (inclusive) up to the
+   *     returned value (exclusive)
+   */
+  int filterByScore(int[] docBuffer, double[] scoreBuffer, double minScoreInclusive, int upTo);
 }
diff --git a/lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java b/lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java
@@ -26,6 +26,7 @@
 import org.apache.lucene.util.FixedBitSet;
 import org.apache.lucene.util.MathUtil;
 import org.apache.lucene.util.PriorityQueue;
+import org.apache.lucene.util.VectorUtil;
 
 /** Util class for Scorer related methods */
 class ScorerUtil {
@@ -155,17 +156,8 @@ static void filterCompetitiveHits(
       return;
     }
 
-    int newSize = 0;
-    for (int i = 0; i < buffer.size; ++i) {
-      int doc = buffer.docs[i];
-      double score = buffer.scores[i];
-      buffer.docs[newSize] = doc;
-      buffer.scores[newSize] = score;
-      if (score >= minRequiredScore) {
-        newSize += 1;
-      }
-    }
-    buffer.size = newSize;
+    buffer.size =
+        VectorUtil.filterByScore(buffer.docs, buffer.scores, minRequiredScore, buffer.size);
   }
 
   /**
diff --git a/lucene/core/src/java/org/apache/lucene/util/Constants.java b/lucene/core/src/java/org/apache/lucene/util/Constants.java
@@ -94,12 +94,23 @@ private static boolean is64Bit() {
   private static final boolean HAS_SSE4A =
       HotspotVMOptions.get("UseXmmI2F").map(Boolean::valueOf).orElse(false);
 
+  /** true iff the JVM's {@code UseAVX} level is at least 2 (AVX2 or newer). */
+  private static final boolean HAS_AVX2 =
+      HotspotVMOptions.get("UseAVX").map(Integer::valueOf).orElse(0) >= 2;
+
+  /** true iff the JVM's {@code UseSVE} level is at least 1 (ARM SVE available). */
+  private static final boolean HAS_SVE =
+      HotspotVMOptions.get("UseSVE").map(Integer::valueOf).orElse(0) >= 1;
+
   /** true iff we know VFMA has faster throughput than separate vmul/vadd. */
   public static final boolean HAS_FAST_VECTOR_FMA = hasFastVectorFMA();
 
   /** true iff we know FMA has faster throughput than separate mul/add. */
   public static final boolean HAS_FAST_SCALAR_FMA = hasFastScalarFMA();
 
+  /** true iff we know vector {@code compress} and mask {@code cast} have fast throughput. */
+  public static final boolean HAS_FAST_COMPRESS_MASK_CAST = hasFastCompressMaskCast();
+
   private static boolean hasFastVectorFMA() {
     if (HAS_FMA) {
       String value = getSysProp("lucene.useVectorFMA", "auto");
@@ -152,6 +163,17 @@ private static boolean hasFastScalarFMA() {
     return false;
   }
 
+  private static boolean hasFastCompressMaskCast() {
+    if (OS_ARCH.equals("aarch64") && HAS_SVE) { // ARM: requires SVE (UseSVE >= 1)
+      return true;
+    }
+
+    if (OS_ARCH.equals("amd64") && HAS_AVX2) { // x86-64: requires AVX2 or newer (UseAVX >= 2)
+      return true;
+    }
+    return false; // any other architecture, or the required instruction set was not detected
+  }
+
   /**
    * The default {@link ReadAdvice} used for opening index files. It will be {@link
    * ReadAdvice#RANDOM} by default, unless set by system property {@code
diff --git a/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java b/lucene/core/src/java/org/apache/lucene/util/VectorUtil.java
@@ -376,4 +376,25 @@ public static float recalculateOffset(
     return IMPL.recalculateScalarQuantizationOffset(
         vector, oldAlpha, oldMinQuantile, scale, alpha, minQuantile, maxQuantile);
   }
+
+  /**
+   * Filters {@code docBuffer} and {@code scoreBuffer} in place. Entries at the same index form a
+   * pair; pairs whose score is not greater than or equal to {@code minScoreInclusive} are removed
+   * and the remaining pairs are moved to the front of both arrays.
+   *
+   * @param docBuffer buffer of docs (or of any other values paired with {@code scoreBuffer})
+   * @param scoreBuffer buffer of scores to compare against {@code minScoreInclusive}
+   * @param minScoreInclusive minimum score a pair needs in order to be kept
+   * @param upTo exclusive end index: only pairs at indices below {@code upTo} are considered
+   * @return the number of pairs kept, which are stored at the front of both buffers
+   * @throws IllegalArgumentException if the buffer lengths differ or are less than {@code upTo}
+   */
+  public static int filterByScore(
+      int[] docBuffer, double[] scoreBuffer, double minScoreInclusive, int upTo) {
+    if (docBuffer.length != scoreBuffer.length || docBuffer.length < upTo) {
+      throw new IllegalArgumentException(
+          "docBuffer and scoreBuffer should keep same length and at least as long as upTo");
+    }
+    return IMPL.filterByScore(docBuffer, scoreBuffer, minScoreInclusive, upTo);
+  }
 }
diff --git a/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorConstants.java b/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorConstants.java
@@ -30,6 +30,7 @@ final class PanamaVectorConstants {
   static final boolean ENABLE_INTEGER_VECTORS;
 
   static final VectorSpecies<Integer> PRERERRED_INT_SPECIES;
+  static final VectorSpecies<Double> PREFERRED_DOUBLE_SPECIES;
 
   static {
     // default to platform supported bitsize
@@ -46,6 +47,8 @@ final class PanamaVectorConstants {
 
     PRERERRED_INT_SPECIES =
         VectorSpecies.of(int.class, VectorShape.forBitSize(PREFERRED_VECTOR_BITSIZE));
+    PREFERRED_DOUBLE_SPECIES =
+        VectorSpecies.of(double.class, VectorShape.forBitSize(PREFERRED_VECTOR_BITSIZE));
   }
 
   private PanamaVectorConstants() {}
diff --git a/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java b/lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java
@@ -27,6 +27,7 @@
 
 import java.lang.foreign.MemorySegment;
 import jdk.incubator.vector.ByteVector;
+import jdk.incubator.vector.DoubleVector;
 import jdk.incubator.vector.FloatVector;
 import jdk.incubator.vector.IntVector;
 import jdk.incubator.vector.LongVector;
@@ -54,6 +55,11 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport {
 
   // preferred vector sizes, which can be altered for testing
   private static final VectorSpecies<Float> FLOAT_SPECIES;
+  private static final VectorSpecies<Double> DOUBLE_SPECIES =
+      PanamaVectorConstants.PREFERRED_DOUBLE_SPECIES;
+  // Int species with half the bit size of DOUBLE_SPECIES, so that both have the same lane count
+  private static final VectorSpecies<Integer> INT_FOR_DOUBLE_SPECIES =
+      VectorSpecies.of(int.class, VectorShape.forBitSize(DOUBLE_SPECIES.vectorBitSize() / 2));
   private static final VectorSpecies<Integer> INT_SPECIES =
       PanamaVectorConstants.PRERERRED_INT_SPECIES;
   private static final VectorSpecies<Byte> BYTE_SPECIES;
@@ -998,4 +1004,33 @@ public float recalculateScalarQuantizationOffset(
 
     return correction;
   }
+
+  @SuppressForbidden(reason = "Uses compress and cast only where fast and carefully contained")
+  @Override
+  public int filterByScore(
+      int[] docBuffer, double[] scoreBuffer, double minScoreInclusive, int upTo) {
+    int newUpto = 0; // write position and count of kept pairs; never runs ahead of i
+    int i = 0; // read position, shared by the vector loop and the scalar loop below
+    if (Constants.HAS_FAST_COMPRESS_MASK_CAST) { // otherwise the scalar loop handles everything
+      for (int bound = DOUBLE_SPECIES.loopBound(upTo); i < bound; i += DOUBLE_SPECIES.length()) {
+        DoubleVector scoreVector = DoubleVector.fromArray(DOUBLE_SPECIES, scoreBuffer, i);
+        IntVector docVector = IntVector.fromArray(INT_FOR_DOUBLE_SPECIES, docBuffer, i);
+        VectorMask<Double> mask = scoreVector.compare(VectorOperators.GE, minScoreInclusive);
+        scoreVector.compress(mask).intoArray(scoreBuffer, newUpto); // kept lanes packed first
+        docVector.compress(mask.cast(INT_FOR_DOUBLE_SPECIES)).intoArray(docBuffer, newUpto);
+        newUpto += mask.trueCount(); // advance by the number of pairs that passed
+      }
+    }
+
+    for (; i < upTo; ++i) { // scalar loop for whatever the vector loop did not cover
+      int doc = docBuffer[i];
+      double score = scoreBuffer[i];
+      docBuffer[newUpto] = doc; // stored unconditionally; kept only if newUpto advances below
+      scoreBuffer[newUpto] = score;
+      if (score >= minScoreInclusive) {
+        newUpto++;
+      }
+    }
+    return newUpto;
+  }
 }
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestVectorUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestVectorUtil.java