Skip to content

Commit 251e7df

Browse files
HUSTERGSgesong.samueljpountz
authored
Vectorize filterCompetitiveHits (#14896)
Co-authored-by: gesong.samuel <[email protected]> Co-authored-by: Adrien Grand <[email protected]>
1 parent d8b52ad commit 251e7df

File tree

11 files changed

+327
-12
lines changed

11 files changed

+327
-12
lines changed

gradle/validation/forbidden-apis/non-standard/incubator-vector.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ jdk.incubator.vector.FloatVector#fma(**)
77
jdk.incubator.vector.DoubleVector#fma(**)
88
jdk.incubator.vector.VectorOperators#FMA
99

10-
@defaultMessage Potentially slow on some CPUs, please check the CPU has feature: Unsupported on NEON
10+
@defaultMessage Potentially slow on some CPUs, please check Constants.HAS_FAST_COMPRESS_MASK_CAST: needs SVE or AVX2 support
1111
jdk.incubator.vector.ByteVector#compress(**)
1212
jdk.incubator.vector.IntVector#compress(**)
1313
jdk.incubator.vector.ShortVector#compress(**)
@@ -18,3 +18,4 @@ jdk.incubator.vector.IntVector#expand(**)
1818
jdk.incubator.vector.ShortVector#expand(**)
1919
jdk.incubator.vector.LongVector#expand(**)
2020
jdk.incubator.vector.VectorOperators#EXPAND_BITS
21+
jdk.incubator.vector.VectorMask#cast(**)

lucene/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,8 @@ Optimizations
212212

213213
* GITHUB#14936: Don't do the filtered knn path when the provided filter is a MatchAllDocsQuery. (Ben Trent)
214214

215+
* GITHUB#14896: Vectorize filterCompetitiveHits (Ge Song, Adrien Grand)
216+
215217
Changes in Runtime Behavior
216218
---------------------
217219
* GITHUB#14823: Decrease TieredMergePolicy's default number of segments per
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.benchmark.jmh;
18+
19+
import java.util.Arrays;
20+
import java.util.SplittableRandom;
21+
import java.util.concurrent.TimeUnit;
22+
import java.util.function.IntSupplier;
23+
import org.apache.lucene.util.ArrayUtil;
24+
import org.apache.lucene.util.VectorUtil;
25+
import org.openjdk.jmh.annotations.Benchmark;
26+
import org.openjdk.jmh.annotations.BenchmarkMode;
27+
import org.openjdk.jmh.annotations.Fork;
28+
import org.openjdk.jmh.annotations.Level;
29+
import org.openjdk.jmh.annotations.Measurement;
30+
import org.openjdk.jmh.annotations.Mode;
31+
import org.openjdk.jmh.annotations.OutputTimeUnit;
32+
import org.openjdk.jmh.annotations.Param;
33+
import org.openjdk.jmh.annotations.Scope;
34+
import org.openjdk.jmh.annotations.Setup;
35+
import org.openjdk.jmh.annotations.State;
36+
import org.openjdk.jmh.annotations.Warmup;
37+
38+
@BenchmarkMode(Mode.Throughput)
39+
@OutputTimeUnit(TimeUnit.MILLISECONDS)
40+
@State(Scope.Benchmark)
41+
@Warmup(iterations = 3, time = 1)
42+
@Measurement(iterations = 5, time = 1)
43+
@Fork(
44+
value = 1,
45+
jvmArgsAppend = {
46+
"-Xmx1g",
47+
"-Xms1g",
48+
"-XX:+AlwaysPreTouch",
49+
"--add-modules",
50+
"jdk.incubator.vector"
51+
})
52+
public class CompetitiveBenchmark {
53+
54+
private final SplittableRandom R = new SplittableRandom(0);
55+
56+
@Param("128")
57+
int size;
58+
59+
double[] scores;
60+
int[] docs;
61+
62+
// scores generated by nextDouble() locate in range [0, 1), so we can tune this parameter and
63+
// see how the performance changes depends on how selective the filter is.
64+
@Param({"0", "0.2", "0.4", "0.5", "0.8"})
65+
double minScoreInclusive;
66+
67+
@Setup(Level.Trial)
68+
public void setUpTrial() {
69+
scores = new double[size];
70+
docs = new int[size];
71+
}
72+
73+
@Setup(Level.Invocation)
74+
public void setUpInvocation() {
75+
for (int i = 0; i < size; i++) {
76+
docs[i] = R.nextInt(Integer.MAX_VALUE);
77+
scores[i] = R.nextDouble();
78+
}
79+
}
80+
81+
@Benchmark
82+
public int baseline() {
83+
int newSize = 0;
84+
for (int i = 0; i < size; ++i) {
85+
if (scores[i] >= minScoreInclusive) {
86+
docs[newSize] = docs[i];
87+
scores[newSize] = scores[i];
88+
newSize++;
89+
}
90+
}
91+
return newSize;
92+
}
93+
94+
@Benchmark
95+
public int branchlessCandidate() {
96+
int newSize = 0;
97+
for (int i = 0; i < size; ++i) {
98+
int inc = scores[i] >= minScoreInclusive ? 1 : 0;
99+
docs[newSize] = docs[i];
100+
scores[newSize] = scores[i];
101+
newSize += inc;
102+
}
103+
return newSize;
104+
}
105+
106+
// This is an effort try to make the modification of newSize using cmov
107+
// see https://github.com/apache/lucene/pull/14906
108+
@Benchmark
109+
public int branchlessCandidateCmov() {
110+
int newSize = 0;
111+
for (int i = 0; i < size; ++i) {
112+
int doc = docs[i];
113+
double score = scores[i];
114+
docs[newSize] = doc;
115+
scores[newSize] = score;
116+
if (score >= minScoreInclusive) {
117+
newSize++;
118+
}
119+
}
120+
return newSize;
121+
}
122+
123+
@Benchmark
124+
public int vectorizedCandidate() {
125+
return VectorUtil.filterByScore(docs, scores, minScoreInclusive, size);
126+
}
127+
128+
public static void main(String[] args) {
129+
CompetitiveBenchmark baseline = new CompetitiveBenchmark();
130+
baseline.size = 128;
131+
baseline.setUpTrial();
132+
baseline.setUpInvocation();
133+
int baselineSize = baseline.baseline();
134+
135+
CompetitiveBenchmark candidate = new CompetitiveBenchmark();
136+
candidate.size = 128;
137+
candidate.setUpTrial();
138+
candidate.setUpInvocation();
139+
140+
for (IntSupplier s :
141+
new IntSupplier[] {
142+
candidate::branchlessCandidate,
143+
candidate::vectorizedCandidate,
144+
candidate::branchlessCandidateCmov
145+
}) {
146+
147+
int candidateSize = s.getAsInt();
148+
149+
if (baselineSize != candidateSize) {
150+
throw new IllegalArgumentException("incorrect size");
151+
}
152+
153+
if (Arrays.equals(baseline.docs, 0, baselineSize, candidate.docs, 0, candidateSize)
154+
== false) {
155+
throw new IllegalArgumentException(
156+
"incorrect docs,"
157+
+ "\nbaseline: "
158+
+ Arrays.toString(ArrayUtil.copyOfSubArray(baseline.docs, 0, baselineSize))
159+
+ "\ncandidate: "
160+
+ Arrays.toString(ArrayUtil.copyOfSubArray(candidate.docs, 0, candidateSize)));
161+
}
162+
163+
if (Arrays.equals(baseline.scores, 0, baselineSize, candidate.scores, 0, candidateSize)
164+
== false) {
165+
throw new IllegalArgumentException("incorrect scores");
166+
}
167+
}
168+
}
169+
}

lucene/core/src/java/org/apache/lucene/internal/vectorization/DefaultVectorUtilSupport.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,4 +309,20 @@ private float quantizeFloat(float v, byte[] dest, int destIndex) {
309309
return minQuantile * (v - minQuantile / 2.0F) + (dx - dxq) * dxq;
310310
}
311311
}
312+
313+
@Override
314+
public int filterByScore(
315+
int[] docBuffer, double[] scoreBuffer, double minScoreInclusive, int upTo) {
316+
int newSize = 0;
317+
for (int i = 0; i < upTo; ++i) {
318+
int doc = docBuffer[i];
319+
double score = scoreBuffer[i];
320+
docBuffer[newSize] = doc;
321+
scoreBuffer[newSize] = score;
322+
if (score >= minScoreInclusive) {
323+
newSize++;
324+
}
325+
}
326+
return newSize;
327+
}
312328
}

lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorUtilSupport.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,4 +100,18 @@ float recalculateScalarQuantizationOffset(
100100
float alpha,
101101
float minQuantile,
102102
float maxQuantile);
103+
104+
/**
105+
* Filters {@code docBuffer} and {@code scoreBuffer} in place by {@code minScoreInclusive}. The
105+
* entries of {@code docBuffer} and {@code scoreBuffer} at the same index form a pair; pairs whose
106+
* score is not greater than or equal to {@code minScoreInclusive} are filtered out of the arrays.
107+
*
108+
* @param docBuffer buffer of docs (or of any other values that pair up with {@code
109+
* scoreBuffer})
110+
* @param scoreBuffer buffer of scores to compare against {@code minScoreInclusive}
111+
* @param minScoreInclusive minimum score a pair needs in order to be kept
112+
* @param upTo exclusive end index of the range to filter
113+
* @return the number of pairs left after filtering
115+
*/
116+
int filterByScore(int[] docBuffer, double[] scoreBuffer, double minScoreInclusive, int upTo);
103117
}

lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import org.apache.lucene.util.FixedBitSet;
2727
import org.apache.lucene.util.MathUtil;
2828
import org.apache.lucene.util.PriorityQueue;
29+
import org.apache.lucene.util.VectorUtil;
2930

3031
/** Util class for Scorer related methods */
3132
class ScorerUtil {
@@ -155,17 +156,8 @@ static void filterCompetitiveHits(
155156
return;
156157
}
157158

158-
int newSize = 0;
159-
for (int i = 0; i < buffer.size; ++i) {
160-
int doc = buffer.docs[i];
161-
double score = buffer.scores[i];
162-
buffer.docs[newSize] = doc;
163-
buffer.scores[newSize] = score;
164-
if (score >= minRequiredScore) {
165-
newSize += 1;
166-
}
167-
}
168-
buffer.size = newSize;
159+
buffer.size =
160+
VectorUtil.filterByScore(buffer.docs, buffer.scores, minRequiredScore, buffer.size);
169161
}
170162

171163
/**

lucene/core/src/java/org/apache/lucene/util/Constants.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,12 +94,23 @@ private static boolean is64Bit() {
9494
private static final boolean HAS_SSE4A =
9595
HotspotVMOptions.get("UseXmmI2F").map(Boolean::valueOf).orElse(false);
9696

97+
/** true for cpu with AVX support at least AVX2. */
98+
private static final boolean HAS_AVX2 =
99+
HotspotVMOptions.get("UseAVX").map(Integer::valueOf).orElse(0) >= 2;
100+
101+
/** true for arm cpu with SVE support. */
102+
private static final boolean HAS_SVE =
103+
HotspotVMOptions.get("UseSVE").map(Integer::valueOf).orElse(0) >= 1;
104+
97105
/** true iff we know VFMA has faster throughput than separate vmul/vadd. */
98106
public static final boolean HAS_FAST_VECTOR_FMA = hasFastVectorFMA();
99107

100108
/** true iff we know FMA has faster throughput than separate mul/add. */
101109
public static final boolean HAS_FAST_SCALAR_FMA = hasFastScalarFMA();
102110

111+
/** true iff we know vector compress and mask cast operations have fast throughput. */
112+
public static final boolean HAS_FAST_COMPRESS_MASK_CAST = hasFastCompressMaskCast();
113+
103114
private static boolean hasFastVectorFMA() {
104115
if (HAS_FMA) {
105116
String value = getSysProp("lucene.useVectorFMA", "auto");
@@ -152,6 +163,17 @@ private static boolean hasFastScalarFMA() {
152163
return false;
153164
}
154165

166+
private static boolean hasFastCompressMaskCast() {
167+
if (OS_ARCH.equals("aarch64") && HAS_SVE) {
168+
return true;
169+
}
170+
171+
if (OS_ARCH.equals("amd64") && HAS_AVX2) {
172+
return true;
173+
}
174+
return false;
175+
}
176+
155177
/**
156178
* The default {@link ReadAdvice} used for opening index files. It will be {@link
157179
* ReadAdvice#RANDOM} by default, unless set by system property {@code

lucene/core/src/java/org/apache/lucene/util/VectorUtil.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,4 +376,25 @@ public static float recalculateOffset(
376376
return IMPL.recalculateScalarQuantizationOffset(
377377
vector, oldAlpha, oldMinQuantile, scale, alpha, minQuantile, maxQuantile);
378378
}
379+
380+
/**
381+
* filter both {@code docBuffer} and {@code scoreBuffer} with {@code minScoreInclusive}, each
382+
* {@code docBuffer} and {@code scoreBuffer} of the same index forms a pair, pairs with score not
383+
* greater than or equal to {@code minScoreInclusive} will be filtered out from the array.
384+
*
385+
* @param docBuffer doc buffer contains docs (or some other value forms a pair with {@code
386+
* scoreBuffer})
387+
* @param scoreBuffer score buffer contains scores to be compared with {@code minScoreInclusive}
388+
* @param minScoreInclusive minimal required score to not be filtered out
389+
* @param upTo where the filter should end
390+
* @return how many pairs left after filter
391+
*/
392+
public static int filterByScore(
393+
int[] docBuffer, double[] scoreBuffer, double minScoreInclusive, int upTo) {
394+
if (docBuffer.length != scoreBuffer.length || docBuffer.length < upTo) {
395+
throw new IllegalArgumentException(
396+
"docBuffer and scoreBuffer should keep same length and at least as long as upTo");
397+
}
398+
return IMPL.filterByScore(docBuffer, scoreBuffer, minScoreInclusive, upTo);
399+
}
379400
}

lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorConstants.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ final class PanamaVectorConstants {
3030
static final boolean ENABLE_INTEGER_VECTORS;
3131

3232
static final VectorSpecies<Integer> PRERERRED_INT_SPECIES;
33+
static final VectorSpecies<Double> PREFERRED_DOUBLE_SPECIES;
3334

3435
static {
3536
// default to platform supported bitsize
@@ -46,6 +47,8 @@ final class PanamaVectorConstants {
4647

4748
PRERERRED_INT_SPECIES =
4849
VectorSpecies.of(int.class, VectorShape.forBitSize(PREFERRED_VECTOR_BITSIZE));
50+
PREFERRED_DOUBLE_SPECIES =
51+
VectorSpecies.of(double.class, VectorShape.forBitSize(PREFERRED_VECTOR_BITSIZE));
4952
}
5053

5154
private PanamaVectorConstants() {}

lucene/core/src/java24/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
import java.lang.foreign.MemorySegment;
2929
import jdk.incubator.vector.ByteVector;
30+
import jdk.incubator.vector.DoubleVector;
3031
import jdk.incubator.vector.FloatVector;
3132
import jdk.incubator.vector.IntVector;
3233
import jdk.incubator.vector.LongVector;
@@ -54,6 +55,11 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport {
5455

5556
// preferred vector sizes, which can be altered for testing
5657
private static final VectorSpecies<Float> FLOAT_SPECIES;
58+
private static final VectorSpecies<Double> DOUBLE_SPECIES =
59+
PanamaVectorConstants.PREFERRED_DOUBLE_SPECIES;
60+
// An int species with exactly half the bit size of DOUBLE_SPECIES, so that both species have the same number of lanes
61+
private static final VectorSpecies<Integer> INT_FOR_DOUBLE_SPECIES =
62+
VectorSpecies.of(int.class, VectorShape.forBitSize(DOUBLE_SPECIES.vectorBitSize() / 2));
5763
private static final VectorSpecies<Integer> INT_SPECIES =
5864
PanamaVectorConstants.PRERERRED_INT_SPECIES;
5965
private static final VectorSpecies<Byte> BYTE_SPECIES;
@@ -998,4 +1004,33 @@ public float recalculateScalarQuantizationOffset(
9981004

9991005
return correction;
10001006
}
1007+
1008+
@SuppressForbidden(reason = "Uses compress and cast only where fast and carefully contained")
1009+
@Override
1010+
public int filterByScore(
1011+
int[] docBuffer, double[] scoreBuffer, double minScoreInclusive, int upTo) {
1012+
int newUpto = 0;
1013+
int i = 0;
1014+
if (Constants.HAS_FAST_COMPRESS_MASK_CAST) {
1015+
for (int bound = DOUBLE_SPECIES.loopBound(upTo); i < bound; i += DOUBLE_SPECIES.length()) {
1016+
DoubleVector scoreVector = DoubleVector.fromArray(DOUBLE_SPECIES, scoreBuffer, i);
1017+
IntVector docVector = IntVector.fromArray(INT_FOR_DOUBLE_SPECIES, docBuffer, i);
1018+
VectorMask<Double> mask = scoreVector.compare(VectorOperators.GE, minScoreInclusive);
1019+
scoreVector.compress(mask).intoArray(scoreBuffer, newUpto);
1020+
docVector.compress(mask.cast(INT_FOR_DOUBLE_SPECIES)).intoArray(docBuffer, newUpto);
1021+
newUpto += mask.trueCount();
1022+
}
1023+
}
1024+
1025+
for (; i < upTo; ++i) {
1026+
int doc = docBuffer[i];
1027+
double score = scoreBuffer[i];
1028+
docBuffer[newUpto] = doc;
1029+
scoreBuffer[newUpto] = score;
1030+
if (score >= minScoreInclusive) {
1031+
newUpto++;
1032+
}
1033+
}
1034+
return newUpto;
1035+
}
10011036
}

0 commit comments

Comments
 (0)