Skip to content

Commit 67332f8

Browse files
authored
Improve halfbyte transposition performance, marginally improving bbq performance (#117350) (#118293)
The transposition of the bits in half-byte queries for BBQ is pretty convoluted and slow. This commit greatly simplifies and improves the performance of this small part of BBQ queries and indexing.

Here are the results of a small JMH benchmark for this particular function:

```
TransposeBinBenchmark.transposeBinNew   1024  thrpt  5  857.779 ± 44.031  ops/ms
TransposeBinBenchmark.transposeBinOrig  1024  thrpt  5   94.950 ±  2.898  ops/ms
```

While this is a huge improvement for this small function, the impact at query and index time is only marginal. But the code simplification itself is enough to warrant this change in my opinion.

(cherry picked from commit e90eb7a)
1 parent afd8d84 commit 67332f8

File tree

3 files changed

+32
-49
lines changed

3 files changed

+32
-49
lines changed

docs/changelog/117350.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 117350
2+
summary: "Improve halfbyte transposition performance, marginally improving bbq performance"
3+
area: Vector Search
4+
type: enhancement
5+
issues: []

server/src/main/java/org/elasticsearch/index/codec/vectors/BQSpaceUtils.java

Lines changed: 25 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -23,56 +23,38 @@
2323
public class BQSpaceUtils {
2424

2525
public static final short B_QUERY = 4;
26-
// the first four bits masked
27-
private static final int B_QUERY_MASK = 15;
2826

2927
/**
3028
* Copied from Lucene, replace with Lucene's implementation sometime after Lucene 10
29+
* Transpose the query vector into a byte array allowing for efficient bitwise operations with the
30+
* index bit vectors. The idea here is to organize the query vector bits such that the first bit
31+
* of every dimension is in the first set of dimensions bits, or (dimensions/8) bytes. The second,
32+
* third, and fourth bits are in the second, third, and fourth sets of dimensions bits,
33+
* respectively. This allows for direct bitwise comparisons with the stored index vectors through
34+
* summing the bitwise results with the relative required bit shifts.
35+
*
3136
* @param q the query vector, assumed to be half-byte quantized with values between 0 and 15
32-
* @param dimensions the number of dimensions in the query vector
3337
* @param quantQueryByte the byte array to store the transposed query vector
3438
*/
35-
public static void transposeBin(byte[] q, int dimensions, byte[] quantQueryByte) {
36-
// TODO: rewrite this in Panama Vector API
37-
int qOffset = 0;
38-
final byte[] v1 = new byte[4];
39-
final byte[] v = new byte[32];
40-
for (int i = 0; i < dimensions; i += 32) {
41-
// for every four bytes we shift left (with remainder across those bytes)
42-
for (int j = 0; j < v.length; j += 4) {
43-
v[j] = (byte) (q[qOffset + j] << B_QUERY | ((q[qOffset + j] >>> B_QUERY) & B_QUERY_MASK));
44-
v[j + 1] = (byte) (q[qOffset + j + 1] << B_QUERY | ((q[qOffset + j + 1] >>> B_QUERY) & B_QUERY_MASK));
45-
v[j + 2] = (byte) (q[qOffset + j + 2] << B_QUERY | ((q[qOffset + j + 2] >>> B_QUERY) & B_QUERY_MASK));
46-
v[j + 3] = (byte) (q[qOffset + j + 3] << B_QUERY | ((q[qOffset + j + 3] >>> B_QUERY) & B_QUERY_MASK));
47-
}
48-
for (int j = 0; j < B_QUERY; j++) {
49-
moveMaskEpi8Byte(v, v1);
50-
for (int k = 0; k < 4; k++) {
51-
quantQueryByte[(B_QUERY - j - 1) * (dimensions / 8) + i / 8 + k] = v1[k];
52-
v1[k] = 0;
53-
}
54-
for (int k = 0; k < v.length; k += 4) {
55-
v[k] = (byte) (v[k] + v[k]);
56-
v[k + 1] = (byte) (v[k + 1] + v[k + 1]);
57-
v[k + 2] = (byte) (v[k + 2] + v[k + 2]);
58-
v[k + 3] = (byte) (v[k + 3] + v[k + 3]);
59-
}
60-
}
61-
qOffset += 32;
62-
}
63-
}
64-
65-
private static void moveMaskEpi8Byte(byte[] v, byte[] v1b) {
66-
int m = 0;
67-
for (int k = 0; k < v.length; k++) {
68-
if ((v[k] & 0b10000000) == 0b10000000) {
69-
v1b[m] |= 0b00000001;
70-
}
71-
if (k % 8 == 7) {
72-
m++;
73-
} else {
74-
v1b[m] <<= 1;
39+
public static void transposeHalfByte(byte[] q, byte[] quantQueryByte) {
40+
for (int i = 0; i < q.length;) {
41+
assert q[i] >= 0 && q[i] <= 15;
42+
int lowerByte = 0;
43+
int lowerMiddleByte = 0;
44+
int upperMiddleByte = 0;
45+
int upperByte = 0;
46+
for (int j = 7; j >= 0 && i < q.length; j--) {
47+
lowerByte |= (q[i] & 1) << j;
48+
lowerMiddleByte |= ((q[i] >> 1) & 1) << j;
49+
upperMiddleByte |= ((q[i] >> 2) & 1) << j;
50+
upperByte |= ((q[i] >> 3) & 1) << j;
51+
i++;
7552
}
53+
int index = ((i + 7) / 8) - 1;
54+
quantQueryByte[index] = (byte) lowerByte;
55+
quantQueryByte[index + quantQueryByte.length / 4] = (byte) lowerMiddleByte;
56+
quantQueryByte[index + quantQueryByte.length / 2] = (byte) upperMiddleByte;
57+
quantQueryByte[index + 3 * quantQueryByte.length / 4] = (byte) upperByte;
7658
}
7759
}
7860
}

server/src/main/java/org/elasticsearch/index/codec/vectors/es816/BinaryQuantizer.java

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -225,9 +225,7 @@ public QueryAndIndexResults quantizeQueryAndIndex(float[] vector, byte[] indexDe
225225

226226
// \bar{q} = \Delta \cdot \bar{q}_u + v_l \cdot 1_D
227227
// \bar{q} is an approximation of q' (scalar quantized approximation)
228-
// FIXME: vectors need to be padded but that's expensive; update transponseBin to deal
229-
byteQuery = BQVectorUtils.pad(byteQuery, discretizedDimensions);
230-
BQSpaceUtils.transposeBin(byteQuery, discretizedDimensions, queryDestination);
228+
BQSpaceUtils.transposeHalfByte(byteQuery, queryDestination);
231229
QueryFactors factors = new QueryFactors(quantResult.quantizedSum, distToC, lower, width, normVmC, vDotC);
232230
final float[] indexCorrections;
233231
if (similarityFunction == EUCLIDEAN) {
@@ -368,9 +366,7 @@ public QueryFactors quantizeForQuery(float[] vector, byte[] destination, float[]
368366

369367
// \bar{q} = \Delta \cdot \bar{q}_u + v_l \cdot 1_D
369367
// \bar{q} is an approximation of q' (scalar quantized approximation)
371-
// FIXME: vectors need to be padded but that's expensive; update transponseBin to deal
372-
byteQuery = BQVectorUtils.pad(byteQuery, discretizedDimensions);
373-
BQSpaceUtils.transposeBin(byteQuery, discretizedDimensions, destination);
369+
BQSpaceUtils.transposeHalfByte(byteQuery, destination);
374370

375371
QueryFactors factors;
376372
if (similarityFunction != EUCLIDEAN) {

0 commit comments

Comments
 (0)