Skip to content

Commit e90eb7a

Browse files
authored
Improve halfbyte transposition performance, marginally improving bbq performance (#117350)
The transposition of the bits in half-byte queries for BBQ is pretty convoluted and slow. This commit greatly simplifies it and improves performance for this small part of BBQ queries and indexing. Here are the results of a small JMH benchmark for this particular function.

```
TransposeBinBenchmark.transposeBinNew   1024  thrpt  5  857.779 ± 44.031  ops/ms
TransposeBinBenchmark.transposeBinOrig  1024  thrpt  5   94.950 ±  2.898  ops/ms
```

While this is a huge improvement for this small function, the impact at query and index time is only marginal. But the code simplification itself is enough to warrant this change, in my opinion.
1 parent 4ecc751 commit e90eb7a

File tree

3 files changed

+32
-49
lines changed

3 files changed

+32
-49
lines changed

docs/changelog/117350.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 117350
2+
summary: "Improve halfbyte transposition performance, marginally improving bbq performance"
3+
area: Vector Search
4+
type: enhancement
5+
issues: []

server/src/main/java/org/elasticsearch/index/codec/vectors/BQSpaceUtils.java

Lines changed: 25 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -23,56 +23,38 @@
2323
public class BQSpaceUtils {
2424

2525
public static final short B_QUERY = 4;
26-
// the first four bits masked
27-
private static final int B_QUERY_MASK = 15;
2826

2927
/**
3028
* Copied from Lucene, replace with Lucene's implementation sometime after Lucene 10
29+
* Transpose the query vector into a byte array allowing for efficient bitwise operations with the
30+
* index bit vectors. The idea here is to organize the query vector bits such that the first bit
31+
* of every dimension is in the first set of dimensions bits, or (dimensions/8) bytes. The second,
32+
* third, and fourth bits are in the second, third, and fourth set of dimensions bits,
33+
* respectively. This allows for direct bitwise comparisons with the stored index vectors through
34+
* summing the bitwise results with the relative required bit shifts.
35+
*
3136
* @param q the query vector, assumed to be half-byte quantized with values between 0 and 15
32-
* @param dimensions the number of dimensions in the query vector
3337
* @param quantQueryByte the byte array to store the transposed query vector
3438
*/
35-
public static void transposeBin(byte[] q, int dimensions, byte[] quantQueryByte) {
36-
// TODO: rewrite this in Panama Vector API
37-
int qOffset = 0;
38-
final byte[] v1 = new byte[4];
39-
final byte[] v = new byte[32];
40-
for (int i = 0; i < dimensions; i += 32) {
41-
// for every four bytes we shift left (with remainder across those bytes)
42-
for (int j = 0; j < v.length; j += 4) {
43-
v[j] = (byte) (q[qOffset + j] << B_QUERY | ((q[qOffset + j] >>> B_QUERY) & B_QUERY_MASK));
44-
v[j + 1] = (byte) (q[qOffset + j + 1] << B_QUERY | ((q[qOffset + j + 1] >>> B_QUERY) & B_QUERY_MASK));
45-
v[j + 2] = (byte) (q[qOffset + j + 2] << B_QUERY | ((q[qOffset + j + 2] >>> B_QUERY) & B_QUERY_MASK));
46-
v[j + 3] = (byte) (q[qOffset + j + 3] << B_QUERY | ((q[qOffset + j + 3] >>> B_QUERY) & B_QUERY_MASK));
47-
}
48-
for (int j = 0; j < B_QUERY; j++) {
49-
moveMaskEpi8Byte(v, v1);
50-
for (int k = 0; k < 4; k++) {
51-
quantQueryByte[(B_QUERY - j - 1) * (dimensions / 8) + i / 8 + k] = v1[k];
52-
v1[k] = 0;
53-
}
54-
for (int k = 0; k < v.length; k += 4) {
55-
v[k] = (byte) (v[k] + v[k]);
56-
v[k + 1] = (byte) (v[k + 1] + v[k + 1]);
57-
v[k + 2] = (byte) (v[k + 2] + v[k + 2]);
58-
v[k + 3] = (byte) (v[k + 3] + v[k + 3]);
59-
}
60-
}
61-
qOffset += 32;
62-
}
63-
}
64-
65-
private static void moveMaskEpi8Byte(byte[] v, byte[] v1b) {
66-
int m = 0;
67-
for (int k = 0; k < v.length; k++) {
68-
if ((v[k] & 0b10000000) == 0b10000000) {
69-
v1b[m] |= 0b00000001;
70-
}
71-
if (k % 8 == 7) {
72-
m++;
73-
} else {
74-
v1b[m] <<= 1;
39+
public static void transposeHalfByte(byte[] q, byte[] quantQueryByte) {
40+
for (int i = 0; i < q.length;) {
41+
assert q[i] >= 0 && q[i] <= 15;
42+
int lowerByte = 0;
43+
int lowerMiddleByte = 0;
44+
int upperMiddleByte = 0;
45+
int upperByte = 0;
46+
for (int j = 7; j >= 0 && i < q.length; j--) {
47+
lowerByte |= (q[i] & 1) << j;
48+
lowerMiddleByte |= ((q[i] >> 1) & 1) << j;
49+
upperMiddleByte |= ((q[i] >> 2) & 1) << j;
50+
upperByte |= ((q[i] >> 3) & 1) << j;
51+
i++;
7552
}
53+
int index = ((i + 7) / 8) - 1;
54+
quantQueryByte[index] = (byte) lowerByte;
55+
quantQueryByte[index + quantQueryByte.length / 4] = (byte) lowerMiddleByte;
56+
quantQueryByte[index + quantQueryByte.length / 2] = (byte) upperMiddleByte;
57+
quantQueryByte[index + 3 * quantQueryByte.length / 4] = (byte) upperByte;
7658
}
7759
}
7860
}

server/src/main/java/org/elasticsearch/index/codec/vectors/BinaryQuantizer.java

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -223,9 +223,7 @@ public QueryAndIndexResults quantizeQueryAndIndex(float[] vector, byte[] indexDe
223223

224224
// q¯ = Δ · q¯𝑢 + 𝑣𝑙 · 1𝐷
225225
// q¯ is an approximation of q′ (scalar quantized approximation)
226-
// FIXME: vectors need to be padded but that's expensive; update transponseBin to deal
227-
byteQuery = BQVectorUtils.pad(byteQuery, discretizedDimensions);
228-
BQSpaceUtils.transposeBin(byteQuery, discretizedDimensions, queryDestination);
226+
BQSpaceUtils.transposeHalfByte(byteQuery, queryDestination);
229227
QueryFactors factors = new QueryFactors(quantResult.quantizedSum, distToC, lower, width, normVmC, vDotC);
230228
final float[] indexCorrections;
231229
if (similarityFunction == EUCLIDEAN) {
@@ -366,9 +364,7 @@ public QueryFactors quantizeForQuery(float[] vector, byte[] destination, float[]
366364

367365
// q¯ = Δ · q¯𝑢 + 𝑣𝑙 · 1𝐷
368366
// q¯ is an approximation of q′ (scalar quantized approximation)
369-
// FIXME: vectors need to be padded but that's expensive; update transponseBin to deal
370-
byteQuery = BQVectorUtils.pad(byteQuery, discretizedDimensions);
371-
BQSpaceUtils.transposeBin(byteQuery, discretizedDimensions, destination);
367+
BQSpaceUtils.transposeHalfByte(byteQuery, destination);
372368

373369
QueryFactors factors;
374370
if (similarityFunction != EUCLIDEAN) {

0 commit comments

Comments
 (0)