Commit 7a43b63

Merge branch 'main' into esql-inference-ccs-v2

2 parents a0b2abd + 3064da0

107 files changed: +1476 -828 lines


docs/changelog/139069.yaml

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+pr: 139069
+summary: "[SIMD][x64] Optimized native bulk dot product scoring for Int7"
+area: Vector Search
+type: enhancement
+issues: []
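
Note: judging by the dot7u kernel names in the hunks below, "Int7" refers to 7-bit unsigned quantized vector components (values 0..127) stored in int8 buffers, which is why the scalar tails can accumulate a0[i] * b[i] products directly into int32 accumulators without overflow concerns at realistic dimension counts.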

libs/native/libraries/build.gradle

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ configurations {
 }
 
 var zstdVersion = "1.5.5"
-var vecVersion = "1.0.18"
+var vecVersion = "1.0.19"
 
 repositories {
     exclusiveContent {

libs/simdvec/native/publish_vec_binaries.sh

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ if [ -z "$ARTIFACTORY_API_KEY" ]; then
   exit 1;
 fi
 
-VERSION="1.0.18"
+VERSION="1.0.19"
 ARTIFACTORY_REPOSITORY="${ARTIFACTORY_REPOSITORY:-https://artifactory.elastic.dev/artifactory/elasticsearch-native/}"
 TEMP=$(mktemp -d)

libs/simdvec/native/src/vec/c/aarch64/vec_1.cpp

Lines changed: 15 additions & 19 deletions
@@ -16,6 +16,8 @@
 #include <arm_neon.h>
 #include <math.h>
 #include "vec.h"
+#include "vec_common.h"
+#include "aarch64/aarch64_vec_common.h"
 
 #ifndef DOT7U_STRIDE_BYTES_LEN
 #define DOT7U_STRIDE_BYTES_LEN 32 // Must be a power of 2
@@ -110,12 +112,16 @@ static inline void dot7u_inner_bulk(
     const int32_t count,
     f32_t* results
 ) {
-    size_t blk = dims & ~15;
-    size_t c = 0;
-
-    // f32_t first_offset = int_bits_to_float(*((const int32_t*)(b + dims)));
-
-    // Process 4 vectors at a time
+    const int blk = dims & ~15;
+    int c = 0;
+
+    // Process 4 vectors at a time; this helps the CPU scheduler/prefetcher.
+    // Loading multiple memory locations while computing gives the prefetcher
+    // information on where the data to load will be next, and keeps the CPU
+    // execution units busy.
+    // Our benchmarks show that this "hint" is more effective than using
+    // explicit prefetch instructions (e.g. __builtin_prefetch) on many ARM
+    // processors (e.g. Graviton)
     for (; c + 3 < count; c += 4) {
         const int8_t* a0 = a + mapper(c, offsets) * pitch;
         const int8_t* a1 = a + mapper(c + 1, offsets) * pitch;
@@ -177,33 +183,23 @@ static inline void dot7u_inner_bulk(
                 acc_scalar3 += a3[t] * bb;
             }
         }
-        // f32_t second_offset_0 = int_bits_to_float(*((const int32_t*)(a0 + dims)));
         results[c + 0] = (f32_t)acc_scalar0;
         results[c + 1] = (f32_t)acc_scalar1;
         results[c + 2] = (f32_t)acc_scalar2;
         results[c + 3] = (f32_t)acc_scalar3;
     }
 
-    // Tail-handling: remaining 0..3 vectors
+    // Tail-handling: remaining vectors
     for (; c < count; c++) {
         const int8_t* a0 = a + mapper(c, offsets) * pitch;
         results[c] = (f32_t)vec_dot7u(a0, b, dims);
     }
 }
 
-static inline int64_t identity(const int32_t i, const int32_t* offsets) {
-    return i;
-}
-
-static inline int64_t index(const int32_t i, const int32_t* offsets) {
-    return offsets[i];
-}
-
 EXPORT void vec_dot7u_bulk(const int8_t* a, const int8_t* b, const int32_t dims, const int32_t count, f32_t* results) {
-    dot7u_inner_bulk<identity>(a, b, dims, dims, NULL, count, results);
+    dot7u_inner_bulk<identity_mapper>(a, b, dims, dims, NULL, count, results);
 }
 
-
 EXPORT void vec_dot7u_bulk_offsets(
     const int8_t* a,
     const int8_t* b,
@@ -212,7 +208,7 @@ EXPORT void vec_dot7u_bulk_offsets(
     const int32_t* offsets,
     const int32_t count,
     f32_t* results) {
-    dot7u_inner_bulk<index>(a, b, dims, pitch, offsets, count, results);
+    dot7u_inner_bulk<array_mapper>(a, b, dims, pitch, offsets, count, results);
 }
 
 static inline int32_t sqr7u_inner(int8_t *a, int8_t *b, const int32_t dims) {
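
Note: the identity/index helpers deleted above now come from the shared vec_common.h as identity_mapper and array_mapper; that header is hidden in this commit view. A minimal sketch of what the shared mappers presumably look like, reconstructed from the deleted locals (only the names are confirmed by the call sites):

#include <stdint.h>

// identity_mapper: vectors are packed contiguously, so the c-th vector starts
// at a + c * pitch (vec_dot7u_bulk passes pitch == dims).
static inline int64_t identity_mapper(const int32_t i, const int32_t* offsets) {
    return i;
}

// array_mapper: the c-th score is computed against the row named by offsets[c],
// letting callers score an arbitrary subset of the stored vectors.
static inline int64_t array_mapper(const int32_t i, const int32_t* offsets) {
    return offsets[i];
}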

libs/simdvec/native/src/vec/c/amd64/vec_1.cpp

Lines changed: 44 additions & 28 deletions
@@ -15,6 +15,8 @@
 #include <stdint.h>
 #include <math.h>
 #include "vec.h"
+#include "vec_common.h"
+#include "amd64/amd64_vec_common.h"
 
 #include <emmintrin.h>
 #include <immintrin.h>
@@ -167,42 +169,56 @@ static inline void dot7u_inner_bulk(
     const int32_t count,
     f32_t* results
 ) {
-    if (dims > STRIDE_BYTES_LEN) {
-        const int limit = dims & ~(STRIDE_BYTES_LEN - 1);
-        for (int32_t c = 0; c < count; c++) {
-            const int8_t* a0 = a + (mapper(c, offsets) * pitch);
-            int i = limit;
-            int32_t res = dot7u_inner(a0, b, i);
-            for (; i < dims; i++) {
-                res += a0[i] * b[i];
-            }
-            results[c] = (f32_t)res;
-        }
-    } else {
-        for (int32_t c = 0; c < count; c++) {
-            const int8_t* a0 = a + (mapper(c, offsets) * pitch);
-            int32_t res = 0;
-            for (int32_t i = 0; i < dims; i++) {
-                res += a0[i] * b[i];
-            }
-            results[c] = (f32_t)res;
-        }
-    }
+    const int blk = dims & ~(STRIDE_BYTES_LEN - 1);
+    const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1;
+    int c = 0;
+
+    const int8_t* a0 = safe_mapper_offset<0, mapper>(a, pitch, offsets, count);
+    const int8_t* a1 = safe_mapper_offset<1, mapper>(a, pitch, offsets, count);
+
+    // Process a batch of 2 vectors at a time, after instructing the CPU to
+    // prefetch the next batch.
+    // Prefetching multiple memory locations while computing keeps the CPU
+    // execution units busy. For this "older" generation of x64 processors
+    // (supporting AVX2, but not AVX-512), benchmarks show that a batch of 2
+    // is ideal -- more, and it starts to hurt performance due to bandwidth
+    for (; c + 3 < count; c += 2) {
+        const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch;
+        const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch;
+
+        prefetch(next_a0, lines_to_fetch);
+        prefetch(next_a1, lines_to_fetch);
+
+        int32_t res0 = 0;
+        int32_t res1 = 0;
+        int i = 0;
+        if (dims > STRIDE_BYTES_LEN) {
+            i = blk;
+            res0 = dot7u_inner(a0, b, i);
+            res1 = dot7u_inner(a1, b, i);
+        }
+        for (; i < dims; i++) {
+            const int8_t bb = b[i];
+            res0 += a0[i] * bb;
+            res1 += a1[i] * bb;
+        }
+        results[c + 0] = (f32_t)res0;
+        results[c + 1] = (f32_t)res1;
+        a0 = next_a0;
+        a1 = next_a1;
+    }
+
+    // Tail-handling: remaining vectors
+    for (; c < count; c++) {
+        const int8_t* a0 = a + mapper(c, offsets) * pitch;
+        results[c] = (f32_t)vec_dot7u(a0, b, dims);
+    }
 }
 
-static inline int64_t identity(const int32_t i, const int32_t* offsets) {
-    return i;
-}
-
-static inline int64_t index(const int32_t i, const int32_t* offsets) {
-    return offsets[i];
-}
-
 EXPORT void vec_dot7u_bulk(const int8_t* a, const int8_t* b, const int32_t dims, const int32_t count, f32_t* results) {
-    dot7u_inner_bulk<identity>(a, b, dims, dims, NULL, count, results);
+    dot7u_inner_bulk<identity_mapper>(a, b, dims, dims, NULL, count, results);
 }
 
-
 EXPORT void vec_dot7u_bulk_offsets(
     const int8_t* a,
     const int8_t* b,
@@ -211,7 +227,7 @@ EXPORT void vec_dot7u_bulk_offsets(
    const int32_t* offsets,
    const int32_t count,
    f32_t* results) {
-    dot7u_inner_bulk<index>(a, b, dims, pitch, offsets, count, results);
+    dot7u_inner_bulk<array_mapper>(a, b, dims, pitch, offsets, count, results);
 }
 
 static inline int32_t sqr7u_inner(int8_t *a, int8_t *b, const int32_t dims) {
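
Note: two helpers here are defined outside this diff: prefetch (shown in the new amd64_vec_common.h below) and safe_mapper_offset, which is not shown anywhere in this view. A hedged sketch of the latter, inferred only from its call sites -- the clamping rule is an assumption, chosen so that priming a0/a1 never reads offsets[] out of bounds when count is smaller than a full batch:

#include <stdint.h>

// Hypothetical reconstruction -- not the actual code from the shared headers.
template <int N, int64_t (*mapper)(const int32_t, const int32_t*)>
static inline const int8_t* safe_mapper_offset(
    const int8_t* a, const int32_t pitch, const int32_t* offsets, const int32_t count
) {
    // Clamp the requested batch slot to the last valid vector index.
    const int32_t i = N < count ? N : (count > 0 ? count - 1 : 0);
    return a + mapper(i, offsets) * pitch;
}

Note also the batch-loop guard c + 3 < count: it keeps the next-batch lookups mapper(c + 2, ...) and mapper(c + 3, ...) in range, leaving up to three vectors for the tail loop.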

libs/simdvec/native/src/vec/c/amd64/vec_2.cpp

Lines changed: 61 additions & 30 deletions
@@ -14,16 +14,19 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <math.h>
-#include "vec.h"
 
-// AVX-512 code
+// Force the preprocessor to pick up AVX-512 intrinsics, and the compiler to emit AVX-512 code
 #ifdef __clang__
 #pragma clang attribute push(__attribute__((target("arch=skylake-avx512"))), apply_to=function)
 #elif __GNUC__
 #pragma GCC push_options
 #pragma GCC target ("arch=skylake-avx512")
 #endif
 
+#include "vec.h"
+#include "vec_common.h"
+#include "amd64/amd64_vec_common.h"
+
 // Includes for intrinsics
 #ifdef _MSC_VER
 #include <intrin.h>
@@ -133,42 +136,70 @@ static inline void dot7u_inner_bulk(
     const int32_t count,
     f32_t* results
 ) {
-    if (dims > STRIDE_BYTES_LEN) {
-        const int limit = dims & ~(STRIDE_BYTES_LEN - 1);
-        for (int32_t c = 0; c < count; c++) {
-            const int8_t* a0 = a + (mapper(c, offsets) * pitch);
-            int i = limit;
-            int32_t res = dot7u_inner_avx512(a0, b, i);
-            for (; i < dims; i++) {
-                res += a0[i] * b[i];
-            }
-            results[c] = (f32_t)res;
-        }
-    } else {
-        for (int32_t c = 0; c < count; c++) {
-            const int8_t* a0 = a + (mapper(c, offsets) * pitch);
-            int32_t res = 0;
-            for (int32_t i = 0; i < dims; i++) {
-                res += a0[i] * b[i];
-            }
-            results[c] = (f32_t)res;
-        }
-    }
+    const int blk = dims & ~(STRIDE_BYTES_LEN - 1);
+    const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1;
+    int c = 0;
+
+    const int8_t* a0 = safe_mapper_offset<0, mapper>(a, pitch, offsets, count);
+    const int8_t* a1 = safe_mapper_offset<1, mapper>(a, pitch, offsets, count);
+    const int8_t* a2 = safe_mapper_offset<2, mapper>(a, pitch, offsets, count);
+    const int8_t* a3 = safe_mapper_offset<3, mapper>(a, pitch, offsets, count);
+
+    // Process a batch of 4 vectors at a time, after instructing the CPU to
+    // prefetch the next batch.
+    // Prefetching multiple memory locations while computing keeps the CPU
+    // execution units busy.
+    for (; c + 7 < count; c += 4) {
+        const int8_t* next_a0 = a + mapper(c + 4, offsets) * pitch;
+        const int8_t* next_a1 = a + mapper(c + 5, offsets) * pitch;
+        const int8_t* next_a2 = a + mapper(c + 6, offsets) * pitch;
+        const int8_t* next_a3 = a + mapper(c + 7, offsets) * pitch;
+
+        prefetch(next_a0, lines_to_fetch);
+        prefetch(next_a1, lines_to_fetch);
+        prefetch(next_a2, lines_to_fetch);
+        prefetch(next_a3, lines_to_fetch);
+
+        int32_t res0 = 0;
+        int32_t res1 = 0;
+        int32_t res2 = 0;
+        int32_t res3 = 0;
+        int i = 0;
+        if (dims > STRIDE_BYTES_LEN) {
+            i = blk;
+            res0 = dot7u_inner_avx512(a0, b, i);
+            res1 = dot7u_inner_avx512(a1, b, i);
+            res2 = dot7u_inner_avx512(a2, b, i);
+            res3 = dot7u_inner_avx512(a3, b, i);
+        }
+        for (; i < dims; i++) {
+            const int8_t bb = b[i];
+            res0 += a0[i] * bb;
+            res1 += a1[i] * bb;
+            res2 += a2[i] * bb;
+            res3 += a3[i] * bb;
+        }
+        results[c + 0] = (f32_t)res0;
+        results[c + 1] = (f32_t)res1;
+        results[c + 2] = (f32_t)res2;
+        results[c + 3] = (f32_t)res3;
+        a0 = next_a0;
+        a1 = next_a1;
+        a2 = next_a2;
+        a3 = next_a3;
+    }
+
+    // Tail-handling: remaining vectors
+    for (; c < count; c++) {
+        const int8_t* a0 = a + mapper(c, offsets) * pitch;
+        results[c] = (f32_t)vec_dot7u_2(a0, b, dims);
+    }
 }
 
-static inline int64_t identity(const int32_t i, const int32_t* offsets) {
-    return i;
-}
-
-static inline int64_t index(const int32_t i, const int32_t* offsets) {
-    return offsets[i];
-}
-
 EXPORT void vec_dot7u_bulk_2(const int8_t* a, const int8_t* b, const int32_t dims, const int32_t count, f32_t* results) {
-    dot7u_inner_bulk<identity>(a, b, dims, dims, NULL, count, results);
+    dot7u_inner_bulk<identity_mapper>(a, b, dims, dims, NULL, count, results);
 }
 
-
 EXPORT void vec_dot7u_bulk_offsets_2(
     const int8_t* a,
     const int8_t* b,
@@ -177,7 +208,7 @@ EXPORT void vec_dot7u_bulk_offsets_2(
     const int32_t* offsets,
     const int32_t count,
     f32_t* results) {
-    dot7u_inner_bulk<index>(a, b, dims, pitch, offsets, count, results);
+    dot7u_inner_bulk<array_mapper>(a, b, dims, pitch, offsets, count, results);
 }
 
 template<int offsetRegs>
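
For context, a minimal caller sketch against the exported API above. The contiguous layout follows from vec_dot7u_bulk_2 passing pitch == dims with identity_mapper; the buffer contents are made up, and the EXPORT/linkage details depend on vec.h, which this view does not show:

#include <cstdint>
#include <cstdio>

typedef float f32_t;

// Signature copied from the diff above.
void vec_dot7u_bulk_2(const int8_t* a, const int8_t* b, const int32_t dims,
                      const int32_t count, f32_t* results);

int main() {
    constexpr int DIMS = 64;
    constexpr int COUNT = 4;
    static int8_t stored[COUNT * DIMS]; // COUNT int7 vectors, packed back to back
    static int8_t query[DIMS];
    for (int i = 0; i < COUNT * DIMS; i++) stored[i] = (int8_t)(i % 128); // 7-bit range: 0..127
    for (int i = 0; i < DIMS; i++) query[i] = (int8_t)(i % 128);

    f32_t scores[COUNT];
    vec_dot7u_bulk_2(stored, query, DIMS, COUNT, scores);
    for (int c = 0; c < COUNT; c++) {
        std::printf("score[%d] = %.1f\n", c, scores[c]);
    }
    return 0;
}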

aarch64/aarch64_vec_common.h (new file)

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+#ifndef AARCH64_VEC_COMMON_INCLUDED
+#define AARCH64_VEC_COMMON_INCLUDED
+
+#include "vec_common.h"
+
+#ifdef __APPLE__
+// Cache line size is 128 bytes on Apple M silicon
+// Source: sysctl -a hw machdep.cpu | grep hw.cachelinesize
+#define CACHE_LINE_SIZE 128
+#elif __linux__
+// We mostly care about ARMv8a like Neoverse N1 (e.g. Graviton 2) and V1 (e.g. Graviton 3), and ARMv9a
+// like Neoverse V2 (e.g. Graviton 4) architectures.
+// They all have cache lines of 64 bytes. See:
+// - https://developer.arm.com/documentation/100616/0401/L2-memory-system/About-the-L2-memory-system (Graviton CPUs)
+// - https://documentation-service.arm.com/static/66ace927882fec713ef4819f
+// - https://developer.arm.com/documentation/102375/latest
+#define CACHE_LINE_SIZE 64
+#else
+#error "Unsupported aarch64 platform"
+#endif
+
+static inline void prefetch(const void* ptr, int lines) {
+    const uintptr_t base = align_downwards<CACHE_LINE_SIZE>(ptr);
+    for (int k = 0; k < lines; ++k) {
+        __builtin_prefetch((void*)(base + k * CACHE_LINE_SIZE));
+    }
+}
+
+#endif // AARCH64_VEC_COMMON_INCLUDED
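
Both prefetch helpers rely on align_downwards from vec_common.h, which is also hidden in this view. A minimal sketch, assuming it masks the address down to an ALIGN-byte boundary and returns it as an integer so callers can step through cache lines (the name comes from the calls above; the body is a reconstruction):

#include <stdint.h>

template <uintptr_t ALIGN>
static inline uintptr_t align_downwards(const void* ptr) {
    static_assert((ALIGN & (ALIGN - 1)) == 0, "ALIGN must be a power of two");
    // Clear the low bits: the result is the start of the cache line holding ptr.
    return (uintptr_t)ptr & ~(uintptr_t)(ALIGN - 1);
}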

amd64/amd64_vec_common.h (new file)

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+#ifndef AMD64_VEC_COMMON_INCLUDED
+#define AMD64_VEC_COMMON_INCLUDED
+
+#include "vec_common.h"
+#include <immintrin.h>
+
+#define CACHE_LINE_SIZE 64
+
+static inline void prefetch(const void* ptr, int lines) {
+    const uintptr_t base = align_downwards<CACHE_LINE_SIZE>(ptr);
+    for (int k = 0; k < lines; ++k) {
+        _mm_prefetch((void*)(base + k * CACHE_LINE_SIZE), _MM_HINT_T0);
+    }
+}
+
+#endif // AMD64_VEC_COMMON_INCLUDED
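
Worked example of the prefetch depth used by the bulk kernels above: with dims = 1024 and 64-byte cache lines, lines_to_fetch = 1024 / 64 + 1 = 17, so each prefetch(next_aN, lines_to_fetch) call touches the 17 consecutive lines a 1024-byte vector can span; the +1 covers a vector that straddles one extra line after the base address is aligned downwards.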
