Commit 7a43b63

Merge branch 'main' into esql-inference-ccs-v2

2 parents a0b2abd + 3064da0

107 files changed: +1476 -828 lines


docs/changelog/139069.yaml

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+pr: 139069
+summary: "[SIMD][x64] Optimized native bulk dot product scoring for Int7"
+area: Vector Search
+type: enhancement
+issues: []
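
Note: judging by the dot7u kernel names in the hunks below, "Int7" refers to 7-bit unsigned quantized vector components (values 0..127) stored in int8 buffers, which is why the scalar tails can accumulate a0[i] * b[i] products directly into int32 accumulators without overflow concerns at realistic dimension counts.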

libs/native/libraries/build.gradle

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ configurations {
 }
 
 var zstdVersion = "1.5.5"
-var vecVersion = "1.0.18"
+var vecVersion = "1.0.19"
 
 repositories {
     exclusiveContent {

libs/simdvec/native/publish_vec_binaries.sh

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ if [ -z "$ARTIFACTORY_API_KEY" ]; then
   exit 1;
 fi
 
-VERSION="1.0.18"
+VERSION="1.0.19"
 ARTIFACTORY_REPOSITORY="${ARTIFACTORY_REPOSITORY:-https://artifactory.elastic.dev/artifactory/elasticsearch-native/}"
 TEMP=$(mktemp -d)

libs/simdvec/native/src/vec/c/aarch64/vec_1.cpp

Lines changed: 15 additions & 19 deletions
@@ -16,6 +16,8 @@
 #include <arm_neon.h>
 #include <math.h>
 #include "vec.h"
+#include "vec_common.h"
+#include "aarch64/aarch64_vec_common.h"
 
 #ifndef DOT7U_STRIDE_BYTES_LEN
 #define DOT7U_STRIDE_BYTES_LEN 32 // Must be a power of 2
@@ -110,12 +112,16 @@ static inline void dot7u_inner_bulk(
     const int32_t count,
     f32_t* results
 ) {
-    size_t blk = dims & ~15;
-    size_t c = 0;
-
-    // f32_t first_offset = int_bits_to_float(*((const int32_t*)(b + dims)));
-
-    // Process 4 vectors at a time
+    const int blk = dims & ~15;
+    int c = 0;
+
+    // Process 4 vectors at a time; this helps the CPU scheduler/prefetcher.
+    // Loading multiple memory locations while computing gives the prefetcher
+    // information on where the data to load will be next, and keeps the CPU
+    // execution units busy.
+    // Our benchmarks show that this "hint" is more effective than using
+    // explicit prefetch instructions (e.g. __builtin_prefetch) on many ARM
+    // processors (e.g. Graviton)
     for (; c + 3 < count; c += 4) {
         const int8_t* a0 = a + mapper(c, offsets) * pitch;
         const int8_t* a1 = a + mapper(c + 1, offsets) * pitch;
@@ -177,33 +183,23 @@ static inline void dot7u_inner_bulk(
                 acc_scalar3 += a3[t] * bb;
             }
         }
-        // f32_t second_offset_0 = int_bits_to_float(*((const int32_t*)(a0 + dims)));
         results[c + 0] = (f32_t)acc_scalar0;
         results[c + 1] = (f32_t)acc_scalar1;
         results[c + 2] = (f32_t)acc_scalar2;
         results[c + 3] = (f32_t)acc_scalar3;
     }
 
-    // Tail-handling: remaining 0..3 vectors
+    // Tail-handling: remaining vectors
     for (; c < count; c++) {
         const int8_t* a0 = a + mapper(c, offsets) * pitch;
         results[c] = (f32_t)vec_dot7u(a0, b, dims);
     }
 }
 
-static inline int64_t identity(const int32_t i, const int32_t* offsets) {
-    return i;
-}
-
-static inline int64_t index(const int32_t i, const int32_t* offsets) {
-    return offsets[i];
-}
-
 EXPORT void vec_dot7u_bulk(const int8_t* a, const int8_t* b, const int32_t dims, const int32_t count, f32_t* results) {
-    dot7u_inner_bulk<identity>(a, b, dims, dims, NULL, count, results);
+    dot7u_inner_bulk<identity_mapper>(a, b, dims, dims, NULL, count, results);
 }
 
-
 EXPORT void vec_dot7u_bulk_offsets(
     const int8_t* a,
     const int8_t* b,
@@ -212,7 +208,7 @@ EXPORT void vec_dot7u_bulk_offsets(
     const int32_t* offsets,
     const int32_t count,
     f32_t* results) {
-    dot7u_inner_bulk<index>(a, b, dims, pitch, offsets, count, results);
+    dot7u_inner_bulk<array_mapper>(a, b, dims, pitch, offsets, count, results);
 }
 
 static inline int32_t sqr7u_inner(int8_t *a, int8_t *b, const int32_t dims) {
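
Note: the identity/index helpers deleted above now come from the shared vec_common.h as identity_mapper and array_mapper; that header is hidden in this commit view. A minimal sketch of what the shared mappers presumably look like, reconstructed from the deleted locals (only the names are confirmed by the call sites):

#include <stdint.h>

// identity_mapper: vectors are packed contiguously, so the c-th vector starts
// at a + c * pitch (vec_dot7u_bulk passes pitch == dims).
static inline int64_t identity_mapper(const int32_t i, const int32_t* offsets) {
    return i;
}

// array_mapper: the c-th score is computed against the row named by offsets[c],
// letting callers score an arbitrary subset of the stored vectors.
static inline int64_t array_mapper(const int32_t i, const int32_t* offsets) {
    return offsets[i];
}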

libs/simdvec/native/src/vec/c/amd64/vec_1.cpp

Lines changed: 44 additions & 28 deletions
@@ -15,6 +15,8 @@
 #include <stdint.h>
 #include <math.h>
 #include "vec.h"
+#include "vec_common.h"
+#include "amd64/amd64_vec_common.h"
 
 #include <emmintrin.h>
 #include <immintrin.h>
@@ -167,42 +169,56 @@ static inline void dot7u_inner_bulk(
     const int32_t count,
     f32_t* results
 ) {
-    if (dims > STRIDE_BYTES_LEN) {
-        const int limit = dims & ~(STRIDE_BYTES_LEN - 1);
-        for (int32_t c = 0; c < count; c++) {
-            const int8_t* a0 = a + (mapper(c, offsets) * pitch);
-            int i = limit;
-            int32_t res = dot7u_inner(a0, b, i);
-            for (; i < dims; i++) {
-                res += a0[i] * b[i];
-            }
-            results[c] = (f32_t)res;
-        }
-    } else {
-        for (int32_t c = 0; c < count; c++) {
-            const int8_t* a0 = a + (mapper(c, offsets) * pitch);
-            int32_t res = 0;
-            for (int32_t i = 0; i < dims; i++) {
-                res += a0[i] * b[i];
-            }
-            results[c] = (f32_t)res;
-        }
-    }
+    const int blk = dims & ~(STRIDE_BYTES_LEN - 1);
+    const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1;
+    int c = 0;
+
+    const int8_t* a0 = safe_mapper_offset<0, mapper>(a, pitch, offsets, count);
+    const int8_t* a1 = safe_mapper_offset<1, mapper>(a, pitch, offsets, count);
+
+    // Process a batch of 2 vectors at a time, after instructing the CPU to
+    // prefetch the next batch.
+    // Prefetching multiple memory locations while computing keeps the CPU
+    // execution units busy. For this "older" generation of x64 processors
+    // (supporting AVX2, but not AVX-512), benchmarks show that a batch of 2
+    // is ideal -- more, and it starts to hurt performance due to bandwidth
+    for (; c + 3 < count; c += 2) {
+        const int8_t* next_a0 = a + mapper(c + 2, offsets) * pitch;
+        const int8_t* next_a1 = a + mapper(c + 3, offsets) * pitch;
+
+        prefetch(next_a0, lines_to_fetch);
+        prefetch(next_a1, lines_to_fetch);
+
+        int32_t res0 = 0;
+        int32_t res1 = 0;
+        int i = 0;
+        if (dims > STRIDE_BYTES_LEN) {
+            i = blk;
+            res0 = dot7u_inner(a0, b, i);
+            res1 = dot7u_inner(a1, b, i);
+        }
+        for (; i < dims; i++) {
+            const int8_t bb = b[i];
+            res0 += a0[i] * bb;
+            res1 += a1[i] * bb;
+        }
+        results[c + 0] = (f32_t)res0;
+        results[c + 1] = (f32_t)res1;
+        a0 = next_a0;
+        a1 = next_a1;
+    }
+
+    // Tail-handling: remaining vectors
+    for (; c < count; c++) {
+        const int8_t* a0 = a + mapper(c, offsets) * pitch;
+        results[c] = (f32_t)vec_dot7u(a0, b, dims);
+    }
 }
 
-static inline int64_t identity(const int32_t i, const int32_t* offsets) {
-    return i;
-}
-
-static inline int64_t index(const int32_t i, const int32_t* offsets) {
-    return offsets[i];
-}
-
 EXPORT void vec_dot7u_bulk(const int8_t* a, const int8_t* b, const int32_t dims, const int32_t count, f32_t* results) {
-    dot7u_inner_bulk<identity>(a, b, dims, dims, NULL, count, results);
+    dot7u_inner_bulk<identity_mapper>(a, b, dims, dims, NULL, count, results);
 }
 
-
 EXPORT void vec_dot7u_bulk_offsets(
     const int8_t* a,
     const int8_t* b,
@@ -211,7 +227,7 @@ EXPORT void vec_dot7u_bulk_offsets(
    const int32_t* offsets,
    const int32_t count,
    f32_t* results) {
-    dot7u_inner_bulk<index>(a, b, dims, pitch, offsets, count, results);
+    dot7u_inner_bulk<array_mapper>(a, b, dims, pitch, offsets, count, results);
 }
 
 static inline int32_t sqr7u_inner(int8_t *a, int8_t *b, const int32_t dims) {
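
Note: two helpers here are defined outside this diff: prefetch (shown in the new amd64_vec_common.h below) and safe_mapper_offset, which is not shown anywhere in this view. A hedged sketch of the latter, inferred only from its call sites -- the clamping rule is an assumption, chosen so that priming a0/a1 never reads offsets[] out of bounds when count is smaller than a full batch:

#include <stdint.h>

// Hypothetical reconstruction -- not the actual code from the shared headers.
template <int N, int64_t (*mapper)(const int32_t, const int32_t*)>
static inline const int8_t* safe_mapper_offset(
    const int8_t* a, const int32_t pitch, const int32_t* offsets, const int32_t count
) {
    // Clamp the requested batch slot to the last valid vector index.
    const int32_t i = N < count ? N : (count > 0 ? count - 1 : 0);
    return a + mapper(i, offsets) * pitch;
}

Note also the batch-loop guard c + 3 < count: it keeps the next-batch lookups mapper(c + 2, ...) and mapper(c + 3, ...) in range, leaving up to three vectors for the tail loop.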

libs/simdvec/native/src/vec/c/amd64/vec_2.cpp

Lines changed: 61 additions & 30 deletions
@@ -14,16 +14,19 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <math.h>
-#include "vec.h"
 
-// AVX-512 code
+// Force the preprocessor to pick up AVX-512 intrinsics, and the compiler to emit AVX-512 code
 #ifdef __clang__
 #pragma clang attribute push(__attribute__((target("arch=skylake-avx512"))), apply_to=function)
 #elif __GNUC__
 #pragma GCC push_options
 #pragma GCC target ("arch=skylake-avx512")
 #endif
 
+#include "vec.h"
+#include "vec_common.h"
+#include "amd64/amd64_vec_common.h"
+
 // Includes for intrinsics
 #ifdef _MSC_VER
 #include <intrin.h>
@@ -133,42 +136,70 @@ static inline void dot7u_inner_bulk(
     const int32_t count,
     f32_t* results
 ) {
-    if (dims > STRIDE_BYTES_LEN) {
-        const int limit = dims & ~(STRIDE_BYTES_LEN - 1);
-        for (int32_t c = 0; c < count; c++) {
-            const int8_t* a0 = a + (mapper(c, offsets) * pitch);
-            int i = limit;
-            int32_t res = dot7u_inner_avx512(a0, b, i);
-            for (; i < dims; i++) {
-                res += a0[i] * b[i];
-            }
-            results[c] = (f32_t)res;
-        }
-    } else {
-        for (int32_t c = 0; c < count; c++) {
-            const int8_t* a0 = a + (mapper(c, offsets) * pitch);
-            int32_t res = 0;
-            for (int32_t i = 0; i < dims; i++) {
-                res += a0[i] * b[i];
-            }
-            results[c] = (f32_t)res;
-        }
-    }
+    const int blk = dims & ~(STRIDE_BYTES_LEN - 1);
+    const int lines_to_fetch = dims / CACHE_LINE_SIZE + 1;
+    int c = 0;
+
+    const int8_t* a0 = safe_mapper_offset<0, mapper>(a, pitch, offsets, count);
+    const int8_t* a1 = safe_mapper_offset<1, mapper>(a, pitch, offsets, count);
+    const int8_t* a2 = safe_mapper_offset<2, mapper>(a, pitch, offsets, count);
+    const int8_t* a3 = safe_mapper_offset<3, mapper>(a, pitch, offsets, count);
+
+    // Process a batch of 4 vectors at a time, after instructing the CPU to
+    // prefetch the next batch.
+    // Prefetching multiple memory locations while computing keeps the CPU
+    // execution units busy.
+    for (; c + 7 < count; c += 4) {
+        const int8_t* next_a0 = a + mapper(c + 4, offsets) * pitch;
+        const int8_t* next_a1 = a + mapper(c + 5, offsets) * pitch;
+        const int8_t* next_a2 = a + mapper(c + 6, offsets) * pitch;
+        const int8_t* next_a3 = a + mapper(c + 7, offsets) * pitch;
+
+        prefetch(next_a0, lines_to_fetch);
+        prefetch(next_a1, lines_to_fetch);
+        prefetch(next_a2, lines_to_fetch);
+        prefetch(next_a3, lines_to_fetch);
+
+        int32_t res0 = 0;
+        int32_t res1 = 0;
+        int32_t res2 = 0;
+        int32_t res3 = 0;
+        int i = 0;
+        if (dims > STRIDE_BYTES_LEN) {
+            i = blk;
+            res0 = dot7u_inner_avx512(a0, b, i);
+            res1 = dot7u_inner_avx512(a1, b, i);
+            res2 = dot7u_inner_avx512(a2, b, i);
+            res3 = dot7u_inner_avx512(a3, b, i);
+        }
+        for (; i < dims; i++) {
+            const int8_t bb = b[i];
+            res0 += a0[i] * bb;
+            res1 += a1[i] * bb;
+            res2 += a2[i] * bb;
+            res3 += a3[i] * bb;
+        }
+        results[c + 0] = (f32_t)res0;
+        results[c + 1] = (f32_t)res1;
+        results[c + 2] = (f32_t)res2;
+        results[c + 3] = (f32_t)res3;
+        a0 = next_a0;
+        a1 = next_a1;
+        a2 = next_a2;
+        a3 = next_a3;
+    }
+
+    // Tail-handling: remaining vectors
+    for (; c < count; c++) {
+        const int8_t* a0 = a + mapper(c, offsets) * pitch;
+        results[c] = (f32_t)vec_dot7u_2(a0, b, dims);
+    }
 }
 
-static inline int64_t identity(const int32_t i, const int32_t* offsets) {
-    return i;
-}
-
-static inline int64_t index(const int32_t i, const int32_t* offsets) {
-    return offsets[i];
-}
-
 EXPORT void vec_dot7u_bulk_2(const int8_t* a, const int8_t* b, const int32_t dims, const int32_t count, f32_t* results) {
-    dot7u_inner_bulk<identity>(a, b, dims, dims, NULL, count, results);
+    dot7u_inner_bulk<identity_mapper>(a, b, dims, dims, NULL, count, results);
 }
 
-
 EXPORT void vec_dot7u_bulk_offsets_2(
     const int8_t* a,
     const int8_t* b,
@@ -177,7 +208,7 @@ EXPORT void vec_dot7u_bulk_offsets_2(
     const int32_t* offsets,
     const int32_t count,
     f32_t* results) {
-    dot7u_inner_bulk<index>(a, b, dims, pitch, offsets, count, results);
+    dot7u_inner_bulk<array_mapper>(a, b, dims, pitch, offsets, count, results);
 }
 
 template<int offsetRegs>
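
For context, a minimal caller sketch against the exported API above. The contiguous layout follows from vec_dot7u_bulk_2 passing pitch == dims with identity_mapper; the buffer contents are made up, and the EXPORT/linkage details depend on vec.h, which this view does not show:

#include <cstdint>
#include <cstdio>

typedef float f32_t;

// Signature copied from the diff above.
void vec_dot7u_bulk_2(const int8_t* a, const int8_t* b, const int32_t dims,
                      const int32_t count, f32_t* results);

int main() {
    constexpr int DIMS = 64;
    constexpr int COUNT = 4;
    static int8_t stored[COUNT * DIMS]; // COUNT int7 vectors, packed back to back
    static int8_t query[DIMS];
    for (int i = 0; i < COUNT * DIMS; i++) stored[i] = (int8_t)(i % 128); // 7-bit range: 0..127
    for (int i = 0; i < DIMS; i++) query[i] = (int8_t)(i % 128);

    f32_t scores[COUNT];
    vec_dot7u_bulk_2(stored, query, DIMS, COUNT, scores);
    for (int c = 0; c < COUNT; c++) {
        std::printf("score[%d] = %.1f\n", c, scores[c]);
    }
    return 0;
}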

aarch64/aarch64_vec_common.h (new file)

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+#ifndef AARCH64_VEC_COMMON_INCLUDED
+#define AARCH64_VEC_COMMON_INCLUDED
+
+#include "vec_common.h"
+
+#ifdef __APPLE__
+// Cache line size is 128 bytes on Apple M silicon
+// Source: sysctl -a hw machdep.cpu | grep hw.cachelinesize
+#define CACHE_LINE_SIZE 128
+#elif __linux__
+// We mostly care about ARMv8a like Neoverse N1 (e.g. Graviton 2) and V1 (e.g. Graviton 3), and ARMv9a
+// like Neoverse V2 (e.g. Graviton 4) architectures.
+// They all have cache lines of 64 bytes. See:
+// - https://developer.arm.com/documentation/100616/0401/L2-memory-system/About-the-L2-memory-system (Graviton CPUs)
+// - https://documentation-service.arm.com/static/66ace927882fec713ef4819f
+// - https://developer.arm.com/documentation/102375/latest
+#define CACHE_LINE_SIZE 64
+#else
+#error "Unsupported aarch64 platform"
+#endif
+
+static inline void prefetch(const void* ptr, int lines) {
+    const uintptr_t base = align_downwards<CACHE_LINE_SIZE>(ptr);
+    for (int k = 0; k < lines; ++k) {
+        __builtin_prefetch((void*)(base + k * CACHE_LINE_SIZE));
+    }
+}
+
+#endif // AARCH64_VEC_COMMON_INCLUDED
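
Both prefetch helpers rely on align_downwards from vec_common.h, which is also hidden in this view. A minimal sketch, assuming it masks the address down to an ALIGN-byte boundary and returns it as an integer so callers can step through cache lines (the name comes from the calls above; the body is a reconstruction):

#include <stdint.h>

template <uintptr_t ALIGN>
static inline uintptr_t align_downwards(const void* ptr) {
    static_assert((ALIGN & (ALIGN - 1)) == 0, "ALIGN must be a power of two");
    // Clear the low bits: the result is the start of the cache line holding ptr.
    return (uintptr_t)ptr & ~(uintptr_t)(ALIGN - 1);
}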

amd64/amd64_vec_common.h (new file)

Lines changed: 16 additions & 0 deletions

@@ -0,0 +1,16 @@
+#ifndef AMD64_VEC_COMMON_INCLUDED
+#define AMD64_VEC_COMMON_INCLUDED
+
+#include "vec_common.h"
+#include <immintrin.h>
+
+#define CACHE_LINE_SIZE 64
+
+static inline void prefetch(const void* ptr, int lines) {
+    const uintptr_t base = align_downwards<CACHE_LINE_SIZE>(ptr);
+    for (int k = 0; k < lines; ++k) {
+        _mm_prefetch((void*)(base + k * CACHE_LINE_SIZE), _MM_HINT_T0);
+    }
+}
+
+#endif // AMD64_VEC_COMMON_INCLUDED
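
Worked example of the prefetch depth used by the bulk kernels above: with dims = 1024 and 64-byte cache lines, lines_to_fetch = 1024 / 64 + 1 = 17, so each prefetch(next_aN, lines_to_fetch) call touches the 17 consecutive lines a 1024-byte vector can span; the +1 covers a vector that straddles one extra line after the base address is aligned downwards.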
