[SIMD] Move native/vec code to C++ (elastic#138525)

ldematte · ncordon · commit c9f4c8e83fe0 · 2025-11-26T12:37:07.000+01:00
Native code in the simdvec library is currently a mix of C and C++ code.
We found that C++ templates help reduce source code duplication while retaining the same great (sometimes even greater) degree of code inlining and expansion (e.g. loop unrolling) that we rely on to maximize performance.

This PR moves all existing C code to C++; in general, it's just a matter of renaming and disabling name mangling, plus renaming the exported functions. This last step is needed because the old names clash with the "extended" math functions (e.g. cosf32, which is the cosine function over float32_t types). Now all the exported symbols have a vec_ prefix.
diff --git a/libs/native/libraries/build.gradle b/libs/native/libraries/build.gradle
@@ -19,7 +19,7 @@ configurations {
 }
 
 var zstdVersion = "1.5.5"
-var vecVersion = "1.0.16"
+var vecVersion = "1.0.17"
 
 repositories {
   exclusiveContent {
diff --git a/libs/native/src/main/java/org/elasticsearch/nativeaccess/jdk/JdkVectorLibrary.java b/libs/native/src/main/java/org/elasticsearch/nativeaccess/jdk/JdkVectorLibrary.java
@@ -50,63 +50,63 @@ public final class JdkVectorLibrary implements VectorLibrary {
             if (caps > 0) {
                 if (caps == 2) {
                     dot7u$mh = downcallHandle(
-                        "dot7u_2",
+                        "vec_dot7u_2",
                         FunctionDescriptor.of(JAVA_INT, ADDRESS, ADDRESS, JAVA_INT),
                         LinkerHelperUtil.critical()
                     );
                     dot7uBulk$mh = downcallHandle(
-                        "dot7u_bulk_2",
+                        "vec_dot7u_bulk_2",
                         FunctionDescriptor.ofVoid(ADDRESS, ADDRESS, JAVA_INT, JAVA_INT, ADDRESS),
                         LinkerHelperUtil.critical()
                     );
                     sqr7u$mh = downcallHandle(
-                        "sqr7u_2",
+                        "vec_sqr7u_2",
                         FunctionDescriptor.of(JAVA_INT, ADDRESS, ADDRESS, JAVA_INT),
                         LinkerHelperUtil.critical()
                     );
                     cosf32$mh = downcallHandle(
-                        "cosf32_2",
+                        "vec_cosf32_2",
                         FunctionDescriptor.of(JAVA_FLOAT, ADDRESS, ADDRESS, JAVA_INT),
                         LinkerHelperUtil.critical()
                     );
                     dotf32$mh = downcallHandle(
-                        "dotf32_2",
+                        "vec_dotf32_2",
                         FunctionDescriptor.of(JAVA_FLOAT, ADDRESS, ADDRESS, JAVA_INT),
                         LinkerHelperUtil.critical()
                     );
                     sqrf32$mh = downcallHandle(
-                        "sqrf32_2",
+                        "vec_sqrf32_2",
                         FunctionDescriptor.of(JAVA_FLOAT, ADDRESS, ADDRESS, JAVA_INT),
                         LinkerHelperUtil.critical()
                     );
                 } else {
                     dot7u$mh = downcallHandle(
-                        "dot7u",
+                        "vec_dot7u",
                         FunctionDescriptor.of(JAVA_INT, ADDRESS, ADDRESS, JAVA_INT),
                         LinkerHelperUtil.critical()
                     );
                     dot7uBulk$mh = downcallHandle(
-                        "dot7u_bulk",
+                        "vec_dot7u_bulk",
                         FunctionDescriptor.ofVoid(ADDRESS, ADDRESS, JAVA_INT, JAVA_INT, ADDRESS),
                         LinkerHelperUtil.critical()
                     );
                     sqr7u$mh = downcallHandle(
-                        "sqr7u",
+                        "vec_sqr7u",
                         FunctionDescriptor.of(JAVA_INT, ADDRESS, ADDRESS, JAVA_INT),
                         LinkerHelperUtil.critical()
                     );
                     cosf32$mh = downcallHandle(
-                        "cosf32",
+                        "vec_cosf32",
                         FunctionDescriptor.of(JAVA_FLOAT, ADDRESS, ADDRESS, JAVA_INT),
                         LinkerHelperUtil.critical()
                     );
                     dotf32$mh = downcallHandle(
-                        "dotf32",
+                        "vec_dotf32",
                         FunctionDescriptor.of(JAVA_FLOAT, ADDRESS, ADDRESS, JAVA_INT),
                         LinkerHelperUtil.critical()
                     );
                     sqrf32$mh = downcallHandle(
-                        "sqrf32",
+                        "vec_sqrf32",
                         FunctionDescriptor.of(JAVA_FLOAT, ADDRESS, ADDRESS, JAVA_INT),
                         LinkerHelperUtil.critical()
                     );
diff --git a/libs/simdvec/native/build.gradle b/libs/simdvec/native/build.gradle
@@ -29,7 +29,7 @@ var os = org.gradle.internal.os.OperatingSystem.current()
 //  objdump --disassemble-symbols=_dot7u build/libs/vec/shared/aarch64/libvec.dylib
 // Note: symbol decoration may differ on Linux, i.e. the leading underscore is not present
 //
-// gcc -shared -fpic -o libvec.so -I src/vec/headers/ src/vec/c/vec.c -O3
+// g++ -shared -fpic -o libvec.so -I src/vec/headers/ src/vec/c/vec.c -O3
 
 group = 'org.elasticsearch'
 
@@ -47,12 +47,10 @@ model {
   toolChains {
     gcc(Gcc) {
       target("aarch64") {
-        cCompiler.executable = "/usr/bin/gcc"
-        cCompiler.withArguments { args -> args.addAll(["-O3", "-std=c11", "-march=armv8-a"]) }
+        cppCompiler.executable = "/usr/bin/g++"
+        cppCompiler.withArguments { args -> args.addAll(["-O3", "-march=armv8-a"]) }
       }
       target("amd64") {
-        cCompiler.executable = "/usr/bin/gcc"
-        cCompiler.withArguments { args -> args.addAll(["-O3", "-std=c11", "-march=core-avx2", "-Wno-incompatible-pointer-types"]) }
         cppCompiler.executable = "/usr/bin/g++"
         cppCompiler.withArguments { args -> args.addAll(["-O3", "-march=core-avx2"]) }
       }
@@ -61,17 +59,16 @@ model {
       eachPlatform { toolchain ->
         def platform = toolchain.getPlatform()
         if (platform.name == "x64") {
-          cCompiler.withArguments { args -> args.addAll(["/O2", "/LD", "-march=core-avx2"]) }
+          cppCompiler.withArguments { args -> args.addAll(["/O2", "/LD", "-march=core-avx2"]) }
         }
       }
     }
     clang(Clang) {
       target("aarch64") {
-        cCompiler.withArguments { args -> args.addAll(["-O3", "-std=c11", "-march=armv8-a"]) }
+        cppCompiler.withArguments { args -> args.addAll(["-O3", "-march=armv8-a"]) }
       }
 
       target("amd64") {
-        cCompiler.withArguments { args -> args.addAll(["-O3", "-std=c11", "-march=core-avx2"]) }
         cppCompiler.withArguments { args -> args.addAll(["-O3", "-march=core-avx2"]) }
       }
     }
diff --git a/libs/simdvec/native/publish_vec_binaries.sh b/libs/simdvec/native/publish_vec_binaries.sh
@@ -20,7 +20,7 @@ if [ -z "$ARTIFACTORY_API_KEY" ]; then
   exit 1;
 fi
 
-VERSION="1.0.16"
+VERSION="1.0.17"
 ARTIFACTORY_REPOSITORY="${ARTIFACTORY_REPOSITORY:-https://artifactory.elastic.dev/artifactory/elasticsearch-native/}"
 TEMP=$(mktemp -d)
 
diff --git a/libs/simdvec/native/src/vec/c/aarch64/vec_1.cpp b/libs/simdvec/native/src/vec/c/aarch64/vec_1.cpp
@@ -7,6 +7,11 @@
  * License v3.0 only", or the "Server Side Public License, v 1".
  */
 
+ // This file contains implementations for basic vector processing functionalities,
+ // including support for "1st tier" vector capabilities; in the case of ARM,
+ // this first tier include functions for processors supporting at least the NEON
+ // instruction set.
+
 #include <stddef.h>
 #include <arm_neon.h>
 #include <math.h>
@@ -48,7 +53,7 @@ EXPORT int vec_caps() {
 #endif
 }
 
-static inline int32_t dot7u_inner(int8_t* a, int8_t* b, const int32_t dims) {
+static inline int32_t dot7u_inner(const int8_t* a, const int8_t* b, const int32_t dims) {
     // We have contention in the instruction pipeline on the accumulation
     // registers if we use too few.
     int32x4_t acc1 = vdupq_n_s32(0);
@@ -82,7 +87,7 @@ static inline int32_t dot7u_inner(int8_t* a, int8_t* b, const int32_t dims) {
     return vaddvq_s32(vaddq_s32(acc5, acc6));
 }
 
-EXPORT int32_t dot7u(int8_t* a, int8_t* b, const int32_t dims) {
+EXPORT int32_t vec_dot7u(int8_t* a, int8_t* b, const int32_t dims) {
     int32_t res = 0;
     int i = 0;
     if (dims > DOT7U_STRIDE_BYTES_LEN) {
@@ -95,7 +100,7 @@ EXPORT int32_t dot7u(int8_t* a, int8_t* b, const int32_t dims) {
     return res;
 }
 
-EXPORT void dot7u_bulk(int8_t* a, const int8_t* b, const int32_t dims, const int32_t count, float_t* results) {
+EXPORT void vec_dot7u_bulk(int8_t* a, const int8_t* b, const int32_t dims, const int32_t count, f32_t* results) {
     int32_t res = 0;
     if (dims > DOT7U_STRIDE_BYTES_LEN) {
         const int limit = dims & ~(DOT7U_STRIDE_BYTES_LEN - 1);
@@ -105,7 +110,7 @@ EXPORT void dot7u_bulk(int8_t* a, const int8_t* b, const int32_t dims, const int
             for (; i < dims; i++) {
                 res += a[i] * b[i];
             }
-            results[c] = (float_t)res;
+            results[c] = (f32_t)res;
             a += dims;
         }
     } else {
@@ -114,7 +119,7 @@ EXPORT void dot7u_bulk(int8_t* a, const int8_t* b, const int32_t dims, const int
             for (int32_t i = 0; i < dims; i++) {
                 res += a[i] * b[i];
             }
-            results[c] = (float_t)res;
+            results[c] = (f32_t)res;
             a += dims;
         }
     }
@@ -145,7 +150,7 @@ static inline int32_t sqr7u_inner(int8_t *a, int8_t *b, const int32_t dims) {
     return vaddvq_s32(vaddq_s32(acc5, acc6));
 }
 
-EXPORT int32_t sqr7u(int8_t* a, int8_t* b, const int32_t dims) {
+EXPORT int32_t vec_sqr7u(int8_t* a, int8_t* b, const int32_t dims) {
     int32_t res = 0;
     int i = 0;
     if (dims > SQR7U_STRIDE_BYTES_LEN) {
@@ -164,7 +169,7 @@ EXPORT int32_t sqr7u(int8_t* a, int8_t* b, const int32_t dims) {
 // const f32_t *a  pointer to the first float vector
 // const f32_t *b  pointer to the second float vector
 // const int32_t elementCount  the number of floating point elements
-EXPORT f32_t dotf32(const f32_t *a, const f32_t *b, const int32_t elementCount) {
+EXPORT f32_t vec_dotf32(const f32_t *a, const f32_t *b, const int32_t elementCount) {
     float32x4_t sum0 = vdupq_n_f32(0.0f);
     float32x4_t sum1 = vdupq_n_f32(0.0f);
     float32x4_t sum2 = vdupq_n_f32(0.0f);
@@ -205,7 +210,7 @@ EXPORT f32_t dotf32(const f32_t *a, const f32_t *b, const int32_t elementCount)
 // const f32_t *a  pointer to the first float vector
 // const f32_t *b  pointer to the second float vector
 // const int32_t elementCount  the number of floating point elements
-EXPORT f32_t cosf32(const f32_t *a, const f32_t *b, const int32_t elementCount) {
+EXPORT f32_t vec_cosf32(const f32_t *a, const f32_t *b, const int32_t elementCount) {
     float32x4_t sum0 = vdupq_n_f32(0.0f);
     float32x4_t sum1 = vdupq_n_f32(0.0f);
     float32x4_t sum2 = vdupq_n_f32(0.0f);
@@ -277,7 +282,7 @@ EXPORT f32_t cosf32(const f32_t *a, const f32_t *b, const int32_t elementCount)
     return dot / denom;
 }
 
-EXPORT f32_t sqrf32(const f32_t *a, const f32_t *b, const int32_t elementCount) {
+EXPORT f32_t vec_sqrf32(const f32_t *a, const f32_t *b, const int32_t elementCount) {
     float32x4_t sum0 = vdupq_n_f32(0.0f);
     float32x4_t sum1 = vdupq_n_f32(0.0f);
     float32x4_t sum2 = vdupq_n_f32(0.0f);
diff --git a/libs/simdvec/native/src/vec/c/amd64/vec_1.cpp b/libs/simdvec/native/src/vec/c/amd64/vec_1.cpp
@@ -7,6 +7,10 @@
  * License v3.0 only", or the "Server Side Public License, v 1".
  */
 
+  // This file contains implementations for basic vector processing functionalities,
+  // including support for "1st tier" vector capabilities; in the case of x64,
+  // this first tier include functions for processors supporting at least AVX2.
+
 #include <stddef.h>
 #include <stdint.h>
 #include <math.h>
@@ -116,7 +120,7 @@ EXPORT int vec_caps() {
     return 0;
 }
 
-static inline int32_t dot7u_inner(int8_t* a, int8_t* b, const int32_t dims) {
+static inline int32_t dot7u_inner(const int8_t* a, const int8_t* b, const int32_t dims) {
     const __m256i ones = _mm256_set1_epi16(1);
 
     // Init accumulator(s) with 0
@@ -125,8 +129,8 @@ static inline int32_t dot7u_inner(int8_t* a, int8_t* b, const int32_t dims) {
 #pragma GCC unroll 4
     for(int i = 0; i < dims; i += STRIDE_BYTES_LEN) {
         // Load packed 8-bit integers
-        __m256i va1 = _mm256_loadu_si256(a + i);
-        __m256i vb1 = _mm256_loadu_si256(b + i);
+        __m256i va1 = _mm256_loadu_si256((const __m256i_u *)(a + i));
+        __m256i vb1 = _mm256_loadu_si256((const __m256i_u *)(b + i));
 
         // Perform multiplication and create 16-bit values
         // Vertically multiply each unsigned 8-bit integer from va with the corresponding
@@ -140,7 +144,7 @@ static inline int32_t dot7u_inner(int8_t* a, int8_t* b, const int32_t dims) {
     return hsum_i32_8(acc1);
 }
 
-EXPORT int32_t dot7u(int8_t* a, int8_t* b, const int32_t dims) {
+EXPORT int32_t vec_dot7u(int8_t* a, int8_t* b, const int32_t dims) {
     int32_t res = 0;
     int i = 0;
     if (dims > STRIDE_BYTES_LEN) {
@@ -153,7 +157,7 @@ EXPORT int32_t dot7u(int8_t* a, int8_t* b, const int32_t dims) {
     return res;
 }
 
-EXPORT void dot7u_bulk(int8_t* a, const int8_t* b, const int32_t dims, const int32_t count, float_t* results) {
+EXPORT void vec_dot7u_bulk(int8_t* a, const int8_t* b, const int32_t dims, const int32_t count, f32_t* results) {
     int32_t res = 0;
     if (dims > STRIDE_BYTES_LEN) {
         const int limit = dims & ~(STRIDE_BYTES_LEN - 1);
@@ -163,7 +167,7 @@ EXPORT void dot7u_bulk(int8_t* a, const int8_t* b, const int32_t dims, const int
             for (; i < dims; i++) {
                 res += a[i] * b[i];
             }
-            results[c] = (float_t)res;
+            results[c] = (f32_t)res;
             a += dims;
         }
     } else {
@@ -172,7 +176,7 @@ EXPORT void dot7u_bulk(int8_t* a, const int8_t* b, const int32_t dims, const int
             for (int32_t i = 0; i < dims; i++) {
                 res += a[i] * b[i];
             }
-            results[c] = (float_t)res;
+            results[c] = (f32_t)res;
             a += dims;
         }
     }
@@ -187,8 +191,8 @@ static inline int32_t sqr7u_inner(int8_t *a, int8_t *b, const int32_t dims) {
 #pragma GCC unroll 4
     for(int i = 0; i < dims; i += STRIDE_BYTES_LEN) {
         // Load packed 8-bit integers
-        __m256i va1 = _mm256_loadu_si256(a + i);
-        __m256i vb1 = _mm256_loadu_si256(b + i);
+        __m256i va1 = _mm256_loadu_si256((const __m256i_u *)(a + i));
+        __m256i vb1 = _mm256_loadu_si256((const __m256i_u *)(b + i));
 
         const __m256i dist1 = _mm256_sub_epi8(va1, vb1);
         const __m256i abs_dist1 = _mm256_sign_epi8(dist1, dist1);
@@ -200,7 +204,7 @@ static inline int32_t sqr7u_inner(int8_t *a, int8_t *b, const int32_t dims) {
     return hsum_i32_8(acc1);
 }
 
-EXPORT int32_t sqr7u(int8_t* a, int8_t* b, const int32_t dims) {
+EXPORT int32_t vec_sqr7u(int8_t* a, int8_t* b, const int32_t dims) {
     int32_t res = 0;
     int i = 0;
     if (dims > STRIDE_BYTES_LEN) {
@@ -236,7 +240,7 @@ static inline f32_t hsum_f32_8(const __m256 v) {
 // const f32_t *a  pointer to the first float vector
 // const f32_t *b  pointer to the second float vector
 // const int32_t elementCount  the number of floating point elements
-EXPORT f32_t cosf32(const f32_t *a, const f32_t *b, const int32_t elementCount) {
+EXPORT f32_t vec_cosf32(const f32_t *a, const f32_t *b, const int32_t elementCount) {
     __m256 dot0 = _mm256_setzero_ps();
     __m256 dot1 = _mm256_setzero_ps();
     __m256 dot2 = _mm256_setzero_ps();
@@ -309,7 +313,7 @@ EXPORT f32_t cosf32(const f32_t *a, const f32_t *b, const int32_t elementCount)
 // const f32_t *a  pointer to the first float vector
 // const f32_t *b  pointer to the second float vector
 // const int32_t elementCount  the number of floating point elements
-EXPORT f32_t dotf32(const f32_t *a, const f32_t *b, const int32_t elementCount) {
+EXPORT f32_t vec_dotf32(const f32_t *a, const f32_t *b, const int32_t elementCount) {
     __m256 acc0 = _mm256_setzero_ps();
     __m256 acc1 = _mm256_setzero_ps();
     __m256 acc2 = _mm256_setzero_ps();
@@ -339,7 +343,7 @@ EXPORT f32_t dotf32(const f32_t *a, const f32_t *b, const int32_t elementCount)
 // const f32_t *a  pointer to the first float vector
 // const f32_t *b  pointer to the second float vector
 // const int32_t elementCount  the number of floating point elements
-EXPORT f32_t sqrf32(const f32_t *a, const f32_t *b, const int32_t elementCount) {
+EXPORT f32_t vec_sqrf32(const f32_t *a, const f32_t *b, const int32_t elementCount) {
     __m256 sum0 = _mm256_setzero_ps();
     __m256 sum1 = _mm256_setzero_ps();
     __m256 sum2 = _mm256_setzero_ps();
diff --git a/libs/simdvec/native/src/vec/c/amd64/vec_2.cpp b/libs/simdvec/native/src/vec/c/amd64/vec_2.cpp
diff --git a/libs/simdvec/native/src/vec/headers/vec.h b/libs/simdvec/native/src/vec/headers/vec.h

Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@ configurations {`
`19`	`19`	`}`
`20`	`20`
`21`	`21`	`var zstdVersion = "1.5.5"`
`22`		`-var vecVersion = "1.0.16"`
	`22`	`+var vecVersion = "1.0.17"`
`23`	`23`
`24`	`24`	`repositories {`
`25`	`25`	`exclusiveContent {`
Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,7 @@ var os = org.gradle.internal.os.OperatingSystem.current()`
`29`	`29`	`// objdump --disassemble-symbols=_dot7u build/libs/vec/shared/aarch64/libvec.dylib`
`30`	`30`	`// Note: symbol decoration may differ on Linux, i.e. the leading underscore is not present`
`31`	`31`	`//`
`32`		`-// gcc -shared -fpic -o libvec.so -I src/vec/headers/ src/vec/c/vec.c -O3`
	`32`	`+// g++ -shared -fpic -o libvec.so -I src/vec/headers/ src/vec/c/vec.c -O3`
`33`	`33`
`34`	`34`	`group = 'org.elasticsearch'`
`35`	`35`
`@@ -47,12 +47,10 @@ model {`
`47`	`47`	`toolChains {`
`48`	`48`	`gcc(Gcc) {`
`49`	`49`	`target("aarch64") {`
`50`		`- cCompiler.executable = "/usr/bin/gcc"`
`51`		`- cCompiler.withArguments { args -> args.addAll(["-O3", "-std=c11", "-march=armv8-a"]) }`
	`50`	`+ cppCompiler.executable = "/usr/bin/g++"`
	`51`	`+ cppCompiler.withArguments { args -> args.addAll(["-O3", "-march=armv8-a"]) }`
`52`	`52`	`}`
`53`	`53`	`target("amd64") {`
`54`		`- cCompiler.executable = "/usr/bin/gcc"`
`55`		`- cCompiler.withArguments { args -> args.addAll(["-O3", "-std=c11", "-march=core-avx2", "-Wno-incompatible-pointer-types"]) }`
`56`	`54`	`cppCompiler.executable = "/usr/bin/g++"`
`57`	`55`	`cppCompiler.withArguments { args -> args.addAll(["-O3", "-march=core-avx2"]) }`
`58`	`56`	`}`
`@@ -61,17 +59,16 @@ model {`
`61`	`59`	`eachPlatform { toolchain ->`
`62`	`60`	`def platform = toolchain.getPlatform()`
`63`	`61`	`if (platform.name == "x64") {`
`64`		`- cCompiler.withArguments { args -> args.addAll(["/O2", "/LD", "-march=core-avx2"]) }`
	`62`	`+ cppCompiler.withArguments { args -> args.addAll(["/O2", "/LD", "-march=core-avx2"]) }`
`65`	`63`	`}`
`66`	`64`	`}`
`67`	`65`	`}`
`68`	`66`	`clang(Clang) {`
`69`	`67`	`target("aarch64") {`
`70`		`- cCompiler.withArguments { args -> args.addAll(["-O3", "-std=c11", "-march=armv8-a"]) }`
	`68`	`+ cppCompiler.withArguments { args -> args.addAll(["-O3", "-march=armv8-a"]) }`
`71`	`69`	`}`
`72`	`70`
`73`	`71`	`target("amd64") {`
`74`		`- cCompiler.withArguments { args -> args.addAll(["-O3", "-std=c11", "-march=core-avx2"]) }`
`75`	`72`	`cppCompiler.withArguments { args -> args.addAll(["-O3", "-march=core-avx2"]) }`
`76`	`73`	`}`
`77`	`74`	`}`