Commit e644a3d

lerman25 and dor-forer authored
Add SVE/SVE2 support for uint8 and int8 data type [MOD-9080] (#619)
* Add arm support * Changed the arm cpu info * Add ip test * Add to tests * Added tests and bm * fix tests * Add github benchmarks * Check 1 * only arm * change ami * Try ireland * Try different image * try image * back to old image * larger image * Add option to change env * back to default region * Created new image * Try to add the x86 to check * Try different machine * added include * Try without opti on arm * Change to c6g * added matrix region * change to west * try the i8 * Try oregon * Change subnet id * Now subnet * Change subnet * add subnet * Try group id * Change to vpc id * change subnet * Change ami * Try without subnet * add security group again * Change the subnets * Change to ids * Change sg * psubnet * Try different * different * to a file * print * p * leave empty * empty * Try different account * Run 2 arm machines * Move both to us-west-2 * Try workflow * Change name * Changes * Change the secrets * Add supported arch * Add defaults * Support all * Change the jq * Change machine to t4g * Change the name * Change the machine * fix the stop * only benchmark * add the secrets * region secret * benchmark region * Change timeout * Added support for arch name in benchmarks * change the json * changed to v9.0 * Change the check * add v9 * Check alt version of armv9 * added check * add arc_arch * changed to CONCAT_WITH_UNDERSCORE_ARCH * change the check * Add full check * fix the instruct * Added the cmake * fix the support * put it back to cmake * back * change the condition * No armpl for now * clang format * remove the opt * Changed to one machine * Added BENCHMARK_ARCH * fix endif * Remove secrets call * pr changes * Changes * change to compile * add sve * add #endif * add armpl * add to cmake * remove armpl * add install * Add ARCH=$(uname -m) * change the path to armpl * support check for armv7 * change the armpl * Change or OR * add neon supported for spaces * add sve * add support * align * format * change error * change * Removed the ifdef * Add comments * clang * Change names * format * Try fp32 neon simd * add l2 * add cmake * add SVE * fix sve l2 * PR changes * Change to 1 * fix the l2 * fix format * add description for chunk == 1 * Change functions * Add include * Change the cast * add residual * formatting * Move the constexpr * remove template armpl * Back to armpl * back to armpl_neon * include * armpl * add choose * fix the residual div * raise the residuals values * back to char * Remove prefetch * Revert implementation chooser * Remove armpl * Revert remove error * Remove comment * Remove empty line * format * Add support macos * add sudo * Add absolute path * find all libs * Change folder * Now set for real * Remove armpl from pull * change the templates * change chunk size to 1 * Back to 4 * Removed the for * Change to 2 sums * SVE L2 * Changed * Add get opt func * Change the var name * format * Pr fixes * PR * SVE IP, SVE2 IP & L2 * UINT8 support, remove int8_ip_sve * format * pr * pr fix * bm_spaces * PR * added conversion * small dim for intel only * Test smallDimChooser only for intel * align offset * align const expression * align cpu features function * format * change to svadd_f32_x where possible * change to _x where possible * move low dim check to intel only * format * fix IP * Optimize, convert on final step * format * chunking * change to inline * format * guy's comments * fix unit_test * format * reinterpret comment * using dot * fix uint8 * SVE2 -> SVE * format * fix comments * format :( * illegal

---------

Co-authored-by: Dor Forer <dor.forer@redis.com>
1 parent 1e08ea4 commit e644a3d

File tree

13 files changed: +831 additions, -11 deletions


src/VecSim/spaces/IP/IP_SVE_INT8.h

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+/*
+ * Copyright Redis Ltd. 2021 - present
+ * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ * the Server Side Public License v1 (SSPLv1).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include <arm_sve.h>
+
+inline void InnerProductStep(const int8_t *&pVect1, const int8_t *&pVect2, size_t &offset,
+                             svint32_t &sum, const size_t chunk) {
+    svbool_t pg = svptrue_b8();
+
+    // Load int8 vectors
+    svint8_t v1_i8 = svld1_s8(pg, pVect1 + offset);
+    svint8_t v2_i8 = svld1_s8(pg, pVect2 + offset);
+
+    sum = svdot_s32(sum, v1_i8, v2_i8);
+
+    offset += chunk; // Move to the next set of int8 elements
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const int8_t *pVect1 = reinterpret_cast<const int8_t *>(pVect1v);
+    const int8_t *pVect2 = reinterpret_cast<const int8_t *>(pVect2v);
+
+    size_t offset = 0;
+    const size_t vl = svcntb();
+    const size_t chunk_size = 4 * vl;
+
+    // Each InnerProductStep adds at most 2^8 * 2^8 = 2^16 per lane.
+    // Therefore, a single accumulator can perform 2^15 steps before overflowing.
+    // That scenario happens only if the vector dimension is larger than 16 * 4 * 2^15 = 2^21
+    // (16 int8 in one SVE register) * (4 accumulators) * (2^15 steps).
+    // We can safely assume that the dimension is smaller than that,
+    // so using int32_t accumulators is safe.
+
+    svint32_t sum0 = svdup_s32(0);
+    svint32_t sum1 = svdup_s32(0);
+    svint32_t sum2 = svdup_s32(0);
+    svint32_t sum3 = svdup_s32(0);
+
+    size_t num_chunks = dimension / chunk_size;
+
+    for (size_t i = 0; i < num_chunks; ++i) {
+        InnerProductStep(pVect1, pVect2, offset, sum0, vl);
+        InnerProductStep(pVect1, pVect2, offset, sum1, vl);
+        InnerProductStep(pVect1, pVect2, offset, sum2, vl);
+        InnerProductStep(pVect1, pVect2, offset, sum3, vl);
+    }
+
+    // Process remaining complete SVE vectors that didn't fit into the main loop.
+    // These are full-vector operations (0-3 additional full vectors).
+    if constexpr (additional_steps > 0) {
+        if constexpr (additional_steps >= 1) {
+            InnerProductStep(pVect1, pVect2, offset, sum0, vl);
+        }
+        if constexpr (additional_steps >= 2) {
+            InnerProductStep(pVect1, pVect2, offset, sum1, vl);
+        }
+        if constexpr (additional_steps >= 3) {
+            InnerProductStep(pVect1, pVect2, offset, sum2, vl);
+        }
+    }
+
+    if constexpr (partial_chunk) {
+        svbool_t pg = svwhilelt_b8_u64(offset, dimension);
+
+        svint8_t v1_i8 = svld1_s8(pg, pVect1 + offset); // Load int8 vectors
+        svint8_t v2_i8 = svld1_s8(pg, pVect2 + offset); // Load int8 vectors
+
+        sum3 = svdot_s32(sum3, v1_i8, v2_i8);
+
+        pVect1 += vl;
+        pVect2 += vl;
+    }
+
+    sum0 = svadd_s32_x(svptrue_b32(), sum0, sum1);
+    sum2 = svadd_s32_x(svptrue_b32(), sum2, sum3);
+
+    // Perform vector addition in parallel and horizontal sum
+    int32_t sum_all = svaddv_s32(svptrue_b32(), svadd_s32_x(svptrue_b32(), sum0, sum2));
+
+    return sum_all;
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float INT8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f -
+           INT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float INT8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    float ip = INT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
+    float norm_v1 =
+        *reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect1v) + dimension);
+    float norm_v2 =
+        *reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect2v) + dimension);
+    return 1.0f - ip / (norm_v1 * norm_v2);
+}
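
Editor's note: the partial_chunk and additional_steps template parameters are resolved by a chooser that is not part of this hunk. The sketch below only illustrates how such a chooser could map a dimension onto the instantiations above, assuming chunk_size = 4 * svcntb() as in INT8_InnerProductImp; the names Choose_INT8_IP_implementation_SVE_sketch and the simplified dist_func alias are hypothetical, not the library's API.

// Hedged sketch only; assumes the IP_SVE_INT8.h templates above are in scope.
#include <cstddef>
#include <arm_sve.h>

using dist_func = float (*)(const void *, const void *, size_t);

dist_func Choose_INT8_IP_implementation_SVE_sketch(size_t dim) {
    const size_t vl = svcntb();        // int8 lanes per SVE vector
    const size_t chunk_size = 4 * vl;  // four accumulators per main-loop iteration
    const size_t residual = dim % chunk_size;
    const unsigned char additional_steps = residual / vl; // 0-3 full vectors left over
    const bool partial = (residual % vl) != 0;            // tail narrower than one vector

    // Dispatch to the matching template instantiation.
    if (partial) {
        switch (additional_steps) {
        case 0: return INT8_InnerProductSIMD_SVE<true, 0>;
        case 1: return INT8_InnerProductSIMD_SVE<true, 1>;
        case 2: return INT8_InnerProductSIMD_SVE<true, 2>;
        default: return INT8_InnerProductSIMD_SVE<true, 3>;
        }
    }
    switch (additional_steps) {
    case 0: return INT8_InnerProductSIMD_SVE<false, 0>;
    case 1: return INT8_InnerProductSIMD_SVE<false, 1>;
    case 2: return INT8_InnerProductSIMD_SVE<false, 2>;
    default: return INT8_InnerProductSIMD_SVE<false, 3>;
    }
}
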
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
+/*
+ * Copyright Redis Ltd. 2021 - present
+ * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ * the Server Side Public License v1 (SSPLv1).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include <arm_sve.h>
+
+inline void InnerProductStep(const uint8_t *&pVect1, const uint8_t *&pVect2, size_t &offset,
+                             svuint32_t &sum, const size_t chunk) {
+    svbool_t pg = svptrue_b8();
+
+    // Load uint8 vectors
+    svuint8_t v1_ui8 = svld1_u8(pg, pVect1 + offset);
+    svuint8_t v2_ui8 = svld1_u8(pg, pVect2 + offset);
+
+    sum = svdot_u32(sum, v1_ui8, v2_ui8);
+
+    offset += chunk; // Move to the next set of uint8 elements
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float UINT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const uint8_t *pVect1 = reinterpret_cast<const uint8_t *>(pVect1v);
+    const uint8_t *pVect2 = reinterpret_cast<const uint8_t *>(pVect2v);
+
+    size_t offset = 0;
+    const size_t vl = svcntb();
+    const size_t chunk_size = 4 * vl;
+
+    // Each InnerProductStep adds at most 2^8 * 2^8 = 2^16 per lane.
+    // Therefore, a single accumulator can perform 2^16 steps before overflowing.
+    // That scenario happens only if the vector dimension is larger than 16 * 4 * 2^16 = 2^22
+    // (16 uint8 in one SVE register) * (4 accumulators) * (2^16 steps).
+    // We can safely assume that the dimension is smaller than that,
+    // so using uint32_t accumulators is safe.
+
+    svuint32_t sum0 = svdup_u32(0);
+    svuint32_t sum1 = svdup_u32(0);
+    svuint32_t sum2 = svdup_u32(0);
+    svuint32_t sum3 = svdup_u32(0);
+
+    size_t num_chunks = dimension / chunk_size;
+
+    for (size_t i = 0; i < num_chunks; ++i) {
+        InnerProductStep(pVect1, pVect2, offset, sum0, vl);
+        InnerProductStep(pVect1, pVect2, offset, sum1, vl);
+        InnerProductStep(pVect1, pVect2, offset, sum2, vl);
+        InnerProductStep(pVect1, pVect2, offset, sum3, vl);
+    }
+
+    // Process remaining complete SVE vectors that didn't fit into the main loop.
+    // These are full-vector operations (0-3 additional full vectors).
+    if constexpr (additional_steps > 0) {
+        if constexpr (additional_steps >= 1) {
+            InnerProductStep(pVect1, pVect2, offset, sum0, vl);
+        }
+        if constexpr (additional_steps >= 2) {
+            InnerProductStep(pVect1, pVect2, offset, sum1, vl);
+        }
+        if constexpr (additional_steps >= 3) {
+            InnerProductStep(pVect1, pVect2, offset, sum2, vl);
+        }
+    }
+
+    if constexpr (partial_chunk) {
+        svbool_t pg = svwhilelt_b8_u64(offset, dimension);
+
+        svuint8_t v1_ui8 = svld1_u8(pg, pVect1 + offset); // Load uint8 vectors
+        svuint8_t v2_ui8 = svld1_u8(pg, pVect2 + offset); // Load uint8 vectors
+
+        sum3 = svdot_u32(sum3, v1_ui8, v2_ui8);
+
+        pVect1 += vl;
+        pVect2 += vl;
+    }
+
+    sum0 = svadd_u32_x(svptrue_b32(), sum0, sum1);
+    sum2 = svadd_u32_x(svptrue_b32(), sum2, sum3);
+
+    // Perform vector addition in parallel and horizontal sum
+    int32_t sum_all = svaddv_u32(svptrue_b32(), svadd_u32_x(svptrue_b32(), sum0, sum2));
+
+    return sum_all;
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float UINT8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f -
+           UINT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float UINT8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    float ip = UINT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
+    float norm_v1 =
+        *reinterpret_cast<const float *>(static_cast<const uint8_t *>(pVect1v) + dimension);
+    float norm_v2 =
+        *reinterpret_cast<const float *>(static_cast<const uint8_t *>(pVect2v) + dimension);
+    return 1.0f - ip / (norm_v1 * norm_v2);
+}
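
Editor's note: both cosine variants read a precomputed norm stored as a float immediately after the dimension quantized bytes (pVect + dimension). Below is a minimal sketch of that buffer layout, assuming a hypothetical prepare_uint8_vector helper; the real storage and preprocessing code lives elsewhere in VecSim.

// Hedged sketch: illustrates the [dim bytes | float norm] layout the cosine
// functions above expect; the helper name and packing are assumptions.
#include <cmath>
#include <cstdint>
#include <cstring>
#include <vector>

std::vector<uint8_t> prepare_uint8_vector(const uint8_t *data, size_t dim) {
    std::vector<uint8_t> blob(dim + sizeof(float));
    std::memcpy(blob.data(), data, dim);

    float norm = 0.0f;
    for (size_t i = 0; i < dim; ++i) {
        norm += static_cast<float>(data[i]) * static_cast<float>(data[i]);
    }
    norm = std::sqrt(norm);

    // Store the norm right after the uint8 payload, where
    // UINT8_CosineSIMD_SVE reads it via pVect + dimension.
    std::memcpy(blob.data() + dim, &norm, sizeof(float));
    return blob;
}
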

src/VecSim/spaces/IP_space.cpp

Lines changed: 65 additions & 8 deletions
@@ -276,12 +276,27 @@ dist_func_t<float> IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment, con
     }
 
     dist_func_t<float> ret_dist_func = INT8_InnerProduct;
+
+    auto features = getCpuOptimizationFeatures(arch_opt);
+
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_INT8_IP_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_INT8_IP_implementation_SVE(dim);
+    }
+#endif
+#endif
+#ifdef CPU_FEATURES_ARCH_X86_64
     // Optimizations assume at least 32 int8. If we have less, we use the naive implementation.
     if (dim < 32) {
         return ret_dist_func;
     }
-#ifdef CPU_FEATURES_ARCH_X86_64
-    auto features = getCpuOptimizationFeatures(arch_opt);
+
 #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
         if (dim % 32 == 0) // no point in aligning if we have an offsetting residual
@@ -301,12 +316,26 @@ dist_func_t<float> Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment,
     }
 
     dist_func_t<float> ret_dist_func = INT8_Cosine;
+
+    auto features = getCpuOptimizationFeatures(arch_opt);
+
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_INT8_Cosine_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_INT8_Cosine_implementation_SVE(dim);
+    }
+#endif
+#endif
+#ifdef CPU_FEATURES_ARCH_X86_64
     // Optimizations assume at least 32 int8. If we have less, we use the naive implementation.
     if (dim < 32) {
         return ret_dist_func;
     }
-#ifdef CPU_FEATURES_ARCH_X86_64
-    auto features = getCpuOptimizationFeatures(arch_opt);
 #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
         // For int8 vectors with cosine distance, the extra float for the norm shifts alignment to
@@ -329,12 +358,26 @@ dist_func_t<float> IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment,
     }
 
     dist_func_t<float> ret_dist_func = UINT8_InnerProduct;
+
+    auto features = getCpuOptimizationFeatures(arch_opt);
+
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_UINT8_IP_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_UINT8_IP_implementation_SVE(dim);
+    }
+#endif
+#endif
+#ifdef CPU_FEATURES_ARCH_X86_64
     // Optimizations assume at least 32 uint8. If we have less, we use the naive implementation.
     if (dim < 32) {
         return ret_dist_func;
     }
-#ifdef CPU_FEATURES_ARCH_X86_64
-    auto features = getCpuOptimizationFeatures(arch_opt);
 #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
         if (dim % 32 == 0) // no point in aligning if we have an offsetting residual
@@ -354,12 +397,26 @@ dist_func_t<float> Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment
     }
 
     dist_func_t<float> ret_dist_func = UINT8_Cosine;
+
+    auto features = getCpuOptimizationFeatures(arch_opt);
+
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_UINT8_Cosine_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_UINT8_Cosine_implementation_SVE(dim);
+    }
+#endif
+#endif
+#ifdef CPU_FEATURES_ARCH_X86_64
     // Optimizations assume at least 32 uint8. If we have less, we use the naive implementation.
     if (dim < 32) {
         return ret_dist_func;
     }
-#ifdef CPU_FEATURES_ARCH_X86_64
-    auto features = getCpuOptimizationFeatures(arch_opt);
 #ifdef OPT_AVX512_F_BW_VL_VNNI
     if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
         // For uint8 vectors with cosine distance, the extra float for the norm shifts alignment to
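
Editor's note: each getter in IP_space.cpp follows the same pattern, shown schematically below. Compile-time #ifdef guards keep the SVE/SVE2 branches out of builds that cannot emit those instructions, while runtime cpu_features flags pick the fastest kernel the machine supports; the ARM branches sit before the dim < 32 early-return because SVE predication handles short tails natively, so the minimum-dimension guard only applies to the x86 kernels. In the sketch, select_uint8_ip, CpuFeatureFlags, and UINT8_InnerProduct_Naive are placeholders, not the library's API.

// Hedged sketch of the dispatch pattern above; forward declarations stand in
// for the real VecSim symbols.
#include <cstddef>

using dist_func = float (*)(const void *, const void *, size_t);

struct CpuFeatureFlags { // stand-in for the cpu_features result
    bool sve = false;
    bool sve2 = false;
};

float UINT8_InnerProduct_Naive(const void *, const void *, size_t); // scalar fallback
dist_func Choose_UINT8_IP_implementation_SVE(size_t dim);
dist_func Choose_UINT8_IP_implementation_SVE2(size_t dim);

dist_func select_uint8_ip(size_t dim, const CpuFeatureFlags &features) {
    dist_func ret = UINT8_InnerProduct_Naive; // always-available fallback
#ifdef CPU_FEATURES_ARCH_AARCH64
#ifdef OPT_SVE2
    if (features.sve2)
        return Choose_UINT8_IP_implementation_SVE2(dim); // prefer SVE2 when present
#endif
#ifdef OPT_SVE
    if (features.sve)
        return Choose_UINT8_IP_implementation_SVE(dim); // otherwise plain SVE
#endif
#endif
#ifdef CPU_FEATURES_ARCH_X86_64
    // x86 kernels assume at least 32 elements, so the early-return guards only
    // this path; the AVX-512 VNNI and other x86 selections follow in the real code.
    if (dim < 32)
        return ret;
#endif
    return ret;
}
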
