diff --git a/src/VecSim/spaces/IP/IP_SVE_INT8.h b/src/VecSim/spaces/IP/IP_SVE_INT8.h
new file mode 100644
index 000000000..8dc1df313
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_SVE_INT8.h
@@ -0,0 +1,102 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include <arm_sve.h>
+
+inline void InnerProductStep(const int8_t *&pVect1, const int8_t *&pVect2, size_t &offset,
+                             svint32_t &sum, const size_t chunk) {
+    svbool_t pg = svptrue_b8();
+
+    // Load int8 vectors
+    svint8_t v1_i8 = svld1_s8(pg, pVect1 + offset);
+    svint8_t v2_i8 = svld1_s8(pg, pVect2 + offset);
+
+    sum = svdot_s32(sum, v1_i8, v2_i8);
+
+    offset += chunk; // Move to the next set of int8 elements
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const int8_t *pVect1 = reinterpret_cast<const int8_t *>(pVect1v);
+    const int8_t *pVect2 = reinterpret_cast<const int8_t *>(pVect2v);
+
+    size_t offset = 0;
+    const size_t vl = svcntb();
+    const size_t chunk_size = 4 * vl;
+
+    // Each InnerProductStep adds at most 4 * 2^7 * 2^7 = 2^16 to a 32-bit lane
+    // (svdot sums four int8 * int8 products per lane).
+    // Therefore, a single accumulator can take 2^15 steps before overflowing.
+    // That can happen only if the vector dimension exceeds 16 * 4 * 2^15 = 2^21:
+    // (16 int8 in 1 SVE register) * (4 accumulators) * (2^15 steps).
+    // We can safely assume the dimension is smaller than that, so int32_t accumulators are safe.
+
+    svint32_t sum0 = svdup_s32(0);
+    svint32_t sum1 = svdup_s32(0);
+    svint32_t sum2 = svdup_s32(0);
+    svint32_t sum3 = svdup_s32(0);
+
+    size_t num_chunks = dimension / chunk_size;
+
+    for (size_t i = 0; i < num_chunks; ++i) {
+        InnerProductStep(pVect1, pVect2, offset, sum0, vl);
+        InnerProductStep(pVect1, pVect2, offset, sum1, vl);
+        InnerProductStep(pVect1, pVect2, offset, sum2, vl);
+        InnerProductStep(pVect1, pVect2, offset, sum3, vl);
+    }
+
+    // Process the remaining complete SVE vectors that didn't fit into the main loop.
+    // These are full-vector operations (0-3 of them).
+    if constexpr (additional_steps > 0) {
+        if constexpr (additional_steps >= 1) {
+            InnerProductStep(pVect1, pVect2, offset, sum0, vl);
+        }
+        if constexpr (additional_steps >= 2) {
+            InnerProductStep(pVect1, pVect2, offset, sum1, vl);
+        }
+        if constexpr (additional_steps >= 3) {
+            InnerProductStep(pVect1, pVect2, offset, sum2, vl);
+        }
+    }
+
+    if constexpr (partial_chunk) {
+        svbool_t pg = svwhilelt_b8_u64(offset, dimension);
+
+        svint8_t v1_i8 = svld1_s8(pg, pVect1 + offset); // Load the int8 tail (inactive lanes are zero)
+        svint8_t v2_i8 = svld1_s8(pg, pVect2 + offset); // Load the int8 tail (inactive lanes are zero)
+
+        sum3 = svdot_s32(sum3, v1_i8, v2_i8);
+
+        pVect1 += vl;
+        pVect2 += vl;
+    }
+
+    sum0 = svadd_s32_x(svptrue_b32(), sum0, sum1);
+    sum2 = svadd_s32_x(svptrue_b32(), sum2, sum3);
+
+    // Pairwise vector additions followed by a horizontal sum
+    int32_t sum_all = svaddv_s32(svptrue_b32(), svadd_s32_x(svptrue_b32(), sum0, sum2));
+
+    return sum_all;
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float INT8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f -
+           INT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float INT8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    float ip = INT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
+    float norm_v1 =
+        *reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect1v) + dimension);
+    float norm_v2 =
+        *reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect2v) + dimension);
+    return
1.0f - ip / (norm_v1 * norm_v2); +} diff --git a/src/VecSim/spaces/IP/IP_SVE_UINT8.h b/src/VecSim/spaces/IP/IP_SVE_UINT8.h new file mode 100644 index 000000000..daaa8267a --- /dev/null +++ b/src/VecSim/spaces/IP/IP_SVE_UINT8.h @@ -0,0 +1,102 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include + +inline void InnerProductStep(const uint8_t *&pVect1, const uint8_t *&pVect2, size_t &offset, + svuint32_t &sum, const size_t chunk) { + svbool_t pg = svptrue_b8(); + + // Load uint8 vectors + svuint8_t v1_ui8 = svld1_u8(pg, pVect1 + offset); + svuint8_t v2_ui8 = svld1_u8(pg, pVect2 + offset); + + sum = svdot_u32(sum, v1_ui8, v2_ui8); + + offset += chunk; // Move to the next set of uint8 elements +} + +template +float UINT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { + const uint8_t *pVect1 = reinterpret_cast(pVect1v); + const uint8_t *pVect2 = reinterpret_cast(pVect2v); + + size_t offset = 0; + const size_t vl = svcntb(); + const size_t chunk_size = 4 * vl; + + // Each innerProductStep adds maximum 2^8 & 2^8 = 2^16 + // Therefore, on a single accumulator, we can perform 2^16 steps before overflowing + // That scenario will happen only is the dimension of the vector is larger than 16*4*2^16 = 2^22 + // (16 uint8 in 1 SVE register) * (4 accumulators) * (2^16 steps) + // We can safely assume that the dimension is smaller than that + // So using int32_t is safe + + svuint32_t sum0 = svdup_u32(0); + svuint32_t sum1 = svdup_u32(0); + svuint32_t sum2 = svdup_u32(0); + svuint32_t sum3 = svdup_u32(0); + + size_t num_chunks = dimension / chunk_size; + + for (size_t i = 0; i < num_chunks; ++i) { + InnerProductStep(pVect1, pVect2, offset, sum0, vl); + InnerProductStep(pVect1, pVect2, offset, sum1, vl); + InnerProductStep(pVect1, pVect2, offset, sum2, vl); + InnerProductStep(pVect1, pVect2, offset, sum3, vl); + } + + // Process remaining complete SVE vectors that didn't fit into the main loop + // These are full vector operations (0-3 elements) + if constexpr (additional_steps > 0) { + if constexpr (additional_steps >= 1) { + InnerProductStep(pVect1, pVect2, offset, sum0, vl); + } + if constexpr (additional_steps >= 2) { + InnerProductStep(pVect1, pVect2, offset, sum1, vl); + } + if constexpr (additional_steps >= 3) { + InnerProductStep(pVect1, pVect2, offset, sum2, vl); + } + } + + if constexpr (partial_chunk) { + svbool_t pg = svwhilelt_b8_u64(offset, dimension); + + svuint8_t v1_ui8 = svld1_u8(pg, pVect1 + offset); // Load uint8 vectors + svuint8_t v2_ui8 = svld1_u8(pg, pVect2 + offset); // Load uint8 vectors + + sum3 = svdot_u32(sum3, v1_ui8, v2_ui8); + + pVect1 += vl; + pVect2 += vl; + } + + sum0 = svadd_u32_x(svptrue_b32(), sum0, sum1); + sum2 = svadd_u32_x(svptrue_b32(), sum2, sum3); + + // Perform vector addition in parallel and Horizontal sum + int32_t sum_all = svaddv_u32(svptrue_b32(), svadd_u32_x(svptrue_b32(), sum0, sum2)); + + return sum_all; +} + +template +float UINT8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1.0f - + UINT8_InnerProductImp(pVect1v, pVect2v, dimension); +} + +template +float UINT8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + float ip = UINT8_InnerProductImp(pVect1v, pVect2v, dimension); + float norm_v1 = + *reinterpret_cast(static_cast(pVect1v) + dimension); + float 
norm_v2 = + *reinterpret_cast(static_cast(pVect2v) + dimension); + return 1.0f - ip / (norm_v1 * norm_v2); +} diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index e930ee143..df1f656b5 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -276,12 +276,27 @@ dist_func_t IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment, con } dist_func_t ret_dist_func = INT8_InnerProduct; + + auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_INT8_IP_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_INT8_IP_implementation_SVE(dim); + } +#endif +#endif +#ifdef CPU_FEATURES_ARCH_X86_64 // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. if (dim < 32) { return ret_dist_func; } -#ifdef CPU_FEATURES_ARCH_X86_64 - auto features = getCpuOptimizationFeatures(arch_opt); + #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { if (dim % 32 == 0) // no point in aligning if we have an offsetting residual @@ -301,12 +316,26 @@ dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment, } dist_func_t ret_dist_func = INT8_Cosine; + + auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_INT8_Cosine_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_INT8_Cosine_implementation_SVE(dim); + } +#endif +#endif +#ifdef CPU_FEATURES_ARCH_X86_64 // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. if (dim < 32) { return ret_dist_func; } -#ifdef CPU_FEATURES_ARCH_X86_64 - auto features = getCpuOptimizationFeatures(arch_opt); #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { // For int8 vectors with cosine distance, the extra float for the norm shifts alignment to @@ -329,12 +358,26 @@ dist_func_t IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment, } dist_func_t ret_dist_func = UINT8_InnerProduct; + + auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_UINT8_IP_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_UINT8_IP_implementation_SVE(dim); + } +#endif +#endif +#ifdef CPU_FEATURES_ARCH_X86_64 // Optimizations assume at least 32 uint8. If we have less, we use the naive implementation. if (dim < 32) { return ret_dist_func; } -#ifdef CPU_FEATURES_ARCH_X86_64 - auto features = getCpuOptimizationFeatures(arch_opt); #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { if (dim % 32 == 0) // no point in aligning if we have an offsetting residual @@ -354,12 +397,26 @@ dist_func_t Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment } dist_func_t ret_dist_func = UINT8_Cosine; + + auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_UINT8_Cosine_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_UINT8_Cosine_implementation_SVE(dim); + } +#endif +#endif +#ifdef CPU_FEATURES_ARCH_X86_64 // Optimizations assume at least 32 uint8. 
If we have less, we use the naive implementation. if (dim < 32) { return ret_dist_func; } -#ifdef CPU_FEATURES_ARCH_X86_64 - auto features = getCpuOptimizationFeatures(arch_opt); #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { // For uint8 vectors with cosine distance, the extra float for the norm shifts alignment to diff --git a/src/VecSim/spaces/L2/L2_SVE_INT8.h b/src/VecSim/spaces/L2/L2_SVE_INT8.h new file mode 100644 index 000000000..eaad44a53 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_SVE_INT8.h @@ -0,0 +1,89 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "VecSim/spaces/space_includes.h" +#include + +// Aligned step using svptrue_b8() +inline void L2SquareStep(const int8_t *&pVect1, const int8_t *&pVect2, size_t &offset, + svuint32_t &sum, const size_t chunk) { + svbool_t pg = svptrue_b8(); + // Note: Because all the bits are 1, the extention to 16 and 32 bits does not make a difference + // Otherwise, pg should be recalculated for 16 and 32 operations + + svint8_t v1_i8 = svld1_s8(pg, pVect1 + offset); // Load int8 vectors from pVect1 + svint8_t v2_i8 = svld1_s8(pg, pVect2 + offset); // Load int8 vectors from pVect2 + + // The result of svabd can be reinterpreted as uint8 + svuint8_t abs_diff = svreinterpret_u8_s8(svabd_s8_x(pg, v1_i8, v2_i8)); + + sum = svdot_u32(sum, abs_diff, abs_diff); + offset += chunk; // Move to the next set of int8 elements +} + +template +float INT8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + const int8_t *pVect1 = reinterpret_cast(pVect1v); + const int8_t *pVect2 = reinterpret_cast(pVect2v); + + // number of uint8 per SVE register (we use uint accumulators) + const size_t vl = svcntb(); + const size_t chunk_size = 4 * vl; + svbool_t all = svptrue_b8(); + + // Each L2SquareStep adds maximum (2^8)^2 = 2^16 + // Therefor, on a single accumulator, we can perform 2^16 steps before overflowing + // That scenario will happen only is the dimension of the vector is larger than 16*4*2^16 = 2^22 + // (16 uint8 in 1 SVE register) * (4 accumulators) * (2^16 steps) + // We can safely assume that the dimension is smaller than that + // So using uint32_t is safe + + svuint32_t sum0 = svdup_u32(0); + svuint32_t sum1 = svdup_u32(0); + svuint32_t sum2 = svdup_u32(0); + svuint32_t sum3 = svdup_u32(0); + + size_t offset = 0; + size_t num_main_blocks = dimension / chunk_size; + + for (size_t i = 0; i < num_main_blocks; ++i) { + L2SquareStep(pVect1, pVect2, offset, sum0, vl); + L2SquareStep(pVect1, pVect2, offset, sum1, vl); + L2SquareStep(pVect1, pVect2, offset, sum2, vl); + L2SquareStep(pVect1, pVect2, offset, sum3, vl); + } + + if constexpr (additional_steps > 0) { + if constexpr (additional_steps >= 1) { + L2SquareStep(pVect1, pVect2, offset, sum0, vl); + } + if constexpr (additional_steps >= 2) { + L2SquareStep(pVect1, pVect2, offset, sum1, vl); + } + if constexpr (additional_steps >= 3) { + L2SquareStep(pVect1, pVect2, offset, sum2, vl); + } + } + + if constexpr (partial_chunk) { + + svbool_t pg = svwhilelt_b8_u64(offset, dimension); + + svint8_t v1_i8 = svld1_s8(pg, pVect1 + offset); // Load int8 vectors from pVect1 + svint8_t v2_i8 = svld1_s8(pg, pVect2 + offset); // Load int8 vectors from pVect2 + + // The result of svabd can be reinterpreted as uint8 + svuint8_t abs_diff = svreinterpret_u8_s8(svabd_s8_x(pg, v1_i8, 
v2_i8)); + + // Can sum with taking into account pg because svld1 will set inactive lanes to 0 + sum3 = svdot_u32(sum3, abs_diff, abs_diff); + } + + sum0 = svadd_u32_x(all, sum0, sum1); + sum2 = svadd_u32_x(all, sum2, sum3); + svuint32_t sum_all = svadd_u32_x(all, sum0, sum2); + return svaddv_u32(svptrue_b32(), sum_all); +} diff --git a/src/VecSim/spaces/L2/L2_SVE_UINT8.h b/src/VecSim/spaces/L2/L2_SVE_UINT8.h new file mode 100644 index 000000000..bf56d9ad1 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_SVE_UINT8.h @@ -0,0 +1,86 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "VecSim/spaces/space_includes.h" +#include + +// Aligned step using svptrue_b8() +inline void L2SquareStep(const uint8_t *&pVect1, const uint8_t *&pVect2, size_t &offset, + svuint32_t &sum, const size_t chunk) { + svbool_t pg = svptrue_b8(); + // Note: Because all the bits are 1, the extention to 16 and 32 bits does not make a difference + // Otherwise, pg should be recalculated for 16 and 32 operations + + svuint8_t v1_ui8 = svld1_u8(pg, pVect1 + offset); // Load uint8 vectors from pVect1 + svuint8_t v2_ui8 = svld1_u8(pg, pVect2 + offset); // Load uint8 vectors from pVect2 + + svuint8_t abs_diff = svabd_u8_x(pg, v1_ui8, v2_ui8); + + sum = svdot_u32(sum, abs_diff, abs_diff); + + offset += chunk; // Move to the next set of uint8 elements +} + +template +float UINT8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + const uint8_t *pVect1 = reinterpret_cast(pVect1v); + const uint8_t *pVect2 = reinterpret_cast(pVect2v); + + // number of uint8 per SVE register + const size_t vl = svcntb(); + const size_t chunk_size = 4 * vl; + svbool_t all = svptrue_b8(); + + // Each L2SquareStep adds maximum (2^8)^2 = 2^16 + // Therefor, on a single accumulator, we can perform 2^16 steps before overflowing + // That scenario will happen only is the dimension of the vector is larger than 16*4*2^16 = 2^22 + // (16 uint8 in 1 SVE register) * (4 accumulators) * (2^16 steps) + // We can safely assume that the dimension is smaller than that + // So using uint32_t is safe + + svuint32_t sum0 = svdup_u32(0); + svuint32_t sum1 = svdup_u32(0); + svuint32_t sum2 = svdup_u32(0); + svuint32_t sum3 = svdup_u32(0); + + size_t offset = 0; + size_t num_main_blocks = dimension / chunk_size; + + for (size_t i = 0; i < num_main_blocks; ++i) { + L2SquareStep(pVect1, pVect2, offset, sum0, vl); + L2SquareStep(pVect1, pVect2, offset, sum1, vl); + L2SquareStep(pVect1, pVect2, offset, sum2, vl); + L2SquareStep(pVect1, pVect2, offset, sum3, vl); + } + + if constexpr (additional_steps > 0) { + if constexpr (additional_steps >= 1) { + L2SquareStep(pVect1, pVect2, offset, sum0, vl); + } + if constexpr (additional_steps >= 2) { + L2SquareStep(pVect1, pVect2, offset, sum1, vl); + } + if constexpr (additional_steps >= 3) { + L2SquareStep(pVect1, pVect2, offset, sum2, vl); + } + } + + if constexpr (partial_chunk) { + + svbool_t pg = svwhilelt_b8_u64(offset, dimension); + svuint8_t v1_ui8 = svld1_u8(pg, pVect1 + offset); // Load uint8 vectors from pVect1 + svuint8_t v2_ui8 = svld1_u8(pg, pVect2 + offset); // Load uint8 vectors from pVect2 + + svuint8_t abs_diff = svabd_u8_x(pg, v1_ui8, v2_ui8); + + sum3 = svdot_u32(sum3, abs_diff, abs_diff); + } + + sum0 = svadd_u32_x(all, sum0, sum1); + sum2 = svadd_u32_x(all, sum2, sum3); + svuint32_t sum_all = svadd_u32_x(all, sum0, sum2); + return 
svaddv_u32(svptrue_b32(), sum_all); +} diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 7acf2eb6d..56bee3216 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -267,12 +267,26 @@ dist_func_t L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment, con } dist_func_t ret_dist_func = INT8_L2Sqr; + + auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_INT8_L2_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_INT8_L2_implementation_SVE(dim); + } +#endif +#endif +#ifdef CPU_FEATURES_ARCH_X86_64 // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. if (dim < 32) { return ret_dist_func; } -#ifdef CPU_FEATURES_ARCH_X86_64 - auto features = getCpuOptimizationFeatures(arch_opt); #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { if (dim % 32 == 0) // no point in aligning if we have an offsetting residual @@ -296,8 +310,22 @@ dist_func_t L2_UINT8_GetDistFunc(size_t dim, unsigned char *alignment, if (dim < 32) { return ret_dist_func; } -#ifdef CPU_FEATURES_ARCH_X86_64 + auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_UINT8_L2_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_UINT8_L2_implementation_SVE(dim); + } +#endif +#endif +#ifdef CPU_FEATURES_ARCH_X86_64 #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { if (dim % 32 == 0) // no point in aligning if we have an offsetting residual diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp index e3ebc5209..d0ec5e514 100644 --- a/src/VecSim/spaces/functions/SVE.cpp +++ b/src/VecSim/spaces/functions/SVE.cpp @@ -15,6 +15,12 @@ #include "VecSim/spaces/IP/IP_SVE_FP64.h" #include "VecSim/spaces/L2/L2_SVE_FP64.h" +#include "VecSim/spaces/L2/L2_SVE_INT8.h" +#include "VecSim/spaces/IP/IP_SVE_INT8.h" + +#include "VecSim/spaces/L2/L2_SVE_UINT8.h" +#include "VecSim/spaces/IP/IP_SVE_UINT8.h" + namespace spaces { #include "implementation_chooser.h" @@ -52,6 +58,42 @@ dist_func_t Choose_FP64_L2_implementation_SVE(size_t dim) { return ret_dist_func; } +dist_func_t Choose_INT8_L2_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, INT8_L2SqrSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_INT8_IP_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, INT8_InnerProductSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_INT8_Cosine_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, INT8_CosineSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_UINT8_L2_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, UINT8_L2SqrSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_UINT8_IP_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, UINT8_InnerProductSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_UINT8_Cosine_implementation_SVE(size_t dim) { + dist_func_t 
ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, UINT8_CosineSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h index 35d1ffe4c..e06214a11 100644 --- a/src/VecSim/spaces/functions/SVE.h +++ b/src/VecSim/spaces/functions/SVE.h @@ -19,4 +19,16 @@ dist_func_t Choose_FP16_L2_implementation_SVE(size_t dim); dist_func_t Choose_FP64_IP_implementation_SVE(size_t dim); dist_func_t Choose_FP64_L2_implementation_SVE(size_t dim); +dist_func_t Choose_INT8_L2_implementation_SVE(size_t dim); + +dist_func_t Choose_INT8_IP_implementation_SVE(size_t dim); + +dist_func_t Choose_INT8_Cosine_implementation_SVE(size_t dim); + +dist_func_t Choose_UINT8_L2_implementation_SVE(size_t dim); + +dist_func_t Choose_UINT8_IP_implementation_SVE(size_t dim); + +dist_func_t Choose_UINT8_Cosine_implementation_SVE(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp index c52c99da9..57f649a6c 100644 --- a/src/VecSim/spaces/functions/SVE2.cpp +++ b/src/VecSim/spaces/functions/SVE2.cpp @@ -14,6 +14,10 @@ #include "VecSim/spaces/IP/IP_SVE_FP64.h" #include "VecSim/spaces/L2/L2_SVE_FP64.h" +#include "VecSim/spaces/L2/L2_SVE_INT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_INT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE namespace spaces { @@ -52,6 +56,42 @@ dist_func_t Choose_FP64_L2_implementation_SVE2(size_t dim) { return ret_dist_func; } +dist_func_t Choose_INT8_L2_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, INT8_L2SqrSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_INT8_IP_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, INT8_InnerProductSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_INT8_Cosine_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, INT8_CosineSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_UINT8_L2_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, UINT8_L2SqrSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_UINT8_IP_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, UINT8_InnerProductSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_UINT8_Cosine_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, UINT8_CosineSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h index 67e36dc20..d9f89dfd0 100644 --- a/src/VecSim/spaces/functions/SVE2.h +++ b/src/VecSim/spaces/functions/SVE2.h @@ -19,4 +19,16 @@ dist_func_t Choose_FP16_L2_implementation_SVE2(size_t dim); dist_func_t Choose_FP64_IP_implementation_SVE2(size_t dim); dist_func_t Choose_FP64_L2_implementation_SVE2(size_t dim); +dist_func_t Choose_INT8_L2_implementation_SVE2(size_t dim); + +dist_func_t 
Choose_INT8_IP_implementation_SVE2(size_t dim);
+
+dist_func_t<float> Choose_INT8_Cosine_implementation_SVE2(size_t dim);
+
+dist_func_t<float> Choose_UINT8_L2_implementation_SVE2(size_t dim);
+
+dist_func_t<float> Choose_UINT8_IP_implementation_SVE2(size_t dim);
+
+dist_func_t<float> Choose_UINT8_Cosine_implementation_SVE2(size_t dim);
+
 } // namespace spaces
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp
index 11eb0908b..f39ce1100 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp
@@ -51,6 +51,20 @@ INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW
 #endif // x86_64
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features;
+#ifdef OPT_SVE2
+bool sve2_supported = opt.sve2; // Check for SVE2 support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, SVE2, 32, sve2_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, SVE2, 32, sve2_supported);
+#endif
+#ifdef OPT_SVE
+bool sve_supported = opt.sve; // Check for SVE support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, SVE, 32, sve_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, SVE, 32, sve_supported);
+#endif
+#endif
+
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, InnerProduct, 32);
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, Cosine, 32);
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, L2Sqr, 32);
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_uint8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_uint8.cpp
index 248259165..71ab1d6e8 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_uint8.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_uint8.cpp
@@ -51,6 +51,20 @@ INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_UINT8, UINT8, AVX512F_
 #endif // x86_64
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features;
+#ifdef OPT_SVE2
+bool sve2_supported = opt.sve2; // Check for SVE2 support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_UINT8, UINT8, SVE2, 32, sve2_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_UINT8, UINT8, SVE2, 32, sve2_supported);
+#endif
+#ifdef OPT_SVE
+bool sve_supported = opt.sve; // Check for SVE support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_UINT8, UINT8, SVE, 32, sve_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_UINT8, UINT8, SVE, 32, sve_supported);
+#endif
+#endif
+
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_UINT8, UINT8, InnerProduct, 32);
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_UINT8, UINT8, Cosine, 32);
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_UINT8, UINT8, L2Sqr, 32);
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 19487bc8d..c809e14ca 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -1304,6 +1304,30 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) {
         optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
             optimization.avx512vnni = 0;
     }
+#endif
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+
ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve2 flag as well, so we'll choose the next option (default). + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). + optimization.sve = 0; + } #endif unsigned char alignment = 0; arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); @@ -1340,6 +1364,30 @@ TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) { optimization.avx512f = optimization.avx512bw = optimization.avx512vl = optimization.avx512vnni = 0; } +#endif +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_IP_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_IP_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). + optimization.sve = 0; + } #endif unsigned char alignment = 0; arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); @@ -1377,6 +1425,30 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) { optimization.avx512f = optimization.avx512bw = optimization.avx512vl = optimization.avx512vnni = 0; } +#endif +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_Cosine_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_Cosine_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). 
+ optimization.sve = 0; + } #endif unsigned char alignment = 0; arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization); @@ -1418,6 +1490,30 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8L2SqrTest) { optimization.avx512f = optimization.avx512bw = optimization.avx512vl = optimization.avx512vnni = 0; } +#endif +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = L2_UINT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_UINT8_L2_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve2 flag as well, so we'll choose the next option (default). + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = L2_UINT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_UINT8_L2_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). + optimization.sve = 0; + } #endif unsigned char alignment = 0; arch_opt_func = L2_UINT8_GetDistFunc(dim, &alignment, &optimization); @@ -1454,6 +1550,30 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8InnerProductTest) { optimization.avx512f = optimization.avx512bw = optimization.avx512vl = optimization.avx512vnni = 0; } +#endif +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = IP_UINT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_UINT8_IP_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve2 flag as well, so we'll choose the next option (default). + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = IP_UINT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_UINT8_IP_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). + optimization.sve = 0; + } #endif unsigned char alignment = 0; arch_opt_func = IP_UINT8_GetDistFunc(dim, &alignment, &optimization); @@ -1477,6 +1597,30 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8CosineTest) { dist_func_t arch_opt_func; float baseline = UINT8_Cosine(v1, v2, dim); +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = Cosine_UINT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_UINT8_Cosine_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve2 flag as well, so we'll choose the next option (default). 
+ optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = Cosine_UINT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_UINT8_Cosine_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). + optimization.sve = 0; + } +#endif #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && optimization.avx512vnni) { @@ -1492,12 +1636,90 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8CosineTest) { optimization.avx512vnni = 0; } #endif + unsigned char alignment = 0; arch_opt_func = Cosine_UINT8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, UINT8_Cosine) << "Unexpected distance function chosen for dim " << dim; ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } +TEST_P(UINT8SpacesOptimizationTest, UINT8_full_range_test) { + auto optimization = getCpuOptimizationFeatures(); + constexpr size_t dim = 512; + + uint8_t v1[dim + sizeof(float)]; + uint8_t v2[dim + sizeof(float)]; + + // v1: 0..255 followed by 255..0 + for (size_t i = 0; i < 256; i++) { + v1[i] = static_cast(i); + v1[256 + i] = static_cast(255 - i); + } + + // v2: 255..0 followed by 0..255 + for (size_t i = 0; i < 256; i++) { + v2[i] = static_cast(255 - i); + v2[256 + i] = static_cast(i); + } + + // write the norm at the end of the vector + *(float *)(v1 + dim) = test_utils::integral_compute_norm(v1, dim); + *(float *)(v2 + dim) = test_utils::integral_compute_norm(v2, dim); + + float baseline_l2 = UINT8_L2Sqr(v1, v2, dim); + float baseline_ip = UINT8_InnerProduct(v1, v2, dim); + float baseline_cosine = UINT8_Cosine(v1, v2, dim); + + dist_func_t arch_opt_func; + +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + + arch_opt_func = Choose_UINT8_L2_implementation_SVE2(dim); + ASSERT_EQ(baseline_l2, arch_opt_func(v1, v2, dim)) << "L2 SVE2 with dim " << dim; + arch_opt_func = Choose_UINT8_IP_implementation_SVE2(dim); + ASSERT_EQ(baseline_ip, arch_opt_func(v1, v2, dim)) << "IP SVE2 with dim " << dim; + arch_opt_func = Choose_UINT8_Cosine_implementation_SVE2(dim); + ASSERT_EQ(baseline_cosine, arch_opt_func(v1, v2, dim)) << "Cosine SVE2 with dim " << dim; + + // Unset sve2 flag as well, so we'll choose the next option (default). + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + + arch_opt_func = Choose_UINT8_L2_implementation_SVE(dim); + ASSERT_EQ(baseline_l2, arch_opt_func(v1, v2, dim)) << "L2 SVE with dim " << dim; + arch_opt_func = Choose_UINT8_IP_implementation_SVE(dim); + ASSERT_EQ(baseline_ip, arch_opt_func(v1, v2, dim)) << "IP SVE with dim " << dim; + arch_opt_func = Choose_UINT8_Cosine_implementation_SVE(dim); + ASSERT_EQ(baseline_cosine, arch_opt_func(v1, v2, dim)) << "Cosine SVE with dim " << dim; + + // Unset sve flag as well, so we'll choose the next option (default). 
+ optimization.sve = 0; + } +#endif +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && + optimization.avx512vnni) { + unsigned char alignment = 0; + + arch_opt_func = Choose_UINT8_L2_implementation_AVX512F_BW_VL_VNNI(dim); + ASSERT_EQ(baseline_l2, arch_opt_func(v1, v2, dim)) << "L2 AVX512 with dim " << dim; + arch_opt_func = Choose_UINT8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + ASSERT_EQ(baseline_ip, arch_opt_func(v1, v2, dim)) << "IP AVX512 with dim " << dim; + arch_opt_func = Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + ASSERT_EQ(baseline_cosine, arch_opt_func(v1, v2, dim)) << "Cosine AVX512 with dim " << dim; + + // Unset optimizations flag, so we'll choose the next optimization. + optimization.avx512f = optimization.avx512bw = optimization.avx512vl = + optimization.avx512vnni = 0; + } +#endif +} INSTANTIATE_TEST_SUITE_P(UINT8OptFuncs, UINT8SpacesOptimizationTest, testing::Range(32UL, 64 * 2UL + 1));
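
Reviewer note: for sanity-checking the new kernels on machines without SVE, below is a minimal scalar sketch of what the int8 inner-product and cosine distances are expected to compute, assuming the layout the tests use (dim int8 elements followed by a float holding the vector's L2 norm). The helper names (`ref_int8_ip_distance`, `ref_int8_cosine_distance`) are hypothetical and not part of this PR.

```cpp
// Scalar reference for the int8 SVE kernels above (hypothetical helper names).
#include <cstddef>
#include <cstdint>
#include <cstring>

static float ref_int8_ip_distance(const void *a, const void *b, size_t dim) {
    const int8_t *va = static_cast<const int8_t *>(a);
    const int8_t *vb = static_cast<const int8_t *>(b);
    int32_t dot = 0; // each |a_i * b_i| <= 2^14, so int32 is safe for any realistic dim
    for (size_t i = 0; i < dim; i++) {
        dot += static_cast<int32_t>(va[i]) * static_cast<int32_t>(vb[i]);
    }
    return 1.0f - static_cast<float>(dot); // same convention as INT8_InnerProductSIMD_SVE
}

static float ref_int8_cosine_distance(const void *a, const void *b, size_t dim) {
    const int8_t *va = static_cast<const int8_t *>(a);
    const int8_t *vb = static_cast<const int8_t *>(b);
    int32_t dot = 0;
    for (size_t i = 0; i < dim; i++) {
        dot += static_cast<int32_t>(va[i]) * static_cast<int32_t>(vb[i]);
    }
    float norm_a, norm_b; // the norms are stored as floats right after the int8 payload
    std::memcpy(&norm_a, va + dim, sizeof(float));
    std::memcpy(&norm_b, vb + dim, sizeof(float));
    return 1.0f - static_cast<float>(dot) / (norm_a * norm_b);
}
```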
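Reviewer note: the template parameters `<partial_chunk, additional_steps>` are assumed to be derived from `dim` and `svcntb()` by the `CHOOSE_SVE_IMPLEMENTATION` macro roughly as sketched here; this only illustrates the arithmetic, it is not the macro itself, and `sve_dispatch` is a hypothetical name.

```cpp
// How dim is assumed to map onto the main loop, the 0-3 extra full vectors,
// and the predicated tail, given vl = svcntb() (int8 lanes per SVE register).
#include <cstddef>
#include <cstdio>

struct SveDispatch {
    size_t main_chunks;             // iterations of the 4-accumulator main loop
    unsigned char additional_steps; // 0-3 remaining full SVE vectors
    bool partial_chunk;             // true when a predicated tail load is needed
};

static SveDispatch sve_dispatch(size_t dim, size_t vl) {
    const size_t chunk = 4 * vl;
    SveDispatch d;
    d.main_chunks = dim / chunk;
    d.additional_steps = static_cast<unsigned char>((dim % chunk) / vl);
    d.partial_chunk = (dim % vl) != 0;
    return d;
}

int main() {
    // Example: dim = 100 with 256-bit SVE (vl = 32 int8 lanes):
    // 0 main chunks, 3 full vectors (96 elements), plus a 4-element predicated tail.
    SveDispatch d = sve_dispatch(100, 32);
    std::printf("%zu %u %d\n", d.main_chunks, static_cast<unsigned>(d.additional_steps),
                static_cast<int>(d.partial_chunk));
    return 0;
}
```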
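Reviewer note: the L2 kernels rely on |a - b| fitting in a uint8 even for int8 inputs (at most 255), which is why `svabd` followed by an unsigned dot product is used; the squared difference then fits comfortably in 32-bit lanes for realistic dimensions. A scalar sketch of the same computation, with a hypothetical helper name:

```cpp
// Scalar reference mirroring the abs-diff-then-square trick of the SVE L2 kernels.
#include <cstddef>
#include <cstdint>
#include <cstdlib>

static float ref_int8_l2_sqr(const void *a, const void *b, size_t dim) {
    const int8_t *va = static_cast<const int8_t *>(a);
    const int8_t *vb = static_cast<const int8_t *>(b);
    uint32_t sum = 0;
    for (size_t i = 0; i < dim; i++) {
        // |va[i] - vb[i]| <= 255 (e.g. 127 - (-128)), so it fits in a uint8
        uint32_t diff = static_cast<uint32_t>(std::abs(int(va[i]) - int(vb[i])));
        sum += diff * diff; // diff * diff <= 65025
    }
    return static_cast<float>(sum);
}
```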