diff --git a/cmake/aarch64InstructionFlags.cmake b/cmake/aarch64InstructionFlags.cmake
index 9810485aa..7d6e72165 100644
--- a/cmake/aarch64InstructionFlags.cmake
+++ b/cmake/aarch64InstructionFlags.cmake
@@ -8,6 +8,7 @@
 CHECK_CXX_COMPILER_FLAG("-march=armv7-a+neon" CXX_ARMV7_NEON)
 CHECK_CXX_COMPILER_FLAG("-march=armv8-a" CXX_ARMV8A)
 CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve" CXX_SVE)
 CHECK_CXX_COMPILER_FLAG("-march=armv9-a+sve2" CXX_SVE2)
+CHECK_CXX_COMPILER_FLAG("-march=armv8.2-a+fp16fml" CXX_NEON_HP)
 CHECK_CXX_COMPILER_FLAG("-march=armv8.2-a+bf16" CXX_NEON_BF16)
 CHECK_CXX_COMPILER_FLAG("-march=armv8.2-a+sve+bf16" CXX_SVE_BF16)
@@ -17,7 +18,12 @@ if(CXX_SVE2)
     add_compile_definitions(OPT_SVE2)
 endif()
 if (CXX_ARMV8A OR CXX_ARMV7_NEON)
+    message(STATUS "Using ARMv8.0-a with NEON")
     add_compile_definitions(OPT_NEON)
+endif()
+if (CXX_NEON_HP)
+    message(STATUS "Using ARMv8.2-a with NEON half-precision extension")
+    add_compile_definitions(OPT_NEON_HP)
 endif()
 if (CXX_NEON_BF16)
     add_compile_definitions(OPT_NEON_BF16)
diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt
index f0e354265..2674e711d 100644
--- a/src/VecSim/spaces/CMakeLists.txt
+++ b/src/VecSim/spaces/CMakeLists.txt
@@ -91,6 +91,13 @@ if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)")
         list(APPEND OPTIMIZATIONS functions/NEON.cpp)
     endif()
 
+    # NEON half-precision support
+    if (CXX_NEON_HP AND CXX_ARMV8A)
+        message("Building with NEON + HP")
+        set_source_files_properties(functions/NEON_HP.cpp PROPERTIES COMPILE_FLAGS "-march=armv8.2-a+fp16fml")
+        list(APPEND OPTIMIZATIONS functions/NEON_HP.cpp)
+    endif()
+
     # NEON bfloat16 support
     if (CXX_NEON_BF16)
         message("Building with NEON + BF16")
diff --git a/src/VecSim/spaces/IP/IP_NEON_FP16.h b/src/VecSim/spaces/IP/IP_NEON_FP16.h
new file mode 100644
index 000000000..797b93ae6
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_NEON_FP16.h
@@ -0,0 +1,93 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include <arm_neon.h>
+
+inline void InnerProduct_Step(const float16_t *&vec1, const float16_t *&vec2, float16x8_t &acc) {
+    // Load half-precision vectors
+    float16x8_t v1 = vld1q_f16(vec1);
+    float16x8_t v2 = vld1q_f16(vec2);
+    vec1 += 8;
+    vec2 += 8;
+
+    // Multiply and accumulate
+    acc = vfmaq_f16(acc, v1, v2);
+}
+
+template <unsigned char residual> // 0..31
+float FP16_InnerProduct_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *vec1 = static_cast<const float16_t *>(pVect1v);
+    const auto *vec2 = static_cast<const float16_t *>(pVect2v);
+    const auto *const v1End = vec1 + dimension;
+    float16x8_t acc1 = vdupq_n_f16(0.0f);
+    float16x8_t acc2 = vdupq_n_f16(0.0f);
+    float16x8_t acc3 = vdupq_n_f16(0.0f);
+    float16x8_t acc4 = vdupq_n_f16(0.0f);
+
+    // First, handle the partial chunk residual
+    if constexpr (residual % 8) {
+        auto constexpr chunk_residual = residual % 8;
+        // TODO: special-case some residuals and benchmark whether it's faster
+        constexpr uint16x8_t mask = {
+            0xFFFF,
+            (chunk_residual >= 2) ? 0xFFFF : 0,
+            (chunk_residual >= 3) ? 0xFFFF : 0,
+            (chunk_residual >= 4) ? 0xFFFF : 0,
+            (chunk_residual >= 5) ? 0xFFFF : 0,
+            (chunk_residual >= 6) ? 0xFFFF : 0,
+            (chunk_residual >= 7) ? 0xFFFF : 0,
+            0,
+        };
+
+        // Load partial vectors
+        float16x8_t v1 = vld1q_f16(vec1);
+        float16x8_t v2 = vld1q_f16(vec2);
+
+        // Apply mask to both vectors
+        float16x8_t masked_v1 = vbslq_f16(mask, v1, acc1); // `acc1` should be all zeros here
+        float16x8_t masked_v2 = vbslq_f16(mask, v2, acc2); // `acc2` should be all zeros here
+
+        // Multiply and accumulate
+        acc1 = vfmaq_f16(acc1, masked_v1, masked_v2);
+
+        // Advance pointers
+        vec1 += chunk_residual;
+        vec2 += chunk_residual;
+    }
+
+    // Handle (residual - (residual % 8)) in chunks of 8 float16
+    if constexpr (residual >= 8)
+        InnerProduct_Step(vec1, vec2, acc2);
+    if constexpr (residual >= 16)
+        InnerProduct_Step(vec1, vec2, acc3);
+    if constexpr (residual >= 24)
+        InnerProduct_Step(vec1, vec2, acc4);
+
+    // Process the rest of the vectors (the full chunks part)
+    while (vec1 < v1End) {
+        // TODO: use `vld1q_f16_x4` for quad-loading?
+        InnerProduct_Step(vec1, vec2, acc1);
+        InnerProduct_Step(vec1, vec2, acc2);
+        InnerProduct_Step(vec1, vec2, acc3);
+        InnerProduct_Step(vec1, vec2, acc4);
+    }
+
+    // Accumulate accumulators
+    acc1 = vpaddq_f16(acc1, acc3);
+    acc2 = vpaddq_f16(acc2, acc4);
+    acc1 = vpaddq_f16(acc1, acc2);
+
+    // Horizontal sum of the accumulated values
+    float32x4_t sum_f32 = vcvt_f32_f16(vget_low_f16(acc1));
+    sum_f32 = vaddq_f32(sum_f32, vcvt_f32_f16(vget_high_f16(acc1)));
+
+    // Pairwise add to get horizontal sum
+    float32x2_t sum_2 = vadd_f32(vget_low_f32(sum_f32), vget_high_f32(sum_f32));
+    sum_2 = vpadd_f32(sum_2, sum_2);
+
+    // Extract result
+    return 1.0f - vget_lane_f32(sum_2, 0);
+}
diff --git a/src/VecSim/spaces/IP/IP_SVE_FP16.h b/src/VecSim/spaces/IP/IP_SVE_FP16.h
new file mode 100644
index 000000000..2ac849920
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_SVE_FP16.h
@@ -0,0 +1,72 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include <arm_sve.h>
+
+inline void InnerProduct_Step(const float16_t *vec1, const float16_t *vec2, svfloat16_t &acc,
+                              size_t &offset, const size_t chunk) {
+    svbool_t all = svptrue_b16();
+
+    // Load half-precision vectors.
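+    // `all` is an all-true (ptrue) predicate, so each load fills a whole
+    // vector-length register with svcnth() fp16 lanes.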
+    svfloat16_t v1 = svld1_f16(all, vec1 + offset);
+    svfloat16_t v2 = svld1_f16(all, vec2 + offset);
+    // Compute multiplications and add to the accumulator
+    acc = svmla_f16_x(all, acc, v1, v2);
+
+    // Move to next chunk
+    offset += chunk;
+}
+
+template <bool partial_chunk, unsigned char additional_steps> // [t/f, 0..3]
+float FP16_InnerProduct_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *vec1 = static_cast<const float16_t *>(pVect1v);
+    const auto *vec2 = static_cast<const float16_t *>(pVect2v);
+    const size_t chunk = svcnth(); // number of 16-bit elements in a register
+    svbool_t all = svptrue_b16();
+    svfloat16_t acc1 = svdup_f16(0.0f);
+    svfloat16_t acc2 = svdup_f16(0.0f);
+    svfloat16_t acc3 = svdup_f16(0.0f);
+    svfloat16_t acc4 = svdup_f16(0.0f);
+    size_t offset = 0;
+
+    // Process all full vectors
+    const size_t full_iterations = dimension / chunk / 4;
+    for (size_t iter = 0; iter < full_iterations; iter++) {
+        InnerProduct_Step(vec1, vec2, acc1, offset, chunk);
+        InnerProduct_Step(vec1, vec2, acc2, offset, chunk);
+        InnerProduct_Step(vec1, vec2, acc3, offset, chunk);
+        InnerProduct_Step(vec1, vec2, acc4, offset, chunk);
+    }
+
+    // Perform between 0 and 3 additional steps, according to `additional_steps` value
+    if constexpr (additional_steps >= 1)
+        InnerProduct_Step(vec1, vec2, acc1, offset, chunk);
+    if constexpr (additional_steps >= 2)
+        InnerProduct_Step(vec1, vec2, acc2, offset, chunk);
+    if constexpr (additional_steps >= 3)
+        InnerProduct_Step(vec1, vec2, acc3, offset, chunk);
+
+    // Handle the tail with the residual predicate
+    if constexpr (partial_chunk) {
+        svbool_t pg = svwhilelt_b16_u64(offset, dimension);
+
+        // Load half-precision vectors.
+        svfloat16_t v1 = svld1_f16(pg, vec1 + offset);
+        svfloat16_t v2 = svld1_f16(pg, vec2 + offset);
+        // Compute multiplications and add to the accumulator.
+        // use the existing value of `acc` for the inactive elements (by the `m` suffix)
+        acc4 = svmla_f16_m(pg, acc4, v1, v2);
+    }
+
+    // Accumulate accumulators
+    acc1 = svadd_f16_x(all, acc1, acc3);
+    acc2 = svadd_f16_x(all, acc2, acc4);
+    acc1 = svadd_f16_x(all, acc1, acc2);
+
+    // Reduce the accumulated sum.
+    float result = svaddv_f16(all, acc1);
+    return 1.0f - result;
+}
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index 1d23c587b..e930ee143 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -20,6 +20,7 @@
 #include "VecSim/spaces/functions/AVX2.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/NEON.h"
+#include "VecSim/spaces/functions/NEON_HP.h"
 #include "VecSim/spaces/functions/NEON_BF16.h"
 #include "VecSim/spaces/functions/SVE.h"
 #include "VecSim/spaces/functions/SVE_BF16.h"
@@ -214,14 +215,33 @@ dist_func_t<float> IP_FP16_GetDistFunc(size_t dim, unsigned char *alignment, con
     if (alignment == nullptr) {
         alignment = &dummy_alignment;
     }
+    auto features = getCpuOptimizationFeatures(arch_opt);
     dist_func_t<float> ret_dist_func = FP16_InnerProduct;
+
+#if defined(CPU_FEATURES_ARCH_AARCH64)
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_FP16_IP_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_FP16_IP_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (features.asimdhp && dim >= 8) { // Optimization assumes at least 8 16FPs (full chunk)
+        return Choose_FP16_IP_implementation_NEON_HP(dim);
+    }
+#endif
+#endif
+
+#if defined(CPU_FEATURES_ARCH_X86_64)
     // Optimizations assume at least 32 16FPs. If we have less, we use the naive implementation.
     if (dim < 32) {
         return ret_dist_func;
     }
-#ifdef CPU_FEATURES_ARCH_X86_64
-    auto features = getCpuOptimizationFeatures(arch_opt);
 #ifdef OPT_AVX512_FP16_VL
     // More details about the dimension limitation can be found in this PR's description:
     // https://github.com/RedisAI/VectorSimilarity/pull/477
diff --git a/src/VecSim/spaces/L2/L2_NEON_FP16.h b/src/VecSim/spaces/L2/L2_NEON_FP16.h
new file mode 100644
index 000000000..b2ad8c0ee
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_NEON_FP16.h
@@ -0,0 +1,97 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include <arm_neon.h>
+
+inline void L2Sqr_Step(const float16_t *&vec1, const float16_t *&vec2, float16x8_t &acc) {
+    // Load half-precision vectors
+    float16x8_t v1 = vld1q_f16(vec1);
+    float16x8_t v2 = vld1q_f16(vec2);
+    vec1 += 8;
+    vec2 += 8;
+
+    // Calculate differences
+    float16x8_t diff = vsubq_f16(v1, v2);
+    // Square and accumulate
+    acc = vfmaq_f16(acc, diff, diff);
+}
+
+template <unsigned char residual> // 0..31
+float FP16_L2Sqr_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *vec1 = static_cast<const float16_t *>(pVect1v);
+    const auto *vec2 = static_cast<const float16_t *>(pVect2v);
+    const auto *const v1End = vec1 + dimension;
+    float16x8_t acc1 = vdupq_n_f16(0.0f);
+    float16x8_t acc2 = vdupq_n_f16(0.0f);
+    float16x8_t acc3 = vdupq_n_f16(0.0f);
+    float16x8_t acc4 = vdupq_n_f16(0.0f);
+
+    // First, handle the partial chunk residual
+    if constexpr (residual % 8) {
+        auto constexpr chunk_residual = residual % 8;
+        // TODO: special-case some residuals and benchmark whether it's faster
+        constexpr uint16x8_t mask = {
+            0xFFFF,
+            (chunk_residual >= 2) ? 0xFFFF : 0,
+            (chunk_residual >= 3) ? 0xFFFF : 0,
+            (chunk_residual >= 4) ? 0xFFFF : 0,
+            (chunk_residual >= 5) ? 0xFFFF : 0,
+            (chunk_residual >= 6) ? 0xFFFF : 0,
+            (chunk_residual >= 7) ? 0xFFFF : 0,
+            0,
+        };
+
+        // Load partial vectors
+        float16x8_t v1 = vld1q_f16(vec1);
+        float16x8_t v2 = vld1q_f16(vec2);
+
+        // Apply mask to both vectors
+        float16x8_t masked_v1 = vbslq_f16(mask, v1, acc1); // `acc1` should be all zeros here
+        float16x8_t masked_v2 = vbslq_f16(mask, v2, acc2); // `acc2` should be all zeros here
+
+        // Calculate differences
+        float16x8_t diff = vsubq_f16(masked_v1, masked_v2);
+        // Square and accumulate
+        acc1 = vfmaq_f16(acc1, diff, diff);
+
+        // Advance pointers
+        vec1 += chunk_residual;
+        vec2 += chunk_residual;
+    }
+
+    // Handle (residual - (residual % 8)) in chunks of 8 float16
+    if constexpr (residual >= 8)
+        L2Sqr_Step(vec1, vec2, acc2);
+    if constexpr (residual >= 16)
+        L2Sqr_Step(vec1, vec2, acc3);
+    if constexpr (residual >= 24)
+        L2Sqr_Step(vec1, vec2, acc4);
+
+    // Process the rest of the vectors (the full chunks part)
+    while (vec1 < v1End) {
+        // TODO: use `vld1q_f16_x4` for quad-loading?
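+        // Four independent accumulators break the dependency chain on the FMA,
+        // so each loop iteration consumes 4 x 8 = 32 fp16 elements.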
+        L2Sqr_Step(vec1, vec2, acc1);
+        L2Sqr_Step(vec1, vec2, acc2);
+        L2Sqr_Step(vec1, vec2, acc3);
+        L2Sqr_Step(vec1, vec2, acc4);
+    }
+
+    // Accumulate accumulators
+    acc1 = vpaddq_f16(acc1, acc3);
+    acc2 = vpaddq_f16(acc2, acc4);
+    acc1 = vpaddq_f16(acc1, acc2);
+
+    // Horizontal sum of the accumulated values
+    float32x4_t sum_f32 = vcvt_f32_f16(vget_low_f16(acc1));
+    sum_f32 = vaddq_f32(sum_f32, vcvt_f32_f16(vget_high_f16(acc1)));
+
+    // Pairwise add to get horizontal sum
+    float32x2_t sum_2 = vadd_f32(vget_low_f32(sum_f32), vget_high_f32(sum_f32));
+    sum_2 = vpadd_f32(sum_2, sum_2);
+
+    // Extract result
+    return vget_lane_f32(sum_2, 0);
+}
diff --git a/src/VecSim/spaces/L2/L2_SVE_FP16.h b/src/VecSim/spaces/L2/L2_SVE_FP16.h
new file mode 100644
index 000000000..4ce288d73
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_SVE_FP16.h
@@ -0,0 +1,73 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include <arm_sve.h>
+
+inline void L2Sqr_Step(const float16_t *vec1, const float16_t *vec2, svfloat16_t &acc,
+                       size_t &offset, const size_t chunk) {
+    svbool_t all = svptrue_b16();
+
+    svfloat16_t v1 = svld1_f16(all, vec1 + offset);
+    svfloat16_t v2 = svld1_f16(all, vec2 + offset);
+    // Compute difference in half precision.
+    svfloat16_t diff = svsub_f16_x(all, v1, v2);
+    // Square the differences and accumulate
+    acc = svmla_f16_x(all, acc, diff, diff);
+    offset += chunk;
+}
+
+template <bool partial_chunk, unsigned char additional_steps> // [t/f, 0..3]
+float FP16_L2Sqr_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *vec1 = static_cast<const float16_t *>(pVect1v);
+    const auto *vec2 = static_cast<const float16_t *>(pVect2v);
+    const size_t chunk = svcnth(); // number of 16-bit elements in a register
+    svbool_t all = svptrue_b16();
+    svfloat16_t acc1 = svdup_f16(0.0f);
+    svfloat16_t acc2 = svdup_f16(0.0f);
+    svfloat16_t acc3 = svdup_f16(0.0f);
+    svfloat16_t acc4 = svdup_f16(0.0f);
+    size_t offset = 0;
+
+    // Process all full vectors
+    const size_t full_iterations = dimension / chunk / 4;
+    for (size_t iter = 0; iter < full_iterations; iter++) {
+        L2Sqr_Step(vec1, vec2, acc1, offset, chunk);
+        L2Sqr_Step(vec1, vec2, acc2, offset, chunk);
+        L2Sqr_Step(vec1, vec2, acc3, offset, chunk);
+        L2Sqr_Step(vec1, vec2, acc4, offset, chunk);
+    }
+
+    // Perform between 0 and 3 additional steps, according to `additional_steps` value
+    if constexpr (additional_steps >= 1)
+        L2Sqr_Step(vec1, vec2, acc1, offset, chunk);
+    if constexpr (additional_steps >= 2)
+        L2Sqr_Step(vec1, vec2, acc2, offset, chunk);
+    if constexpr (additional_steps >= 3)
+        L2Sqr_Step(vec1, vec2, acc3, offset, chunk);
+
+    // Handle partial chunk, if needed
+    if constexpr (partial_chunk) {
+        svbool_t pg = svwhilelt_b16_u64(offset, dimension);
+
+        // Load half-precision vectors.
+        svfloat16_t v1 = svld1_f16(pg, vec1 + offset);
+        svfloat16_t v2 = svld1_f16(pg, vec2 + offset);
+        // Compute difference in half precision.
+        svfloat16_t diff = svsub_f16_x(pg, v1, v2);
+        // Square the differences.
+        // Use `m` suffix to keep the inactive elements as they are in `acc`
+        acc4 = svmla_f16_m(pg, acc4, diff, diff);
+    }
+
+    // Accumulate accumulators
+    acc1 = svadd_f16_x(all, acc1, acc3);
+    acc2 = svadd_f16_x(all, acc2, acc4);
+    acc1 = svadd_f16_x(all, acc1, acc2);
+
+    // Reduce the accumulated sum.
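+    // svaddv_f16 horizontally adds all active fp16 lanes into a single scalar,
+    // which is then widened to float on assignment.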
+    float result = svaddv_f16(all, acc1);
+    return result;
+}
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index d1b5306f2..7acf2eb6d 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -19,6 +19,7 @@
 #include "VecSim/spaces/functions/AVX2.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/NEON.h"
+#include "VecSim/spaces/functions/NEON_HP.h"
 #include "VecSim/spaces/functions/NEON_BF16.h"
 #include "VecSim/spaces/functions/SVE.h"
 #include "VecSim/spaces/functions/SVE_BF16.h"
@@ -205,14 +206,33 @@ dist_func_t<float> L2_FP16_GetDistFunc(size_t dim, unsigned char *alignment, con
     if (alignment == nullptr) {
         alignment = &dummy_alignment;
     }
+    auto features = getCpuOptimizationFeatures(arch_opt);
     dist_func_t<float> ret_dist_func = FP16_L2Sqr;
+
+#if defined(CPU_FEATURES_ARCH_AARCH64)
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_FP16_L2_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_FP16_L2_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (features.asimdhp && dim >= 8) { // Optimization assumes at least 8 16FPs (full chunk)
+        return Choose_FP16_L2_implementation_NEON_HP(dim);
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
+
+#if defined(CPU_FEATURES_ARCH_X86_64)
     // Optimizations assume at least 32 16FPs. If we have less, we use the naive implementation.
     if (dim < 32) {
         return ret_dist_func;
     }
-#ifdef CPU_FEATURES_ARCH_X86_64
-    auto features = getCpuOptimizationFeatures(arch_opt);
 #ifdef OPT_AVX512_FP16_VL
     // More details about the dimension limitation can be found in this PR's description:
     // https://github.com/RedisAI/VectorSimilarity/pull/477
diff --git a/src/VecSim/spaces/functions/NEON_HP.cpp b/src/VecSim/spaces/functions/NEON_HP.cpp
new file mode 100644
index 000000000..aed96750e
--- /dev/null
+++ b/src/VecSim/spaces/functions/NEON_HP.cpp
@@ -0,0 +1,30 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "NEON_HP.h"
+
+#include "VecSim/spaces/L2/L2_NEON_FP16.h"
+#include "VecSim/spaces/IP/IP_NEON_FP16.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+
+dist_func_t<float> Choose_FP16_L2_implementation_NEON_HP(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, FP16_L2Sqr_NEON_HP);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_FP16_IP_implementation_NEON_HP(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, FP16_InnerProduct_NEON_HP);
+    return ret_dist_func;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/NEON_HP.h b/src/VecSim/spaces/functions/NEON_HP.h
new file mode 100644
index 000000000..594f653d3
--- /dev/null
+++ b/src/VecSim/spaces/functions/NEON_HP.h
@@ -0,0 +1,17 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */ + +#pragma once + +#include "VecSim/spaces/spaces.h" + +namespace spaces { + +dist_func_t Choose_FP16_IP_implementation_NEON_HP(size_t dim); + +dist_func_t Choose_FP16_L2_implementation_NEON_HP(size_t dim); + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp index cadf622a8..e3ebc5209 100644 --- a/src/VecSim/spaces/functions/SVE.cpp +++ b/src/VecSim/spaces/functions/SVE.cpp @@ -5,9 +5,13 @@ */ #include "SVE.h" + #include "VecSim/spaces/L2/L2_SVE_FP32.h" #include "VecSim/spaces/IP/IP_SVE_FP32.h" +#include "VecSim/spaces/IP/IP_SVE_FP16.h" +#include "VecSim/spaces/L2/L2_SVE_FP16.h" + #include "VecSim/spaces/IP/IP_SVE_FP64.h" #include "VecSim/spaces/L2/L2_SVE_FP64.h" @@ -26,6 +30,17 @@ dist_func_t Choose_FP32_L2_implementation_SVE(size_t dim) { return ret_dist_func; } +dist_func_t Choose_FP16_IP_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, FP16_InnerProduct_SVE, dim, svcnth); + return ret_dist_func; +} +dist_func_t Choose_FP16_L2_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, FP16_L2Sqr_SVE, dim, svcnth); + return ret_dist_func; +} + dist_func_t Choose_FP64_IP_implementation_SVE(size_t dim) { dist_func_t ret_dist_func; CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, FP64_InnerProductSIMD_SVE, dim, svcntd); diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h index 6cc3d5bbf..35d1ffe4c 100644 --- a/src/VecSim/spaces/functions/SVE.h +++ b/src/VecSim/spaces/functions/SVE.h @@ -11,9 +11,12 @@ namespace spaces { dist_func_t Choose_FP32_IP_implementation_SVE(size_t dim); -dist_func_t Choose_FP64_IP_implementation_SVE(size_t dim); - dist_func_t Choose_FP32_L2_implementation_SVE(size_t dim); + +dist_func_t Choose_FP16_IP_implementation_SVE(size_t dim); +dist_func_t Choose_FP16_L2_implementation_SVE(size_t dim); + +dist_func_t Choose_FP64_IP_implementation_SVE(size_t dim); dist_func_t Choose_FP64_L2_implementation_SVE(size_t dim); } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp index 4256af788..c52c99da9 100644 --- a/src/VecSim/spaces/functions/SVE2.cpp +++ b/src/VecSim/spaces/functions/SVE2.cpp @@ -5,9 +5,13 @@ */ #include "SVE2.h" + #include "VecSim/spaces/L2/L2_SVE_FP32.h" #include "VecSim/spaces/IP/IP_SVE_FP32.h" +#include "VecSim/spaces/IP/IP_SVE_FP16.h" // using SVE implementation, but different compilation flags +#include "VecSim/spaces/L2/L2_SVE_FP16.h" // using SVE implementation, but different compilation flags + #include "VecSim/spaces/IP/IP_SVE_FP64.h" #include "VecSim/spaces/L2/L2_SVE_FP64.h" @@ -26,6 +30,17 @@ dist_func_t Choose_FP32_L2_implementation_SVE2(size_t dim) { return ret_dist_func; } +dist_func_t Choose_FP16_IP_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, FP16_InnerProduct_SVE, dim, svcnth); + return ret_dist_func; +} +dist_func_t Choose_FP16_L2_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, FP16_L2Sqr_SVE, dim, svcnth); + return ret_dist_func; +} + dist_func_t Choose_FP64_IP_implementation_SVE2(size_t dim) { dist_func_t ret_dist_func; CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, FP64_InnerProductSIMD_SVE, dim, svcntd); diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h index 84a4b4af8..67e36dc20 100644 --- a/src/VecSim/spaces/functions/SVE2.h +++ 
@@ -11,11 +11,12 @@ namespace spaces {
 
 dist_func_t<float> Choose_FP32_IP_implementation_SVE2(size_t dim);
-
 dist_func_t<float> Choose_FP32_L2_implementation_SVE2(size_t dim);
-dist_func_t<double> Choose_FP64_IP_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_FP16_IP_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_FP16_L2_implementation_SVE2(size_t dim);
+
+dist_func_t<double> Choose_FP64_IP_implementation_SVE2(size_t dim);
 dist_func_t<double> Choose_FP64_L2_implementation_SVE2(size_t dim);
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/implementation_chooser_cleanup.h b/src/VecSim/spaces/functions/implementation_chooser_cleanup.h
index f93438a11..eff414be3 100644
--- a/src/VecSim/spaces/functions/implementation_chooser_cleanup.h
+++ b/src/VecSim/spaces/functions/implementation_chooser_cleanup.h
@@ -23,4 +23,7 @@
 #undef CASES32
 #undef CASES64
 
+#undef SVE_CASE
+
 #undef CHOOSE_IMPLEMENTATION
+#undef CHOOSE_SVE_IMPLEMENTATION
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h
index d986d23f2..b14146d20 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces.h
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h
@@ -25,6 +25,7 @@
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/SSE.h"
 #include "VecSim/spaces/functions/NEON.h"
+#include "VecSim/spaces/functions/NEON_HP.h"
 #include "VecSim/spaces/functions/NEON_BF16.h"
 #include "VecSim/spaces/functions/SVE.h"
 #include "VecSim/spaces/functions/SVE_BF16.h"
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp
index 9457bc77d..096caefff 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp
@@ -13,6 +13,25 @@ class BM_VecSimSpaces_FP16 : public BM_VecSimSpaces {
     }
 };
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features;
+
+// NEON half-precision implementation for ARMv8.2-a
+#ifdef OPT_NEON_HP
+bool neon_supported = opt.asimdhp; // NEON fp16 arithmetic requires the asimdhp extension
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16, FP16, NEON_HP, 32, neon_supported);
+#endif
+#ifdef OPT_SVE
+bool sve_supported = opt.sve; // Check for SVE support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16, FP16, SVE, 32, sve_supported);
+#endif
+// SVE2 implementation
+#ifdef OPT_SVE2
+bool sve2_supported = opt.sve2; // Check for SVE2 support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16, FP16, SVE2, 32, sve2_supported);
+#endif
+#endif // AARCH64
+
 #ifdef CPU_FEATURES_ARCH_X86_64
 cpu_features::X86Features opt = cpu_features::GetX86Info().features;
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 3e03db6c9..19487bc8d 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -28,6 +28,7 @@
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/F16C.h"
 #include "VecSim/spaces/functions/NEON.h"
+#include "VecSim/spaces/functions/NEON_HP.h"
 #include "VecSim/spaces/functions/NEON_BF16.h"
 #include "VecSim/spaces/functions/SVE.h"
 #include "VecSim/spaces/functions/SVE_BF16.h"
@@ -979,8 +980,8 @@ TEST_P(FP16SpacesOptimizationTest, FP16InnerProductTest) {
     dist_func_t<float> arch_opt_func;
     float baseline = FP16_InnerProduct(v1, v2, dim);
     ASSERT_EQ(baseline, FP32_InnerProduct(v1_fp32, v2_fp32, dim)) << "Baseline check " << dim;
-    // Turn off advanced fp16 flags. They will be tested in the next test.
 #if defined(CPU_FEATURES_ARCH_X86_64)
+    // Turn off advanced fp16 flags. They will be tested in the next test.
     optimization.avx512_fp16 = optimization.avx512vl = 0;
 #ifdef OPT_AVX512F
     if (optimization.avx512f) {
@@ -1004,12 +1005,15 @@
         optimization.f16c = optimization.fma3 = optimization.avx = 0;
     }
 #endif
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+    // Turn off advanced fp16 flags. They will be tested in the next test.
+    optimization.sve = optimization.sve2 = optimization.asimdhp = 0;
 #endif
     unsigned char alignment = 0;
     arch_opt_func = IP_FP16_GetDistFunc(dim, &alignment, &optimization);
     ASSERT_EQ(arch_opt_func, FP16_InnerProduct)
         << "Unexpected distance function chosen for dim " << dim;
-    ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "F16C with dim " << dim;
+    ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim;
     ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
 }
 
@@ -1058,6 +1062,42 @@
         optimization.f16c = optimization.fma3 = optimization.avx = 0;
     }
 #endif
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP16_L2_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE2 with dim " << dim;
+        // Unset sve2 flag as well, so we'll choose the next option (default).
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP16_L2_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE with dim " << dim;
+        // Unset sve flag as well, so we'll choose the next option (default).
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (optimization.asimdhp) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP16_L2_implementation_NEON_HP(dim))
+            << "Unexpected distance function chosen for dim OPT_NEON_HP " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "NEON_HP with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment NEON_HP with dim " << dim;
+        optimization.asimdhp = 0;
+    }
+#endif
 #endif
     unsigned char alignment = 0;
     arch_opt_func = L2_FP16_GetDistFunc(dim, &alignment, &optimization);
@@ -1080,35 +1120,43 @@ INSTANTIATE_TEST_SUITE_P(FP16OptFuncs, FP16SpacesOptimizationTest,
  * that has different logic than the float32 and float64 reduce functions.
 * For more info, refer to intel's intrinsics guide.
  */
-#ifdef OPT_AVX512_FP16_VL
+#if defined(OPT_AVX512_FP16_VL) || defined(CPU_FEATURES_ARCH_AARCH64)
 class FP16SpacesOptimizationTestAdvanced : public testing::TestWithParam<size_t> {};
 
 TEST_P(FP16SpacesOptimizationTestAdvanced, FP16InnerProductTestAdv) {
-    auto optimization = cpu_features::GetX86Info().features;
-    if (optimization.avx512_fp16 && optimization.avx512vl) {
-        size_t dim = GetParam();
-        float16 v1[dim], v2[dim];
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = GetParam();
+    float16 v1[dim], v2[dim];
 
-        std::mt19937 gen(42);
-        std::uniform_real_distribution<> dis(-0.99, 0.99);
+    std::mt19937 gen(42);
+    std::uniform_real_distribution<> dis(-0.99, 0.99);
 
-        _Float16 baseline = 0;
-        for (size_t i = 0; i < dim; i++) {
-            float val1 = (dis(gen));
-            float val2 = (dis(gen));
-            v1[i] = vecsim_types::FP32_to_FP16((val1));
-            v2[i] = vecsim_types::FP32_to_FP16((val2));
+#if defined(CPU_FEATURES_ARCH_AARCH64) && defined(__GNUC__) && (__GNUC__ < 13)
+    // https://github.com/pytorch/executorch/issues/6844
+    __fp16 baseline = 0;
+#else
+    _Float16 baseline = 0;
+#endif
 
-            baseline += static_cast<_Float16>(val1) * static_cast<_Float16>(val2);
-        }
-        baseline = _Float16(1) - baseline;
+    for (size_t i = 0; i < dim; i++) {
+        float val1 = (dis(gen));
+        float val2 = (dis(gen));
+        v1[i] = vecsim_types::FP32_to_FP16((val1));
+        v2[i] = vecsim_types::FP32_to_FP16((val2));
 
-        auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
-            size_t elements_in_reg = reg_bit_size / sizeof(float16) / 8;
-            return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(float16) : 0;
-        };
+        baseline += static_cast<decltype(baseline)>(val1) * static_cast<decltype(baseline)>(val2);
+    }
+    baseline = decltype(baseline)(1) - baseline;
 
-        dist_func_t<float> arch_opt_func;
+    auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
+        size_t elements_in_reg = reg_bit_size / sizeof(float16) / 8;
+        return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(float16) : 0;
+    };
+
+    dist_func_t<float> arch_opt_func;
+
+#ifdef OPT_AVX512_FP16_VL
+    if (optimization.avx512_fp16 && optimization.avx512vl) {
         unsigned char alignment = 0;
         arch_opt_func = IP_FP16_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_FP16_IP_implementation_AVX512FP16_VL(dim))
@@ -1121,8 +1169,64 @@ TEST_P(FP16SpacesOptimizationTestAdvanced, FP16InnerProductTestAdv) {
             << ", dist: " << dist;
         ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
     }
+#endif
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP16_IP_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        float dist = arch_opt_func(v1, v2, dim);
+        float f_baseline = baseline;
+        float error = std::abs((dist / f_baseline) - 1);
+        // Allow 1% error
+        ASSERT_LE(error, 0.01) << "SVE2 with dim " << dim << ", baseline: " << f_baseline
+                               << ", dist: " << dist;
+        // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "SVE2 with dim " << dim;
+        // Unset sve2 flag as well, so we'll choose the next option (default).
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP16_IP_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        float dist = arch_opt_func(v1, v2, dim);
+        float f_baseline = baseline;
+        float error = std::abs((dist / f_baseline) - 1);
+        // Allow 1% error
+        ASSERT_LE(error, 0.01) << "SVE with dim " << dim << ", baseline: " << f_baseline
+                               << ", dist: " << dist;
+        // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "SVE with dim " << dim;
+        // Unset sve flag as well, so we'll choose the next option (default).
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (optimization.asimdhp) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP16_IP_implementation_NEON_HP(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        float dist = arch_opt_func(v1, v2, dim);
+        float f_baseline = baseline;
+        float error = std::abs((dist / f_baseline) - 1);
+        // Allow 1% error
+        ASSERT_LE(error, 0.01) << "NEON_HP with dim " << dim << ", baseline: " << f_baseline
+                               << ", dist: " << dist;
+        // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "NEON_HP with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "NEON_HP with dim " << dim;
+        // Unset asimdhp flag as well, so we'll choose the next option (default).
+        optimization.asimdhp = 0;
+    }
+#endif
 }
 
+#ifdef OPT_AVX512_FP16_VL
 TEST_P(FP16SpacesOptimizationTestAdvanced, FP16L2SqrTestAdv) {
     auto optimization = cpu_features::GetX86Info().features;
     if (optimization.avx512_fp16 && optimization.avx512vl) {
@@ -1162,12 +1266,13 @@ TEST_P(FP16SpacesOptimizationTestAdvanced, FP16L2SqrTestAdv) {
         ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
     }
 }
+#endif
 
 // Start from a 32 multiplier
 INSTANTIATE_TEST_SUITE_P(, FP16SpacesOptimizationTestAdvanced,
                          testing::Range(512UL, 512 + 32UL + 1));
 
-#endif
+#endif // defined(OPT_AVX512_FP16_VL) || defined(CPU_FEATURES_ARCH_AARCH64)
 
 class INT8SpacesOptimizationTest : public testing::TestWithParam<size_t> {};