diff --git a/.install/install_script.sh b/.install/install_script.sh
index 69a871b31..2ca688a67 100755
--- a/.install/install_script.sh
+++ b/.install/install_script.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 
 OS_TYPE=$(uname -s)
+ARCH=$(uname -m)
 
 if [[ $OS_TYPE = 'Darwin' ]]
 then
@@ -16,6 +17,23 @@ else
     OS=$(echo $OS | sed 's/[/ ]/_/g') # replace spaces and slashes with underscores
 fi
 echo $OS
+# Install Arm Performance Libraries (ArmPL) 24.10 into ./armpl, on Arm hosts only.
+if [[ $ARCH == 'aarch64' ]]
+then
+    wget https://developer.arm.com/-/cdn-downloads/permalink/Arm-Performance-Libraries/Version_24.10/arm-performance-libraries_24.10_deb_gcc.tar
+    tar -xf arm-performance-libraries_24.10_deb_gcc.tar
+    sudo ./arm-performance-libraries_24.10_deb/arm-performance-libraries_24.10_deb.sh --accept --install-to armpl
+    sudo rm -rf arm-performance-libraries_24.10_deb_gcc.tar arm-performance-libraries_24.10_deb
+elif [[ $OS_TYPE == 'Darwin' && $ARCH == 'arm64' ]]
+then
+    wget https://developer.arm.com/-/cdn-downloads/permalink/Arm-Performance-Libraries/Version_24.10/arm-performance-libraries_24.10_macOS.tgz
+    tar zxvf arm-performance-libraries_24.10_macOS.tgz
+    hdiutil attach armpl_24.10_flang-new_clang_19.dmg
+    sudo /Volumes/armpl_24.10_flang-new_clang_19_installer/armpl_24.10_flang-new_clang_19_install.sh --install-to="$(pwd)/armpl" -y
+    # Clean up
+    hdiutil detach /Volumes/armpl_24.10_flang-new_clang_19_installer
+    rm -f arm-performance-libraries_24.10_macOS.tgz armpl_24.10_flang-new_clang_19.dmg
+fi
 
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 
diff --git a/cmake/aarch64InstructionFlags.cmake b/cmake/aarch64InstructionFlags.cmake
index 7c1363262..1cf97ad4e 100644
--- a/cmake/aarch64InstructionFlags.cmake
+++ b/cmake/aarch64InstructionFlags.cmake
@@ -7,12 +7,13 @@ message(STATUS "Building for ARM aarch64")
 CHECK_CXX_COMPILER_FLAG("-march=armv7-a+neon" CXX_ARMV7_NEON)
 CHECK_CXX_COMPILER_FLAG("-march=armv8-a" CXX_ARMV8A)
 CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve" CXX_SVE)
-CHECK_CXX_COMPILER_FLAG("-march=armv9-a+sve2" CXX_ARMV9)
+CHECK_CXX_COMPILER_FLAG("-march=armv9-a+sve2" CXX_SVE2)
+
 
 # Only use ARMv9 if both compiler and CPU support it
-if(CXX_ARMV9)
+if(CXX_SVE2)
   message(STATUS "Using ARMv9.0-a with SVE2 (supported by CPU)")
-  add_compile_definitions(OPT_ARMV9)
+  add_compile_definitions(OPT_SVE2)
 endif()
 if (CXX_ARMV8A OR CXX_ARMV7_NEON)
   add_compile_definitions(OPT_NEON)
diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt
index 1fc9473b2..e11d17ac8 100644
--- a/src/VecSim/spaces/CMakeLists.txt
+++ b/src/VecSim/spaces/CMakeLists.txt
@@ -81,6 +81,58 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 	endif()
 endif()
 
+if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)")
+	message(STATUS "Enabling Arm Performance Libraries integration")
+	include(${root}/cmake/aarch64InstructionFlags.cmake)
+	# Locate the ArmPL 24.10 install: prefer the in-tree copy (gcc build, then the
+	# flang-new/clang build) and fall back to the system-wide /opt/arm location.
+	set(ARMPL_DIR "")
+	foreach(armpl_candidate
+			"${root}/armpl/armpl_24.10_gcc"
+			"${root}/armpl/armpl_24.10_flang-new_clang_19"
+			"/opt/arm/armpl_24.10_gcc"
+			"/opt/arm/armpl_24.10_flang-new_clang_19")
+		if(EXISTS "${armpl_candidate}")
+			set(ARMPL_DIR "${armpl_candidate}")
+			break()
+		endif()
+	endforeach()
+	if("${ARMPL_DIR}" STREQUAL "")
+		# Default fallback path
+		set(ARMPL_DIR "${root}/armpl/armpl_24.10_gcc")
+		message(WARNING "None of the expected ARM Performance Libraries paths exist. Using default: ${ARMPL_DIR}")
+	endif()
+
+	include_directories("${ARMPL_DIR}/include")
+	# ARMPL_DIR/lib is searched in addition to CMake's default library locations.
+	find_library(ARMPL_LIB armpl_lp64 PATHS "${ARMPL_DIR}/lib")
+
+	# The ARMPL_*.cpp sources added below include <armpl.h> and the spaces library
+	# links ARMPL_LIB, so a missing library must stop the configure step right here.
+	if(NOT ARMPL_LIB)
+		message(FATAL_ERROR "Could not find ARM Performance Libraries (armpl_lp64) under ${ARMPL_DIR}/lib")
+	endif()
+	message(STATUS "Found ARM Performance Libraries: ${ARMPL_LIB}")
+
+	# Each ArmPL-backed source file gets its own -march flag and is only added when
+	# the compiler accepted that flag (see aarch64InstructionFlags.cmake).
+	if (CXX_ARMV8A)
+		message("Building with ARMV8A and ArmPL")
+		set_source_files_properties(functions/ARMPL_NEON.cpp PROPERTIES COMPILE_FLAGS "-march=armv8-a")
+		list(APPEND OPTIMIZATIONS functions/ARMPL_NEON.cpp)
+	endif()
+	if (CXX_SVE)
+		message("Building with SVE and ArmPL")
+		set_source_files_properties(functions/ARMPL_SVE.cpp PROPERTIES COMPILE_FLAGS "-march=armv8-a+sve")
+		list(APPEND OPTIMIZATIONS functions/ARMPL_SVE.cpp)
+	endif()
+	if (CXX_SVE2)
+		message("Building with ARMV9 and ArmPL")
+		set_source_files_properties(functions/ARMPL_SVE2.cpp PROPERTIES COMPILE_FLAGS "-march=armv9-a+sve2")
+		list(APPEND OPTIMIZATIONS functions/ARMPL_SVE2.cpp)
+	endif()
+endif()
+
 # Here we are compiling the space selectors with the relevant optimization flag.
 add_library(VectorSimilaritySpaces
 	L2_space.cpp
@@ -90,4 +142,4 @@ add_library(VectorSimilaritySpaces
 	computer/preprocessor_container.cpp
 )
 
-target_link_libraries(VectorSimilaritySpaces VectorSimilaritySpaces_no_optimization cpu_features)
+target_link_libraries(VectorSimilaritySpaces VectorSimilaritySpaces_no_optimization cpu_features ${ARMPL_LIB})
diff --git a/src/VecSim/spaces/IP/IP_ARMPL_NEON_FP32.h b/src/VecSim/spaces/IP/IP_ARMPL_NEON_FP32.h
new file mode 100644
index 000000000..4ab901456
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_ARMPL_NEON_FP32.h
@@ -0,0 +1,24 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#pragma once
+
+#include "VecSim/spaces/space_includes.h"
+#include <armpl.h>
+
+// Inner-product distance (1 - <v1, v2>) between two FP32 vectors of `dimension`
+// elements; the dot product is delegated to ArmPL's cblas_sdot, which takes the
+// length as an int. `inline` keeps this header-defined function safe to include
+// from more than one translation unit.
+inline float FP32_InnerProduct_ARMPL_NEON(const void *pVect1v, const void *pVect2v,
+                                          size_t dimension) {
+    const auto *vec1 = static_cast<const float *>(pVect1v);
+    const auto *vec2 = static_cast<const float *>(pVect2v);
+
+    // Notice: Armpl can choose different implementation based on cpu features.
+    float res = cblas_sdot(static_cast<int>(dimension), vec1, 1, vec2, 1);
+    return 1.0f - res;
+}
diff --git a/src/VecSim/spaces/IP/IP_ARMPL_SVE2_FP32.h b/src/VecSim/spaces/IP/IP_ARMPL_SVE2_FP32.h
new file mode 100644
index 000000000..1014f791e
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_ARMPL_SVE2_FP32.h
@@ -0,0 +1,24 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#pragma once
+
+#include "VecSim/spaces/space_includes.h"
+#include <armpl.h>
+
+// Inner-product distance (1 - <v1, v2>) between two FP32 vectors of `dimension`
+// elements; the dot product is delegated to ArmPL's cblas_sdot, which takes the
+// length as an int. `inline` keeps this header-defined function safe to include
+// from more than one translation unit.
+inline float FP32_InnerProduct_ARMPL_SVE2(const void *pVect1v, const void *pVect2v,
+                                          size_t dimension) {
+    const auto *vec1 = static_cast<const float *>(pVect1v);
+    const auto *vec2 = static_cast<const float *>(pVect2v);
+
+    // Notice: Armpl can choose different implementation based on cpu features.
+    float res = cblas_sdot(static_cast<int>(dimension), vec1, 1, vec2, 1);
+    return 1.0f - res;
+}
diff --git a/src/VecSim/spaces/IP/IP_ARMPL_SVE_FP32.h b/src/VecSim/spaces/IP/IP_ARMPL_SVE_FP32.h
new file mode 100644
index 000000000..40641a72a
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_ARMPL_SVE_FP32.h
@@ -0,0 +1,24 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#pragma once
+
+#include "VecSim/spaces/space_includes.h"
+#include <armpl.h>
+
+// Inner-product distance (1 - <v1, v2>) between two FP32 vectors of `dimension`
+// elements; the dot product is delegated to ArmPL's cblas_sdot, which takes the
+// length as an int. `inline` keeps this header-defined function safe to include
+// from more than one translation unit.
+inline float FP32_InnerProduct_ARMPL_SVE(const void *pVect1v, const void *pVect2v,
+                                         size_t dimension) {
+    const auto *vec1 = static_cast<const float *>(pVect1v);
+    const auto *vec2 = static_cast<const float *>(pVect2v);
+
+    // Notice: Armpl can choose different implementation based on cpu features.
+    float res = cblas_sdot(static_cast<int>(dimension), vec1, 1, vec2, 1);
+    return 1.0f - res;
+}
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index 50f015e41..07009ce39 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -19,6 +19,9 @@
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
 #include "VecSim/spaces/functions/SSE3.h"
+#include "VecSim/spaces/functions/ARMPL_NEON.h"
+#include "VecSim/spaces/functions/ARMPL_SVE.h"
+#include "VecSim/spaces/functions/ARMPL_SVE2.h"
 
 using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
@@ -35,6 +38,30 @@ dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment, con
     if (dim < 16) {
         return ret_dist_func;
     }
+
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    auto features = (arch_opt == nullptr)
+                        ? cpu_features::GetAarch64Info().features
+                        : *static_cast<const cpu_features::Aarch64Features *>(arch_opt);
+
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_FP32_IP_implementation_ARMPL_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_FP32_IP_implementation_ARMPL_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON
+    if (features.asimd) {
+        return Choose_FP32_IP_implementation_ARMPL_NEON(dim);
+    }
+#endif
+
+#endif
+
 #ifdef CPU_FEATURES_ARCH_X86_64
     auto features = (arch_opt == nullptr)
                         ? cpu_features::GetX86Info().features
diff --git a/src/VecSim/spaces/L2/L2_ARMPL_NEON_FP32.h b/src/VecSim/spaces/L2/L2_ARMPL_NEON_FP32.h
new file mode 100644
index 000000000..36bd2823a
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_ARMPL_NEON_FP32.h
@@ -0,0 +1,39 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#pragma once
+
+#include "VecSim/spaces/space_includes.h"
+#include <armpl.h>
+
+// Squared L2 distance between two FP32 vectors of `dimension` elements.
+// The difference vector is materialised in a fixed-size stack buffer, one chunk of
+// at most 1024 elements at a time, and each chunk's sum of squares is obtained from
+// ArmPL's cblas_sdot(diff, diff). Chunk results are accumulated in index order.
+inline float FP32_L2Sqr_ARMPL_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *vec1 = static_cast<const float *>(pVect1v);
+    const float *vec2 = static_cast<const float *>(pVect2v);
+
+    constexpr size_t blockSize = 1024;
+    float buffer[blockSize];
+    float result = 0.0f;
+
+    for (size_t offset = 0; offset < dimension; offset += blockSize) {
+        // Every chunk is full-sized except possibly the last one.
+        const size_t remaining = dimension - offset;
+        const size_t chunk = remaining < blockSize ? remaining : blockSize;
+
+        // Calculate difference vector for this chunk
+        for (size_t j = 0; j < chunk; j++) {
+            buffer[j] = vec1[offset + j] - vec2[offset + j];
+        }
+
+        // Use ARMPL to compute dot product; chunk <= blockSize, so the int cast is lossless.
+        result += cblas_sdot(static_cast<int>(chunk), buffer, 1, buffer, 1);
+    }
+
+    return result;
+}
diff --git a/src/VecSim/spaces/L2/L2_ARMPL_SVE2_FP32.h b/src/VecSim/spaces/L2/L2_ARMPL_SVE2_FP32.h
new file mode 100644
index 000000000..4c0a24226
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_ARMPL_SVE2_FP32.h
@@ -0,0 +1,39 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#pragma once
+
+#include "VecSim/spaces/space_includes.h"
+#include <armpl.h>
+
+// Squared L2 distance between two FP32 vectors of `dimension` elements.
+// The difference vector is materialised in a fixed-size stack buffer, one chunk of
+// at most 1024 elements at a time, and each chunk's sum of squares is obtained from
+// ArmPL's cblas_sdot(diff, diff). Chunk results are accumulated in index order.
+inline float FP32_L2Sqr_ARMPL_SVE2(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *vec1 = static_cast<const float *>(pVect1v);
+    const float *vec2 = static_cast<const float *>(pVect2v);
+
+    constexpr size_t blockSize = 1024;
+    float buffer[blockSize];
+    float result = 0.0f;
+
+    for (size_t offset = 0; offset < dimension; offset += blockSize) {
+        // Every chunk is full-sized except possibly the last one.
+        const size_t remaining = dimension - offset;
+        const size_t chunk = remaining < blockSize ? remaining : blockSize;
+
+        // Calculate difference vector for this chunk
+        for (size_t j = 0; j < chunk; j++) {
+            buffer[j] = vec1[offset + j] - vec2[offset + j];
+        }
+
+        // Use ARMPL to compute dot product; chunk <= blockSize, so the int cast is lossless.
+        result += cblas_sdot(static_cast<int>(chunk), buffer, 1, buffer, 1);
+    }
+
+    return result;
+}
diff --git a/src/VecSim/spaces/L2/L2_ARMPL_SVE_FP32.h b/src/VecSim/spaces/L2/L2_ARMPL_SVE_FP32.h
new file mode 100644
index 000000000..ecf38b4be
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_ARMPL_SVE_FP32.h
@@ -0,0 +1,39 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#pragma once
+
+#include "VecSim/spaces/space_includes.h"
+#include <armpl.h>
+
+// Squared L2 distance between two FP32 vectors of `dimension` elements.
+// The difference vector is materialised in a fixed-size stack buffer, one chunk of
+// at most 1024 elements at a time, and each chunk's sum of squares is obtained from
+// ArmPL's cblas_sdot(diff, diff). Chunk results are accumulated in index order.
+inline float FP32_L2Sqr_ARMPL_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float *vec1 = static_cast<const float *>(pVect1v);
+    const float *vec2 = static_cast<const float *>(pVect2v);
+
+    constexpr size_t blockSize = 1024;
+    float buffer[blockSize];
+    float result = 0.0f;
+
+    for (size_t offset = 0; offset < dimension; offset += blockSize) {
+        // Every chunk is full-sized except possibly the last one.
+        const size_t remaining = dimension - offset;
+        const size_t chunk = remaining < blockSize ? remaining : blockSize;
+
+        // Calculate difference vector for this chunk
+        for (size_t j = 0; j < chunk; j++) {
+            buffer[j] = vec1[offset + j] - vec2[offset + j];
+        }
+
+        // Use ARMPL to compute dot product; chunk <= blockSize, so the int cast is lossless.
+        result += cblas_sdot(static_cast<int>(chunk), buffer, 1, buffer, 1);
+    }
+
+    return result;
+}
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index ae02231c1..f41eed674 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -18,6 +18,9 @@
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
 #include "VecSim/spaces/functions/SSE3.h"
+#include "VecSim/spaces/functions/ARMPL_NEON.h"
+#include "VecSim/spaces/functions/ARMPL_SVE.h"
+#include "VecSim/spaces/functions/ARMPL_SVE2.h"
 
 using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
@@ -35,6 +38,28 @@ dist_func_t<float> L2_FP32_GetDistFunc(size_t dim, unsigned char *alignment, con
     if (dim < 16) {
         return ret_dist_func;
     }
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    auto features = (arch_opt == nullptr)
+                        ? cpu_features::GetAarch64Info().features
+                        : *static_cast<const cpu_features::Aarch64Features *>(arch_opt);
+
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_FP32_L2_implementation_ARMPL_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_FP32_L2_implementation_ARMPL_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON
+    if (features.asimd) {
+        return Choose_FP32_L2_implementation_ARMPL_NEON(dim);
+    }
+#endif
+#endif
+
 #ifdef CPU_FEATURES_ARCH_X86_64
     auto features = (arch_opt == nullptr)
                         ? cpu_features::GetX86Info().features
diff --git a/src/VecSim/spaces/functions/ARMPL_NEON.cpp b/src/VecSim/spaces/functions/ARMPL_NEON.cpp
new file mode 100644
index 000000000..f4e16f030
--- /dev/null
+++ b/src/VecSim/spaces/functions/ARMPL_NEON.cpp
@@ -0,0 +1,29 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "ARMPL_NEON.h"
+#include "VecSim/spaces/L2/L2_ARMPL_NEON_FP32.h"
+#include "VecSim/spaces/IP/IP_ARMPL_NEON_FP32.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+
+// Returns the ArmPL-backed FP32 inner-product distance function of the NEON build.
+// The same routine is returned for every dimension, so `dim` is intentionally unused.
+dist_func_t<float> Choose_FP32_IP_implementation_ARMPL_NEON(size_t /*dim*/) {
+    return FP32_InnerProduct_ARMPL_NEON;
+}
+
+// Returns the ArmPL-backed FP32 squared-L2 distance function of the NEON build.
+// The same routine is returned for every dimension, so `dim` is intentionally unused.
+dist_func_t<float> Choose_FP32_L2_implementation_ARMPL_NEON(size_t /*dim*/) {
+    return FP32_L2Sqr_ARMPL_NEON;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/ARMPL_NEON.h b/src/VecSim/spaces/functions/ARMPL_NEON.h
new file mode 100644
index 000000000..6be467d19
--- /dev/null
+++ b/src/VecSim/spaces/functions/ARMPL_NEON.h
@@ -0,0 +1,17 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#pragma once
+
+#include "VecSim/spaces/spaces.h"
+
+namespace spaces {
+
+dist_func_t<float> Choose_FP32_IP_implementation_ARMPL_NEON(size_t dim);
+
+dist_func_t<float> Choose_FP32_L2_implementation_ARMPL_NEON(size_t dim);
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/ARMPL_SVE.cpp b/src/VecSim/spaces/functions/ARMPL_SVE.cpp
new file mode 100644
index 000000000..74e16100f
--- /dev/null
+++ b/src/VecSim/spaces/functions/ARMPL_SVE.cpp
@@ -0,0 +1,30 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "ARMPL_SVE.h"
+
+#include "VecSim/spaces/L2/L2_ARMPL_SVE_FP32.h"
+#include "VecSim/spaces/IP/IP_ARMPL_SVE_FP32.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+
+// Returns the ArmPL-backed FP32 inner-product distance function of the SVE build.
+// The same routine is returned for every dimension, so `dim` is intentionally unused.
+dist_func_t<float> Choose_FP32_IP_implementation_ARMPL_SVE(size_t /*dim*/) {
+    return FP32_InnerProduct_ARMPL_SVE;
+}
+
+// Returns the ArmPL-backed FP32 squared-L2 distance function of the SVE build.
+// The same routine is returned for every dimension, so `dim` is intentionally unused.
+dist_func_t<float> Choose_FP32_L2_implementation_ARMPL_SVE(size_t /*dim*/) {
+    return FP32_L2Sqr_ARMPL_SVE;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/ARMPL_SVE.h b/src/VecSim/spaces/functions/ARMPL_SVE.h
new file mode 100644
index 000000000..a8d21fda0
--- /dev/null
+++ b/src/VecSim/spaces/functions/ARMPL_SVE.h
@@ -0,0 +1,17 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#pragma once
+
+#include "VecSim/spaces/spaces.h"
+
+namespace spaces {
+
+dist_func_t<float> Choose_FP32_IP_implementation_ARMPL_SVE(size_t dim);
+
+dist_func_t<float> Choose_FP32_L2_implementation_ARMPL_SVE(size_t dim);
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/ARMPL_SVE2.cpp b/src/VecSim/spaces/functions/ARMPL_SVE2.cpp
new file mode 100644
index 000000000..df0369469
--- /dev/null
+++ b/src/VecSim/spaces/functions/ARMPL_SVE2.cpp
@@ -0,0 +1,30 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "ARMPL_SVE2.h"
+
+#include "VecSim/spaces/L2/L2_ARMPL_SVE2_FP32.h"
+#include "VecSim/spaces/IP/IP_ARMPL_SVE2_FP32.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+
+// Returns the ArmPL-backed FP32 inner-product distance function of the SVE2 build.
+// The same routine is returned for every dimension, so `dim` is intentionally unused.
+dist_func_t<float> Choose_FP32_IP_implementation_ARMPL_SVE2(size_t /*dim*/) {
+    return FP32_InnerProduct_ARMPL_SVE2;
+}
+
+// Returns the ArmPL-backed FP32 squared-L2 distance function of the SVE2 build.
+// The same routine is returned for every dimension, so `dim` is intentionally unused.
+dist_func_t<float> Choose_FP32_L2_implementation_ARMPL_SVE2(size_t /*dim*/) {
+    return FP32_L2Sqr_ARMPL_SVE2;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/ARMPL_SVE2.h b/src/VecSim/spaces/functions/ARMPL_SVE2.h
new file mode 100644
index 000000000..d86c324a5
--- /dev/null
+++ b/src/VecSim/spaces/functions/ARMPL_SVE2.h
@@ -0,0 +1,17 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#pragma once
+
+#include "VecSim/spaces/spaces.h"
+
+namespace spaces {
+
+dist_func_t<float> Choose_FP32_IP_implementation_ARMPL_SVE2(size_t dim);
+
+dist_func_t<float> Choose_FP32_L2_implementation_ARMPL_SVE2(size_t dim);
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/space_includes.h b/src/VecSim/spaces/space_includes.h
index f320297bb..80bd518ea 100644
--- a/src/VecSim/spaces/space_includes.h
+++ b/src/VecSim/spaces/space_includes.h
@@ -12,6 +12,9 @@
 #ifdef CPU_FEATURES_ARCH_X86_64
 #include "cpuinfo_x86.h"
 #endif // CPU_FEATURES_ARCH_X86_64
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#include "cpuinfo_aarch64.h"
+#endif // CPU_FEATURES_ARCH_AARCH64
 
 #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE__)
 #if defined(__GNUC__)
diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt
index 7bf528de9..e16633d44 100644
--- a/tests/benchmark/CMakeLists.txt
+++ b/tests/benchmark/CMakeLists.txt
@@ -29,7 +29,6 @@ if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)")
 	message(STATUS "Enabling Arm Performance Libraries integration")
 	include(${root}/cmake/aarch64InstructionFlags.cmake)
 	add_compile_definitions(BENCHMARK_ARCH=arm64)
-
 else()
 	include(${root}/cmake/x86_64InstructionFlags.cmake)
 	add_compile_definitions(BENCHMARK_ARCH=x86_64)
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h
index 3906f3d16..08e31372a 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces.h
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h
@@ -24,6 +24,9 @@
 #include "VecSim/spaces/functions/F16C.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/SSE.h"
+#include "VecSim/spaces/functions/ARMPL_NEON.h"
+#include "VecSim/spaces/functions/ARMPL_SVE.h"
+#include "VecSim/spaces/functions/ARMPL_SVE2.h"
 #include "bm_macros.h"
 #include "bm_spaces_class.h"
 
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp
index 106b2abc8..7e2dca826 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp
@@ -7,6 +7,25 @@
 
 class BM_VecSimSpaces_FP32 : public BM_VecSimSpaces<float> {};
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features;
+
+// ArmPL FP32 L2/IP benchmark sets; each is compiled only when its OPT_* macro is defined.
+#ifdef OPT_NEON
+bool neon_supported = opt.asimd; // asimd is the feature flag read for NEON
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, ARMPL_NEON, 16, neon_supported);
+#endif // OPT_NEON
+#ifdef OPT_SVE
+bool sve_supported = opt.sve; // Check for SVE support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, ARMPL_SVE, 16, sve_supported);
+#endif // OPT_SVE
+// ARMPL SVE2 implementation
+#ifdef OPT_SVE2
+bool sve2_supported = opt.sve2; // Check for SVE2 support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, ARMPL_SVE2, 16, sve2_supported);
+#endif // OPT_SVE2
+#endif // CPU_FEATURES_ARCH_AARCH64
+
 #ifdef CPU_FEATURES_ARCH_X86_64
 cpu_features::X86Features opt = cpu_features::GetX86Info().features;
 
diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt
index 5d55b497e..df79a6427 100644
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -28,7 +28,12 @@ if(FP64_TESTS)
 	add_definitions(-DFP64_TESTS)
 endif()
 
-include(${root}/cmake/x86_64InstructionFlags.cmake)
+if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)") # same pattern as spaces/benchmark
+	message(STATUS "Enabling Arm Performance Libraries integration")
+	include(${root}/cmake/aarch64InstructionFlags.cmake)
+else()
+	include(${root}/cmake/x86_64InstructionFlags.cmake)
+endif()
 
 add_executable(test_hnsw ../utils/mock_thread_pool.cpp test_hnsw.cpp test_hnsw_multi.cpp test_hnsw_tiered.cpp unit_test_utils.cpp)
 add_executable(test_hnsw_parallel test_hnsw_parallel.cpp ../utils/mock_thread_pool.cpp unit_test_utils.cpp)
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 093f6b724..2cd36cc2c 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -27,11 +27,25 @@
 #include "VecSim/spaces/functions/AVX2.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/F16C.h"
+#include "VecSim/spaces/functions/ARMPL_NEON.h"
+#include "VecSim/spaces/functions/ARMPL_SVE.h"
+#include "VecSim/spaces/functions/ARMPL_SVE2.h"
 #include "tests_utils.h"
 
 using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
 
+inline auto getCpuOptimizationFeatures() {
+#if defined(CPU_FEATURES_ARCH_X86_64)
+    return cpu_features::GetX86Info().features;
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+    return cpu_features::GetAarch64Info().features;
+#else
+    // Fallback for other architectures. NOTE(review): confirm X86Info is declared in this case.
+    return cpu_features::X86Info().features;
+#endif
+}
+
 class SpacesTest : public ::testing::Test {
 
 protected:
@@ -380,12 +394,11 @@ TEST_F(SpacesTest, smallDimChooser) {
 
 // In this following tests we assume that compiler supports all X86 optimizations, so if we have
 // some hardware flag enabled, we check that the corresponding optimization function was chosen.
-#ifdef CPU_FEATURES_ARCH_X86_64
 
 class FP32SpacesOptimizationTest : public testing::TestWithParam<size_t> {};
 
 TEST_P(FP32SpacesOptimizationTest, FP32L2SqrTest) {
-    auto optimization = cpu_features::GetX86Info().features;
+    auto optimization = getCpuOptimizationFeatures();
     size_t dim = GetParam();
     float v[dim];
     float v2[dim];
@@ -401,6 +414,7 @@ TEST_P(FP32SpacesOptimizationTest, FP32L2SqrTest) {
 
     dist_func_t<float> arch_opt_func;
     float baseline = FP32_L2Sqr(v, v2, dim);
+// x86_64 optimizations (CPU_FEATURES_ARCH_X86_64); each block is guarded by its OPT_* macro.
 #ifdef OPT_AVX512F
     if (optimization.avx512f) {
         unsigned char alignment = 0;
@@ -437,6 +451,44 @@ TEST_P(FP32SpacesOptimizationTest, FP32L2SqrTest) {
         optimization.sse = 0;
     }
 #endif
+
+// CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_FP32_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP32_L2_implementation_ARMPL_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v, v2, dim)) << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment ARMPL_SVE2 with dim " << dim;
+        // Unset sve2 flag, so we'll choose the next option (SVE, NEON or default).
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_FP32_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP32_L2_implementation_ARMPL_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v, v2, dim)) << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment ARMPL_SVE with dim " << dim;
+        // Unset sve flag, so we'll choose the next option (NEON or default).
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_FP32_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP32_L2_implementation_ARMPL_NEON(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v, v2, dim)) << "NEON with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment ARMPL_NEON with dim " << dim;
+        optimization.asimd = 0;
+    }
+#endif
+
     unsigned char alignment = 0;
     arch_opt_func = L2_FP32_GetDistFunc(dim, &alignment, &optimization);
     ASSERT_EQ(arch_opt_func, FP32_L2Sqr) << "Unexpected distance function chosen for dim " << dim;
@@ -445,7 +495,7 @@ TEST_P(FP32SpacesOptimizationTest, FP32L2SqrTest) {
 }
 
 TEST_P(FP32SpacesOptimizationTest, FP32InnerProductTest) {
-    auto optimization = cpu_features::GetX86Info().features;
+    auto optimization = getCpuOptimizationFeatures();
     size_t dim = GetParam();
     float v[dim];
     float v2[dim];
@@ -461,6 +511,8 @@ TEST_P(FP32SpacesOptimizationTest, FP32InnerProductTest) {
 
     dist_func_t<float> arch_opt_func;
     float baseline = FP32_InnerProduct(v, v2, dim);
+
+// x86_64 optimizations (CPU_FEATURES_ARCH_X86_64); each block is guarded by its OPT_* macro.
 #ifdef OPT_AVX512F
     if (optimization.avx512f) {
         unsigned char alignment = 0;
@@ -494,6 +546,44 @@ TEST_P(FP32SpacesOptimizationTest, FP32InnerProductTest) {
         optimization.sse = 0;
     }
 #endif
+
+// CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_FP32_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP32_IP_implementation_ARMPL_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v, v2, dim)) << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment ARMPL_SVE2 with dim " << dim;
+        // Unset sve2 flag, so we'll choose the next option (SVE, NEON or default).
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_FP32_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP32_IP_implementation_ARMPL_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v, v2, dim)) << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment ARMPL_SVE with dim " << dim;
+        // Unset sve flag, so we'll choose the next option (NEON or default).
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_FP32_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP32_IP_implementation_ARMPL_NEON(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v, v2, dim)) << "NEON with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment ARMPL_NEON with dim " << dim;
+        optimization.asimd = 0;
+    }
+#endif
+
     unsigned char alignment = 0;
     arch_opt_func = IP_FP32_GetDistFunc(dim, &alignment, &optimization);
     ASSERT_EQ(arch_opt_func, FP32_InnerProduct)
@@ -508,7 +597,7 @@ INSTANTIATE_TEST_SUITE_P(FP32OptFuncs, FP32SpacesOptimizationTest,
 class FP64SpacesOptimizationTest : public testing::TestWithParam<size_t> {};
 
 TEST_P(FP64SpacesOptimizationTest, FP64L2SqrTest) {
-    auto optimization = cpu_features::GetX86Info().features;
+    auto optimization = getCpuOptimizationFeatures();
     size_t dim = GetParam();
     double v[dim];
     double v2[dim];
@@ -565,7 +654,7 @@ TEST_P(FP64SpacesOptimizationTest, FP64L2SqrTest) {
 }
 
 TEST_P(FP64SpacesOptimizationTest, FP64InnerProductTest) {
-    auto optimization = cpu_features::GetX86Info().features;
+    auto optimization = getCpuOptimizationFeatures();
     size_t dim = GetParam();
     double v[dim];
     double v2[dim];
@@ -628,7 +717,7 @@ INSTANTIATE_TEST_SUITE_P(FP64OptFuncs, FP64SpacesOptimizationTest,
 class BF16SpacesOptimizationTest : public testing::TestWithParam<size_t> {};
 
 TEST_P(BF16SpacesOptimizationTest, BF16InnerProductTest) {
-    auto optimization = cpu_features::GetX86Info().features;
+    auto optimization = getCpuOptimizationFeatures();
     size_t dim = GetParam();
     bfloat16 v[dim];
     bfloat16 v2[dim];
@@ -697,7 +786,7 @@ TEST_P(BF16SpacesOptimizationTest, BF16InnerProductTest) {
 }
 
 TEST_P(BF16SpacesOptimizationTest, BF16L2SqrTest) {
-    auto optimization = cpu_features::GetX86Info().features;
+    auto optimization = getCpuOptimizationFeatures();
     size_t dim = GetParam();
     bfloat16 v[dim];
     bfloat16 v2[dim];
@@ -760,7 +849,7 @@ INSTANTIATE_TEST_SUITE_P(BF16OptFuncs, BF16SpacesOptimizationTest,
 class FP16SpacesOptimizationTest : public testing::TestWithParam<size_t> {};
 
 TEST_P(FP16SpacesOptimizationTest, FP16InnerProductTest) {
-    auto optimization = cpu_features::GetX86Info().features;
+    auto optimization = getCpuOptimizationFeatures();
     size_t dim = GetParam();
     float16 v1[dim], v2[dim];
     float v1_fp32[dim], v2_fp32[dim];
@@ -780,6 +869,7 @@ TEST_P(FP16SpacesOptimizationTest, FP16InnerProductTest) {
     float baseline = FP16_InnerProduct(v1, v2, dim);
     ASSERT_EQ(baseline, FP32_InnerProduct(v1_fp32, v2_fp32, dim)) << "Baseline check " << dim;
     // Turn off advanced fp16 flags. They will be tested in the next test.
+#if defined(CPU_FEATURES_ARCH_X86_64)
     optimization.avx512_fp16 = optimization.avx512vl = 0;
 #ifdef OPT_AVX512F
     if (optimization.avx512f) {
@@ -802,6 +892,7 @@ TEST_P(FP16SpacesOptimizationTest, FP16InnerProductTest) {
         ASSERT_EQ(alignment, expected_alignment(256, dim)) << "F16C with dim " << dim;
         optimization.f16c = optimization.fma3 = optimization.avx = 0;
     }
+#endif
 #endif
     unsigned char alignment = 0;
     arch_opt_func = IP_FP16_GetDistFunc(dim, &alignment, &optimization);
@@ -812,7 +903,7 @@ TEST_P(FP16SpacesOptimizationTest, FP16InnerProductTest) {
 }
 
 TEST_P(FP16SpacesOptimizationTest, FP16L2SqrTest) {
-    auto optimization = cpu_features::GetX86Info().features;
+    auto optimization = getCpuOptimizationFeatures();
     size_t dim = GetParam();
     float16 v1[dim], v2[dim];
     float v1_fp32[dim], v2_fp32[dim];
@@ -831,6 +922,7 @@ TEST_P(FP16SpacesOptimizationTest, FP16L2SqrTest) {
     dist_func_t<float> arch_opt_func;
     float baseline = FP16_L2Sqr(v1, v2, dim);
     ASSERT_EQ(baseline, FP32_L2Sqr(v1_fp32, v2_fp32, dim)) << "Baseline check " << dim;
+#if defined(CPU_FEATURES_ARCH_X86_64)
     // Turn off advanced fp16 flags. They will be tested in the next test.
     optimization.avx512_fp16 = optimization.avx512vl = 0;
 #ifdef OPT_AVX512F
@@ -854,6 +946,7 @@ TEST_P(FP16SpacesOptimizationTest, FP16L2SqrTest) {
         ASSERT_EQ(alignment, expected_alignment(256, dim)) << "F16C with dim " << dim;
         optimization.f16c = optimization.fma3 = optimization.avx = 0;
     }
+#endif
 #endif
     unsigned char alignment = 0;
     arch_opt_func = L2_FP16_GetDistFunc(dim, &alignment, &optimization);
@@ -968,7 +1061,7 @@ INSTANTIATE_TEST_SUITE_P(, FP16SpacesOptimizationTestAdvanced,
 class INT8SpacesOptimizationTest : public testing::TestWithParam<size_t> {};
 
 TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) {
-    auto optimization = cpu_features::GetX86Info().features;
+    auto optimization = getCpuOptimizationFeatures();
     size_t dim = GetParam();
     int8_t v1[dim];
     int8_t v2[dim];
@@ -1004,7 +1097,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) {
 }
 
 TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) {
-    auto optimization = cpu_features::GetX86Info().features;
+    auto optimization = getCpuOptimizationFeatures();
     size_t dim = GetParam();
     int8_t v1[dim];
     int8_t v2[dim];
@@ -1041,7 +1134,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) {
 }
 
 TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) {
-    auto optimization = cpu_features::GetX86Info().features;
+    auto optimization = getCpuOptimizationFeatures();
     size_t dim = GetParam();
     int8_t v1[dim + sizeof(float)];
     int8_t v2[dim + sizeof(float)];
@@ -1082,7 +1175,7 @@ INSTANTIATE_TEST_SUITE_P(INT8OptFuncs, INT8SpacesOptimizationTest,
 class UINT8SpacesOptimizationTest : public testing::TestWithParam<size_t> {};
 
 TEST_P(UINT8SpacesOptimizationTest, UINT8L2SqrTest) {
-    auto optimization = cpu_features::GetX86Info().features;
+    auto optimization = getCpuOptimizationFeatures();
     size_t dim = GetParam();
     uint8_t v1[dim];
     uint8_t v2[dim];
@@ -1118,7 +1211,7 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8L2SqrTest) {
 }
 
 TEST_P(UINT8SpacesOptimizationTest, UINT8InnerProductTest) {
-    auto optimization = cpu_features::GetX86Info().features;
+    auto optimization = getCpuOptimizationFeatures();
     size_t dim = GetParam();
     uint8_t v1[dim];
     uint8_t v2[dim];
@@ -1155,7 +1248,7 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8InnerProductTest) {
 }
 
 TEST_P(UINT8SpacesOptimizationTest, UINT8CosineTest) {
-    auto optimization = cpu_features::GetX86Info().features;
+    auto optimization = getCpuOptimizationFeatures();
     size_t dim = GetParam();
     uint8_t v1[dim + sizeof(float)];
     uint8_t v2[dim + sizeof(float)];
@@ -1192,5 +1285,3 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8CosineTest) {
 
 INSTANTIATE_TEST_SUITE_P(UINT8OptFuncs, UINT8SpacesOptimizationTest,
                          testing::Range(32UL, 64 * 2UL + 1));
-
-#endif // CPU_FEATURES_ARCH_X86_64