RedisAI · dor-forer · Feb 19, 2025 · Feb 20, 2025 · Feb 20, 2025 · Feb 20, 2025
diff --git a/.install/install_script.sh b/.install/install_script.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 
 OS_TYPE=$(uname -s)
+ARCH=$(uname -m)
 
 if [[ $OS_TYPE = 'Darwin' ]]
 then
@@ -16,6 +17,23 @@ else
     OS=$(echo $OS | sed 's/[/ ]/_/g') # replace spaces and slashes with underscores
 fi
 echo $OS
+if [[ $ARCH == 'aarch64' ]]
+then
+    # aarch64 host: download the ArmPL 24.10 deb/gcc bundle and install it into ./armpl
+    wget https://developer.arm.com/-/cdn-downloads/permalink/Arm-Performance-Libraries/Version_24.10/arm-performance-libraries_24.10_deb_gcc.tar
+    tar -xf arm-performance-libraries_24.10_deb_gcc.tar
+    sudo ./arm-performance-libraries_24.10_deb/arm-performance-libraries_24.10_deb.sh --accept --install-to armpl
+    sudo rm -rf arm-performance-libraries_24.10_deb_gcc.tar arm-performance-libraries_24.10_deb
+elif [[ $OS == 'macos' && $ARCH == 'arm64' ]]
+then
+    wget https://developer.arm.com/-/cdn-downloads/permalink/Arm-Performance-Libraries/Version_24.10/arm-performance-libraries_24.10_macOS.tgz
+    tar zxvf arm-performance-libraries_24.10_macOS.tgz
+    hdiutil attach armpl_24.10_flang-new_clang_19.dmg
+    sudo /Volumes/armpl_24.10_flang-new_clang_19_installer/armpl_24.10_flang-new_clang_19_install.sh --install-to="$(pwd)/armpl" -y
+    # Clean up
+    hdiutil detach /Volumes/armpl_24.10_flang-new_clang_19_installer
+    rm -f arm-performance-libraries_24.10_macOS.tgz armpl_24.10_flang-new_clang_19.dmg
+fi
 
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 

diff --git a/cmake/aarch64InstructionFlags.cmake b/cmake/aarch64InstructionFlags.cmake
@@ -7,12 +7,13 @@ message(STATUS "Building for ARM aarch64")
 CHECK_CXX_COMPILER_FLAG("-march=armv7-a+neon" CXX_ARMV7_NEON)
 CHECK_CXX_COMPILER_FLAG("-march=armv8-a" CXX_ARMV8A)
 CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve" CXX_SVE)
-CHECK_CXX_COMPILER_FLAG("-march=armv9-a+sve2" CXX_ARMV9)
+CHECK_CXX_COMPILER_FLAG("-march=armv9-a+sve2" CXX_SVE2)
+
 
 # Only use ARMv9 if both compiler and CPU support it
-if(CXX_ARMV9)
+if(CXX_SVE2)
   message(STATUS "Using ARMv9.0-a with SVE2 (supported by CPU)")
-  add_compile_definitions(OPT_ARMV9)
+  add_compile_definitions(OPT_SVE2)
 endif()
 if (CXX_ARMV8A OR CXX_ARMV7_NEON)
   add_compile_definitions(OPT_NEON)

diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt
@@ -81,6 +81,58 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)")
 	endif()
 endif()
 
+# Arm hosts: locate Arm Performance Libraries (ArmPL) and register the ArmPL-backed kernels.
+if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)")
+	message(STATUS "Enabling Arm Performance Libraries integration")
+	include(${root}/cmake/aarch64InstructionFlags.cmake)
+	# Use the first ArmPL 24.10 install that exists; copies under ${root}/armpl win over /opt/arm.
+	if(EXISTS "${root}/armpl/armpl_24.10_gcc")
+		set(ARMPL_DIR "${root}/armpl/armpl_24.10_gcc")
+	elseif(EXISTS "${root}/armpl/armpl_24.10_flang-new_clang_19")
+		set(ARMPL_DIR "${root}/armpl/armpl_24.10_flang-new_clang_19")
+	elseif(EXISTS "/opt/arm/armpl_24.10_gcc")
+		set(ARMPL_DIR "/opt/arm/armpl_24.10_gcc")
+	elseif(EXISTS "/opt/arm/armpl_24.10_flang-new_clang_19")
+		set(ARMPL_DIR "/opt/arm/armpl_24.10_flang-new_clang_19")
+	else()
+		# Nothing found on disk: keep a default so the messages below name a concrete path.
+		set(ARMPL_DIR "${root}/armpl/armpl_24.10_gcc")
+		message(WARNING "None of the expected ARM Performance Libraries paths exist. Using default: ${ARMPL_DIR}")
+	endif()
+
+	# Locate the LP64 ArmPL library; ${ARMPL_DIR}/lib is searched in addition to CMake's default locations.
+	message(STATUS "Looking for ARM Performance Libraries in ${ARMPL_DIR}/lib")
+	find_library(ARMPL_LIB armpl_lp64 PATHS "${ARMPL_DIR}/lib")
+	if(NOT ARMPL_LIB)
+		# ArmPL is mandatory on this path: ${ARMPL_LIB} is linked into the spaces library, so fail early and clearly.
+		message(FATAL_ERROR "Could not find ARM Performance Libraries (armpl_lp64) under ${ARMPL_DIR}/lib")
+	endif()
+	message(STATUS "Found ARM Performance Libraries: ${ARMPL_LIB}")
+	include_directories("${ARMPL_DIR}/include")
+
+	# Create different optimization implementations for ARM architecture
+	# ArmPL used for standard ARMv8-A with NEON
+	if (CXX_ARMV8A)
+		message(STATUS "Building with ARMV8A and ArmPL")
+		set_source_files_properties(functions/ARMPL_NEON.cpp PROPERTIES COMPILE_FLAGS "-march=armv8-a")
+		list(APPEND OPTIMIZATIONS functions/ARMPL_NEON.cpp)
+	endif()
+
+	# ArmPL with SVE support
+	if (CXX_SVE)
+		message(STATUS "Building with SVE and ArmPL")
+		set_source_files_properties(functions/ARMPL_SVE.cpp PROPERTIES COMPILE_FLAGS "-march=armv8-a+sve")
+		list(APPEND OPTIMIZATIONS functions/ARMPL_SVE.cpp)
+	endif()
+
+	# ArmPL with SVE2 support
+	if (CXX_SVE2)
+		message(STATUS "Building with SVE2 and ArmPL")
+		set_source_files_properties(functions/ARMPL_SVE2.cpp PROPERTIES COMPILE_FLAGS "-march=armv9-a+sve2")
+		list(APPEND OPTIMIZATIONS functions/ARMPL_SVE2.cpp)
+	endif()
+endif()
+
 # Here we are compiling the space selectors with the relevant optimization flag.
 add_library(VectorSimilaritySpaces
 	L2_space.cpp
@@ -90,4 +142,4 @@ add_library(VectorSimilaritySpaces
 	computer/preprocessor_container.cpp
 )
 
-target_link_libraries(VectorSimilaritySpaces VectorSimilaritySpaces_no_optimization cpu_features)
+target_link_libraries(VectorSimilaritySpaces VectorSimilaritySpaces_no_optimization cpu_features ${ARMPL_LIB})
diff --git a/src/VecSim/spaces/IP/IP_ARMPL_NEON_FP32.h b/src/VecSim/spaces/IP/IP_ARMPL_NEON_FP32.h
@@ -0,0 +1,17 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "VecSim/spaces/space_includes.h"
+#include <armpl.h>
+
+// Inner-product distance for FP32 vectors: returns 1 - dot(v1, v2), with the dot from cblas_sdot.
+float FP32_InnerProduct_ARMPL_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *lhs = static_cast<const float *>(pVect1v);
+    const auto *rhs = static_cast<const float *>(pVect2v);
+    // Note: ArmPL may select a different implementation based on CPU features.
+    const float dot = cblas_sdot(static_cast<int>(dimension), lhs, 1, rhs, 1);
+    return 1.0f - dot;
+}
diff --git a/src/VecSim/spaces/IP/IP_ARMPL_SVE2_FP32.h b/src/VecSim/spaces/IP/IP_ARMPL_SVE2_FP32.h
@@ -0,0 +1,17 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "VecSim/spaces/space_includes.h"
+#include <armpl.h>
+
+// Inner-product distance for FP32 vectors: returns 1 - dot(v1, v2), with the dot from cblas_sdot.
+float FP32_InnerProduct_ARMPL_SVE2(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *lhs = static_cast<const float *>(pVect1v);
+    const auto *rhs = static_cast<const float *>(pVect2v);
+    // Note: ArmPL may select a different implementation based on CPU features.
+    const float dot = cblas_sdot(static_cast<int>(dimension), lhs, 1, rhs, 1);
+    return 1.0f - dot;
+}
diff --git a/src/VecSim/spaces/IP/IP_ARMPL_SVE_FP32.h b/src/VecSim/spaces/IP/IP_ARMPL_SVE_FP32.h
@@ -0,0 +1,17 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "VecSim/spaces/space_includes.h"
+#include <armpl.h>
+
+// Inner-product distance for FP32 vectors: returns 1 - dot(v1, v2), with the dot from cblas_sdot.
+float FP32_InnerProduct_ARMPL_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *lhs = static_cast<const float *>(pVect1v);
+    const auto *rhs = static_cast<const float *>(pVect2v);
+    // Note: ArmPL may select a different implementation based on CPU features.
+    const float dot = cblas_sdot(static_cast<int>(dimension), lhs, 1, rhs, 1);
+    return 1.0f - dot;
+}
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
@@ -19,6 +19,9 @@
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
 #include "VecSim/spaces/functions/SSE3.h"
+#include "VecSim/spaces/functions/ARMPL_NEON.h"
+#include "VecSim/spaces/functions/ARMPL_SVE.h"
+#include "VecSim/spaces/functions/ARMPL_SVE2.h"
 
 using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
@@ -35,6 +38,30 @@ dist_func_t<float> IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment, con
     if (dim < 16) {
         return ret_dist_func;
     }
+
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    auto features = (arch_opt == nullptr)
+                        ? cpu_features::GetAarch64Info().features
+                        : *static_cast<const cpu_features::Aarch64Features *>(arch_opt);
+
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_FP32_IP_implementation_ARMPL_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_FP32_IP_implementation_ARMPL_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON
+    if (features.asimd) {
+        return Choose_FP32_IP_implementation_ARMPL_NEON(dim);
+    }
+#endif
+
+#endif
+
 #ifdef CPU_FEATURES_ARCH_X86_64
     auto features = (arch_opt == nullptr)
                         ? cpu_features::GetX86Info().features

diff --git a/src/VecSim/spaces/L2/L2_ARMPL_NEON_FP32.h b/src/VecSim/spaces/L2/L2_ARMPL_NEON_FP32.h
@@ -0,0 +1,49 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "VecSim/spaces/space_includes.h"
+#include <armpl.h>
+
+// Squared L2 distance between two FP32 vectors of `dimension` elements, via ArmPL.
+//
+// The element-wise difference is staged in a fixed-size stack buffer one chunk at a
+// time, and each chunk's sum of squares is obtained by calling cblas_sdot on the
+// buffer with itself. Chunks are accumulated in order; the trailing partial chunk
+// (dimension % kChunk elements, if any) is added last.
+// Nothing is read or accumulated when dimension is 0.
+//
+// pVect1v, pVect2v: the input vectors, read as arrays of float.
+// dimension: number of elements read from each vector.
+// Returns the accumulated sum of squared differences.
+float FP32_L2Sqr_ARMPL_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *lhs = static_cast<const float *>(pVect1v);
+    const auto *rhs = static_cast<const float *>(pVect2v);
+
+    constexpr size_t kChunk = 1024;
+    float diff[kChunk];
+    float sum = 0.0f;
+
+    // Number of complete chunks, and the length of the trailing partial chunk.
+    const size_t wholeChunks = dimension / kChunk;
+    const size_t tailLen = dimension % kChunk;
+
+    // Chunk index `wholeChunks` is the tail; it is visited only when it is non-empty.
+    const size_t chunkCount = wholeChunks + (tailLen > 0 ? 1 : 0);
+    for (size_t chunk = 0; chunk < chunkCount; ++chunk) {
+        const size_t base = chunk * kChunk;
+        const size_t len = (chunk < wholeChunks) ? kChunk : tailLen;
+
+        // Stage the differences for this chunk.
+        for (size_t k = 0; k < len; ++k) {
+            diff[k] = lhs[base + k] - rhs[base + k];
+        }
+
+        // Dot product of the difference with itself gives this chunk's sum of squares.
+        sum += cblas_sdot(static_cast<int>(len), diff, 1, diff, 1);
+    }
+
+    return sum;
+}
diff --git a/src/VecSim/spaces/L2/L2_ARMPL_SVE2_FP32.h b/src/VecSim/spaces/L2/L2_ARMPL_SVE2_FP32.h
@@ -0,0 +1,49 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "VecSim/spaces/space_includes.h"
+#include "armpl.h"
+
+// Squared L2 distance between two FP32 vectors of `dimension` elements, via ArmPL.
+//
+// The element-wise difference is staged in a fixed-size stack buffer one chunk at a
+// time, and each chunk's sum of squares is obtained by calling cblas_sdot on the
+// buffer with itself. Chunks are accumulated in order; the trailing partial chunk
+// (dimension % kChunk elements, if any) is added last.
+// Nothing is read or accumulated when dimension is 0.
+//
+// pVect1v, pVect2v: the input vectors, read as arrays of float.
+// dimension: number of elements read from each vector.
+// Returns the accumulated sum of squared differences.
+float FP32_L2Sqr_ARMPL_SVE2(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *lhs = static_cast<const float *>(pVect1v);
+    const auto *rhs = static_cast<const float *>(pVect2v);
+
+    constexpr size_t kChunk = 1024;
+    float diff[kChunk];
+    float sum = 0.0f;
+
+    // Number of complete chunks, and the length of the trailing partial chunk.
+    const size_t wholeChunks = dimension / kChunk;
+    const size_t tailLen = dimension % kChunk;
+
+    // Chunk index `wholeChunks` is the tail; it is visited only when it is non-empty.
+    const size_t chunkCount = wholeChunks + (tailLen > 0 ? 1 : 0);
+    for (size_t chunk = 0; chunk < chunkCount; ++chunk) {
+        const size_t base = chunk * kChunk;
+        const size_t len = (chunk < wholeChunks) ? kChunk : tailLen;
+
+        // Stage the differences for this chunk.
+        for (size_t k = 0; k < len; ++k) {
+            diff[k] = lhs[base + k] - rhs[base + k];
+        }
+
+        // Dot product of the difference with itself gives this chunk's sum of squares.
+        sum += cblas_sdot(static_cast<int>(len), diff, 1, diff, 1);
+    }
+
+    return sum;
+}
diff --git a/src/VecSim/spaces/L2/L2_ARMPL_SVE_FP32.h b/src/VecSim/spaces/L2/L2_ARMPL_SVE_FP32.h
@@ -0,0 +1,49 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "VecSim/spaces/space_includes.h"
+#include "armpl.h"
+
+// Squared L2 distance between two FP32 vectors of `dimension` elements, via ArmPL.
+//
+// The element-wise difference is staged in a fixed-size stack buffer one chunk at a
+// time, and each chunk's sum of squares is obtained by calling cblas_sdot on the
+// buffer with itself. Chunks are accumulated in order; the trailing partial chunk
+// (dimension % kChunk elements, if any) is added last.
+// Nothing is read or accumulated when dimension is 0.
+//
+// pVect1v, pVect2v: the input vectors, read as arrays of float.
+// dimension: number of elements read from each vector.
+// Returns the accumulated sum of squared differences.
+float FP32_L2Sqr_ARMPL_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *lhs = static_cast<const float *>(pVect1v);
+    const auto *rhs = static_cast<const float *>(pVect2v);
+
+    constexpr size_t kChunk = 1024;
+    float diff[kChunk];
+    float sum = 0.0f;
+
+    // Number of complete chunks, and the length of the trailing partial chunk.
+    const size_t wholeChunks = dimension / kChunk;
+    const size_t tailLen = dimension % kChunk;
+
+    // Chunk index `wholeChunks` is the tail; it is visited only when it is non-empty.
+    const size_t chunkCount = wholeChunks + (tailLen > 0 ? 1 : 0);
+    for (size_t chunk = 0; chunk < chunkCount; ++chunk) {
+        const size_t base = chunk * kChunk;
+        const size_t len = (chunk < wholeChunks) ? kChunk : tailLen;
+
+        // Stage the differences for this chunk.
+        for (size_t k = 0; k < len; ++k) {
+            diff[k] = lhs[base + k] - rhs[base + k];
+        }
+
+        // Dot product of the difference with itself gives this chunk's sum of squares.
+        sum += cblas_sdot(static_cast<int>(len), diff, 1, diff, 1);
+    }
+
+    return sum;
+}
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
@@ -18,6 +18,9 @@
 #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h"
 #include "VecSim/spaces/functions/AVX2.h"
 #include "VecSim/spaces/functions/SSE3.h"
+#include "VecSim/spaces/functions/ARMPL_NEON.h"
+#include "VecSim/spaces/functions/ARMPL_SVE.h"
+#include "VecSim/spaces/functions/ARMPL_SVE2.h"
 
 using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
@@ -35,6 +38,28 @@ dist_func_t<float> L2_FP32_GetDistFunc(size_t dim, unsigned char *alignment, con
     if (dim < 16) {
         return ret_dist_func;
     }
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    auto features = (arch_opt == nullptr)
+                        ? cpu_features::GetAarch64Info().features
+                        : *static_cast<const cpu_features::Aarch64Features *>(arch_opt);
+
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_FP32_L2_implementation_ARMPL_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_FP32_L2_implementation_ARMPL_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON
+    if (features.asimd) {
+        return Choose_FP32_L2_implementation_ARMPL_NEON(dim);
+    }
+#endif
+#endif
+
 #ifdef CPU_FEATURES_ARCH_X86_64
     auto features = (arch_opt == nullptr)
                         ? cpu_features::GetX86Info().features