diff --git a/.install/install_script.sh b/.install/install_script.sh index 69a871b31..2ca688a67 100755 --- a/.install/install_script.sh +++ b/.install/install_script.sh @@ -1,6 +1,7 @@ #!/bin/bash OS_TYPE=$(uname -s) +ARCH=$(uname -m) if [[ $OS_TYPE = 'Darwin' ]] then @@ -16,6 +17,23 @@ else OS=$(echo $OS | sed 's/[/ ]/_/g') # replace spaces and slashes with underscores fi echo $OS +if [[ $ARCH == 'aarch64' ]] +then + wget https://developer.arm.com/-/cdn-downloads/permalink/Arm-Performance-Libraries/Version_24.10/arm-performance-libraries_24.10_deb_gcc.tar + tar -xf arm-performance-libraries_24.10_deb_gcc.tar + sudo ./arm-performance-libraries_24.10_deb/arm-performance-libraries_24.10_deb.sh --accept --install-to armpl + sudo rm -rf arm-performance-libraries_24.10_deb_gcc.tar + # Linux aarch64: ArmPL (gcc build) is now installed under ./armpl +elif [[ $OS == 'macos' && $ARCH == 'arm64' ]] +then + wget https://developer.arm.com/-/cdn-downloads/permalink/Arm-Performance-Libraries/Version_24.10/arm-performance-libraries_24.10_macOS.tgz + tar zxvf arm-performance-libraries_24.10_macOS.tgz + hdiutil attach armpl_24.10_flang-new_clang_19.dmg + sudo /Volumes/armpl_24.10_flang-new_clang_19_installer/armpl_24.10_flang-new_clang_19_install.sh --install-to=$(pwd)/armpl -y + # Clean up + hdiutil detach /Volumes/armpl_24.10_flang-new_clang_19_installer + rm -f arm-performance-libraries_24.10_macOS.tgz +fi SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) diff --git a/cmake/aarch64InstructionFlags.cmake b/cmake/aarch64InstructionFlags.cmake index 7c1363262..1cf97ad4e 100644 --- a/cmake/aarch64InstructionFlags.cmake +++ b/cmake/aarch64InstructionFlags.cmake @@ -7,12 +7,13 @@ message(STATUS "Building for ARM aarch64") CHECK_CXX_COMPILER_FLAG("-march=armv7-a+neon" CXX_ARMV7_NEON) CHECK_CXX_COMPILER_FLAG("-march=armv8-a" CXX_ARMV8A) CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve" CXX_SVE) -CHECK_CXX_COMPILER_FLAG("-march=armv9-a+sve2" CXX_ARMV9) +CHECK_CXX_COMPILER_FLAG("-march=armv9-a+sve2" CXX_SVE2) + # Only use ARMv9 
if both compiler and CPU support it -if(CXX_ARMV9) +if(CXX_SVE2) message(STATUS "Using ARMv9.0-a with SVE2 (supported by CPU)") - add_compile_definitions(OPT_ARMV9) + add_compile_definitions(OPT_SVE2) endif() if (CXX_ARMV8A OR CXX_ARMV7_NEON) add_compile_definitions(OPT_NEON) diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt index 1fc9473b2..e11d17ac8 100644 --- a/src/VecSim/spaces/CMakeLists.txt +++ b/src/VecSim/spaces/CMakeLists.txt @@ -81,6 +81,58 @@ if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(x86_64)|(AMD64|amd64)|(^i.86$)") endif() endif() +if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)") + message(STATUS "Enabling Arm Performance Libraries integration") + include(${root}/cmake/aarch64InstructionFlags.cmake) + if(EXISTS "${root}/armpl/armpl_24.10_gcc") + set(ARMPL_DIR "${root}/armpl/armpl_24.10_gcc") + elseif(EXISTS "${root}/armpl/armpl_24.10_flang-new_clang_19") + set(ARMPL_DIR "${root}/armpl/armpl_24.10_flang-new_clang_19") + elseif(EXISTS "/opt/arm/armpl_24.10_gcc") + set(ARMPL_DIR "/opt/arm/armpl_24.10_gcc") + elseif(EXISTS "/opt/arm/armpl_24.10_flang-new_clang_19") + set(ARMPL_DIR "/opt/arm/armpl_24.10_flang-new_clang_19") + else() + # Default fallback path + set(ARMPL_DIR "${root}/armpl/armpl_24.10_gcc") + message(WARNING "None of the expected ARM Performance Libraries paths exist. 
Using default: ${ARMPL_DIR}") + endif() + + include_directories(${ARMPL_DIR}/include) + message(STATUS "$ENV{ARMPL_LIBRARIES} and ${ARMPL_DIR}/lib") + # Locate ArmPL (LP64 interface); ${ARMPL_DIR}/lib is searched in addition to CMake's default paths + find_library(ARMPL_LIB armpl_lp64 PATHS "${ARMPL_DIR}/lib") + + if(NOT ARMPL_LIB) + + message(FATAL_ERROR "Could not find ARM Performance Libraries (armpl_lp64) in ${ARMPL_DIR}/lib") + else() + message(STATUS "Found ARM Performance Libraries: ${ARMPL_LIB}") + endif() + + # Create different optimization implementations for ARM architecture + # ArmPL used for standard ARMv8-A with NEON + if (CXX_ARMV8A) + message("Building with ARMV8A and ArmPL") + set_source_files_properties(functions/ARMPL_NEON.cpp PROPERTIES COMPILE_FLAGS "-march=armv8-a") + list(APPEND OPTIMIZATIONS functions/ARMPL_NEON.cpp) + endif() + + # ArmPL with SVE support + if (CXX_SVE) + message("Building with SVE and ArmPL") + set_source_files_properties(functions/ARMPL_SVE.cpp PROPERTIES COMPILE_FLAGS "-march=armv8-a+sve") + list(APPEND OPTIMIZATIONS functions/ARMPL_SVE.cpp) + endif() + + # ArmPL with SVE2 support + if (CXX_SVE2) + message("Building with ARMV9 and ArmPL") + set_source_files_properties(functions/ARMPL_SVE2.cpp PROPERTIES COMPILE_FLAGS "-march=armv9-a+sve2") + list(APPEND OPTIMIZATIONS functions/ARMPL_SVE2.cpp) + endif() +endif() + # Here we are compiling the space selectors with the relevant optimization flag. add_library(VectorSimilaritySpaces L2_space.cpp @@ -90,4 +142,4 @@ add_library(VectorSimilaritySpaces computer/preprocessor_container.cpp ) -target_link_libraries(VectorSimilaritySpaces VectorSimilaritySpaces_no_optimization cpu_features) +target_link_libraries(VectorSimilaritySpaces VectorSimilaritySpaces_no_optimization cpu_features ${ARMPL_LIB}) diff --git a/src/VecSim/spaces/IP/IP_ARMPL_NEON_FP32.h b/src/VecSim/spaces/IP/IP_ARMPL_NEON_FP32.h new file mode 100644 index 000000000..4ab901456 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_ARMPL_NEON_FP32.h @@ -0,0 +1,17 @@ +/* + *Copyright Redis Ltd. 
2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "VecSim/spaces/space_includes.h" +#include + +float FP32_InnerProduct_ARMPL_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { + auto *vec1 = (float *)pVect1v; + auto *vec2 = (float *)pVect2v; + + // Notice: Armpl can choose different implementation based on cpu features. + float res = cblas_sdot(static_cast(dimension), vec1, 1, vec2, 1); + return 1.0f - res; +} diff --git a/src/VecSim/spaces/IP/IP_ARMPL_SVE2_FP32.h b/src/VecSim/spaces/IP/IP_ARMPL_SVE2_FP32.h new file mode 100644 index 000000000..1014f791e --- /dev/null +++ b/src/VecSim/spaces/IP/IP_ARMPL_SVE2_FP32.h @@ -0,0 +1,17 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "VecSim/spaces/space_includes.h" +#include + +float FP32_InnerProduct_ARMPL_SVE2(const void *pVect1v, const void *pVect2v, size_t dimension) { + auto *vec1 = (float *)pVect1v; + auto *vec2 = (float *)pVect2v; + + // Notice: Armpl can choose different implementation based on cpu features. + float res = cblas_sdot(static_cast(dimension), vec1, 1, vec2, 1); + return 1.0f - res; +} diff --git a/src/VecSim/spaces/IP/IP_ARMPL_SVE_FP32.h b/src/VecSim/spaces/IP/IP_ARMPL_SVE_FP32.h new file mode 100644 index 000000000..40641a72a --- /dev/null +++ b/src/VecSim/spaces/IP/IP_ARMPL_SVE_FP32.h @@ -0,0 +1,17 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). 
+ */ + +#include "VecSim/spaces/space_includes.h" +#include + +float FP32_InnerProduct_ARMPL_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + auto *vec1 = (float *)pVect1v; + auto *vec2 = (float *)pVect2v; + + // Notice: Armpl can choose different implementation based on cpu features. + float res = cblas_sdot(static_cast(dimension), vec1, 1, vec2, 1); + return 1.0f - res; +} diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 50f015e41..07009ce39 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -19,6 +19,9 @@ #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/SSE3.h" +#include "VecSim/spaces/functions/ARMPL_NEON.h" +#include "VecSim/spaces/functions/ARMPL_SVE.h" +#include "VecSim/spaces/functions/ARMPL_SVE2.h" using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; @@ -35,6 +38,30 @@ dist_func_t IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment, con if (dim < 16) { return ret_dist_func; } + +#ifdef CPU_FEATURES_ARCH_AARCH64 + auto features = (arch_opt == nullptr) + ? cpu_features::GetAarch64Info().features + : *static_cast(arch_opt); + +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_FP32_IP_implementation_ARMPL_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_FP32_IP_implementation_ARMPL_SVE(dim); + } +#endif +#ifdef OPT_NEON + if (features.asimd) { + return Choose_FP32_IP_implementation_ARMPL_NEON(dim); + } +#endif + +#endif + #ifdef CPU_FEATURES_ARCH_X86_64 auto features = (arch_opt == nullptr) ? cpu_features::GetX86Info().features diff --git a/src/VecSim/spaces/L2/L2_ARMPL_NEON_FP32.h b/src/VecSim/spaces/L2/L2_ARMPL_NEON_FP32.h new file mode 100644 index 000000000..36bd2823a --- /dev/null +++ b/src/VecSim/spaces/L2/L2_ARMPL_NEON_FP32.h @@ -0,0 +1,49 @@ +/* + *Copyright Redis Ltd. 
2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "VecSim/spaces/space_includes.h" +#include + +float FP32_L2Sqr_ARMPL_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float *vec1 = static_cast(pVect1v); + const float *vec2 = static_cast(pVect2v); + + float result = 0.0f; + constexpr const size_t blockSize = 1024; + float buffer[blockSize]; + + // Pre-calculate number of full blocks and the size of the last partial block + const size_t fullBlockCount = dimension / blockSize; + const size_t lastBlockSize = dimension % blockSize; + + // Process full blocks + for (size_t i = 0; i < fullBlockCount; i++) { + size_t offset = i * blockSize; + + // Calculate difference vector for full block + for (size_t j = 0; j < blockSize; j++) { + buffer[j] = vec1[offset + j] - vec2[offset + j]; + } + + // Use ARMPL to compute dot product + result += cblas_sdot(blockSize, buffer, 1, buffer, 1); + } + + // Handle remaining elements (if any) + if (lastBlockSize > 0) { + size_t offset = fullBlockCount * blockSize; + + // Calculate difference vector for remaining elements + for (size_t j = 0; j < lastBlockSize; j++) { + buffer[j] = vec1[offset + j] - vec2[offset + j]; + } + + // Use ARMPL to compute dot product + result += cblas_sdot(lastBlockSize, buffer, 1, buffer, 1); + } + + return result; +} diff --git a/src/VecSim/spaces/L2/L2_ARMPL_SVE2_FP32.h b/src/VecSim/spaces/L2/L2_ARMPL_SVE2_FP32.h new file mode 100644 index 000000000..4c0a24226 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_ARMPL_SVE2_FP32.h @@ -0,0 +1,49 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). 
+ */ + +#include "VecSim/spaces/space_includes.h" +#include "armpl.h" + +float FP32_L2Sqr_ARMPL_SVE2(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float *vec1 = static_cast(pVect1v); + const float *vec2 = static_cast(pVect2v); + + float result = 0.0f; + constexpr const size_t blockSize = 1024; + float buffer[blockSize]; + + // Pre-calculate number of full blocks and the size of the last partial block + const size_t fullBlockCount = dimension / blockSize; + const size_t lastBlockSize = dimension % blockSize; + + // Process full blocks + for (size_t i = 0; i < fullBlockCount; i++) { + size_t offset = i * blockSize; + + // Calculate difference vector for full block + for (size_t j = 0; j < blockSize; j++) { + buffer[j] = vec1[offset + j] - vec2[offset + j]; + } + + // Use ARMPL to compute dot product + result += cblas_sdot(blockSize, buffer, 1, buffer, 1); + } + + // Handle remaining elements (if any) + if (lastBlockSize > 0) { + size_t offset = fullBlockCount * blockSize; + + // Calculate difference vector for remaining elements + for (size_t j = 0; j < lastBlockSize; j++) { + buffer[j] = vec1[offset + j] - vec2[offset + j]; + } + + // Use ARMPL to compute dot product + result += cblas_sdot(lastBlockSize, buffer, 1, buffer, 1); + } + + return result; +} diff --git a/src/VecSim/spaces/L2/L2_ARMPL_SVE_FP32.h b/src/VecSim/spaces/L2/L2_ARMPL_SVE_FP32.h new file mode 100644 index 000000000..ecf38b4be --- /dev/null +++ b/src/VecSim/spaces/L2/L2_ARMPL_SVE_FP32.h @@ -0,0 +1,49 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). 
+ */ + +#include "VecSim/spaces/space_includes.h" +#include "armpl.h" + +float FP32_L2Sqr_ARMPL_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float *vec1 = static_cast(pVect1v); + const float *vec2 = static_cast(pVect2v); + + float result = 0.0f; + constexpr const size_t blockSize = 1024; + float buffer[blockSize]; + + // Pre-calculate number of full blocks and the size of the last partial block + const size_t fullBlockCount = dimension / blockSize; + const size_t lastBlockSize = dimension % blockSize; + + // Process full blocks + for (size_t i = 0; i < fullBlockCount; i++) { + size_t offset = i * blockSize; + + // Calculate difference vector for full block + for (size_t j = 0; j < blockSize; j++) { + buffer[j] = vec1[offset + j] - vec2[offset + j]; + } + + // Use ARMPL to compute dot product + result += cblas_sdot(blockSize, buffer, 1, buffer, 1); + } + + // Handle remaining elements (if any) + if (lastBlockSize > 0) { + size_t offset = fullBlockCount * blockSize; + + // Calculate difference vector for remaining elements + for (size_t j = 0; j < lastBlockSize; j++) { + buffer[j] = vec1[offset + j] - vec2[offset + j]; + } + + // Use ARMPL to compute dot product + result += cblas_sdot(lastBlockSize, buffer, 1, buffer, 1); + } + + return result; +} diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index ae02231c1..f41eed674 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -18,6 +18,9 @@ #include "VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h" #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/SSE3.h" +#include "VecSim/spaces/functions/ARMPL_NEON.h" +#include "VecSim/spaces/functions/ARMPL_SVE.h" +#include "VecSim/spaces/functions/ARMPL_SVE2.h" using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; @@ -35,6 +38,28 @@ dist_func_t L2_FP32_GetDistFunc(size_t dim, unsigned char *alignment, con if (dim < 16) { return ret_dist_func; } 
+#ifdef CPU_FEATURES_ARCH_AARCH64 + auto features = (arch_opt == nullptr) + ? cpu_features::GetAarch64Info().features + : *static_cast(arch_opt); + +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_FP32_L2_implementation_ARMPL_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_FP32_L2_implementation_ARMPL_SVE(dim); + } +#endif +#ifdef OPT_NEON + if (features.asimd) { + return Choose_FP32_L2_implementation_ARMPL_NEON(dim); + } +#endif +#endif + #ifdef CPU_FEATURES_ARCH_X86_64 auto features = (arch_opt == nullptr) ? cpu_features::GetX86Info().features diff --git a/src/VecSim/spaces/functions/ARMPL_NEON.cpp b/src/VecSim/spaces/functions/ARMPL_NEON.cpp new file mode 100644 index 000000000..f4e16f030 --- /dev/null +++ b/src/VecSim/spaces/functions/ARMPL_NEON.cpp @@ -0,0 +1,25 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "ARMPL_NEON.h" +#include "VecSim/spaces/L2/L2_ARMPL_NEON_FP32.h" +#include "VecSim/spaces/IP/IP_ARMPL_NEON_FP32.h" + +namespace spaces { + +#include "implementation_chooser.h" + +dist_func_t Choose_FP32_IP_implementation_ARMPL_NEON(size_t dim) { + return FP32_InnerProduct_ARMPL_NEON; +} + +dist_func_t Choose_FP32_L2_implementation_ARMPL_NEON(size_t dim) { + return FP32_L2Sqr_ARMPL_NEON; +} + +#include "implementation_chooser_cleanup.h" + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/ARMPL_NEON.h b/src/VecSim/spaces/functions/ARMPL_NEON.h new file mode 100644 index 000000000..6be467d19 --- /dev/null +++ b/src/VecSim/spaces/functions/ARMPL_NEON.h @@ -0,0 +1,17 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). 
+ */ + +#pragma once + +#include "VecSim/spaces/spaces.h" + +namespace spaces { + +dist_func_t Choose_FP32_IP_implementation_ARMPL_NEON(size_t dim); + +dist_func_t Choose_FP32_L2_implementation_ARMPL_NEON(size_t dim); + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/ARMPL_SVE.cpp b/src/VecSim/spaces/functions/ARMPL_SVE.cpp new file mode 100644 index 000000000..74e16100f --- /dev/null +++ b/src/VecSim/spaces/functions/ARMPL_SVE.cpp @@ -0,0 +1,26 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "ARMPL_SVE.h" + +#include "VecSim/spaces/L2/L2_ARMPL_SVE_FP32.h" +#include "VecSim/spaces/IP/IP_ARMPL_SVE_FP32.h" + +namespace spaces { + +#include "implementation_chooser.h" + +dist_func_t Choose_FP32_IP_implementation_ARMPL_SVE(size_t dim) { + return FP32_InnerProduct_ARMPL_SVE; +} + +dist_func_t Choose_FP32_L2_implementation_ARMPL_SVE(size_t dim) { + return FP32_L2Sqr_ARMPL_SVE; +} + +#include "implementation_chooser_cleanup.h" + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/ARMPL_SVE.h b/src/VecSim/spaces/functions/ARMPL_SVE.h new file mode 100644 index 000000000..a8d21fda0 --- /dev/null +++ b/src/VecSim/spaces/functions/ARMPL_SVE.h @@ -0,0 +1,17 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). 
+ */ + +#pragma once + +#include "VecSim/spaces/spaces.h" + +namespace spaces { + +dist_func_t Choose_FP32_IP_implementation_ARMPL_SVE(size_t dim); + +dist_func_t Choose_FP32_L2_implementation_ARMPL_SVE(size_t dim); + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/ARMPL_SVE2.cpp b/src/VecSim/spaces/functions/ARMPL_SVE2.cpp new file mode 100644 index 000000000..df0369469 --- /dev/null +++ b/src/VecSim/spaces/functions/ARMPL_SVE2.cpp @@ -0,0 +1,26 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "ARMPL_SVE2.h" + +#include "VecSim/spaces/L2/L2_ARMPL_SVE2_FP32.h" +#include "VecSim/spaces/IP/IP_ARMPL_SVE2_FP32.h" + +namespace spaces { + +#include "implementation_chooser.h" + +dist_func_t Choose_FP32_IP_implementation_ARMPL_SVE2(size_t dim) { + return FP32_InnerProduct_ARMPL_SVE2; +} + +dist_func_t Choose_FP32_L2_implementation_ARMPL_SVE2(size_t dim) { + return FP32_L2Sqr_ARMPL_SVE2; +} + +#include "implementation_chooser_cleanup.h" + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/ARMPL_SVE2.h b/src/VecSim/spaces/functions/ARMPL_SVE2.h new file mode 100644 index 000000000..d86c324a5 --- /dev/null +++ b/src/VecSim/spaces/functions/ARMPL_SVE2.h @@ -0,0 +1,17 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). 
+ */ + +#pragma once + +#include "VecSim/spaces/spaces.h" + +namespace spaces { + +dist_func_t Choose_FP32_IP_implementation_ARMPL_SVE2(size_t dim); + +dist_func_t Choose_FP32_L2_implementation_ARMPL_SVE2(size_t dim); + +} // namespace spaces diff --git a/src/VecSim/spaces/space_includes.h b/src/VecSim/spaces/space_includes.h index f320297bb..80bd518ea 100644 --- a/src/VecSim/spaces/space_includes.h +++ b/src/VecSim/spaces/space_includes.h @@ -12,6 +12,9 @@ #ifdef CPU_FEATURES_ARCH_X86_64 #include "cpuinfo_x86.h" #endif // CPU_FEATURES_ARCH_X86_64 +#ifdef CPU_FEATURES_ARCH_AARCH64 +#include "cpuinfo_aarch64.h" +#endif // CPU_FEATURES_ARCH_AARCH64 #if defined(__AVX512F__) || defined(__AVX__) || defined(__SSE__) #if defined(__GNUC__) diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt index 7bf528de9..e16633d44 100644 --- a/tests/benchmark/CMakeLists.txt +++ b/tests/benchmark/CMakeLists.txt @@ -29,7 +29,6 @@ if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)") message(STATUS "Enabling Arm Performance Libraries integration") include(${root}/cmake/aarch64InstructionFlags.cmake) add_compile_definitions(BENCHMARK_ARCH=arm64) - else() include(${root}/cmake/x86_64InstructionFlags.cmake) add_compile_definitions(BENCHMARK_ARCH=x86_64) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h index 3906f3d16..08e31372a 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces.h +++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h @@ -24,6 +24,9 @@ #include "VecSim/spaces/functions/F16C.h" #include "VecSim/spaces/functions/SSE3.h" #include "VecSim/spaces/functions/SSE.h" +#include "VecSim/spaces/functions/ARMPL_NEON.h" +#include "VecSim/spaces/functions/ARMPL_SVE.h" +#include "VecSim/spaces/functions/ARMPL_SVE2.h" #include "bm_macros.h" #include "bm_spaces_class.h" diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp 
b/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp index 106b2abc8..7e2dca826 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp32.cpp @@ -7,6 +7,25 @@ class BM_VecSimSpaces_FP32 : public BM_VecSimSpaces {}; +#ifdef CPU_FEATURES_ARCH_AARCH64 +cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features; + +// ARMPL NEON implementation for ARMv8-a +#ifdef OPT_NEON +bool neon_supported = opt.asimd; // ARMv8-a always supports NEON +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, ARMPL_NEON, 16, neon_supported); +#endif +#ifdef OPT_SVE +bool sve_supported = opt.sve; // Check for SVE support +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, ARMPL_SVE, 16, sve_supported); +#endif +// ARMPL SVE2 implementation +#ifdef OPT_SVE2 +bool sve2_supported = opt.sve2; // Check for SVE2 support +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP32, FP32, ARMPL_SVE2, 16, sve2_supported); +#endif +#endif // AARCH64 + #ifdef CPU_FEATURES_ARCH_X86_64 cpu_features::X86Features opt = cpu_features::GetX86Info().features; diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 5d55b497e..df79a6427 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -28,7 +28,12 @@ if(FP64_TESTS) add_definitions(-DFP64_TESTS) endif() -include(${root}/cmake/x86_64InstructionFlags.cmake) +if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)") + message(STATUS "Enabling Arm Performance Libraries integration") + include(${root}/cmake/aarch64InstructionFlags.cmake) +else() + include(${root}/cmake/x86_64InstructionFlags.cmake) +endif() add_executable(test_hnsw ../utils/mock_thread_pool.cpp test_hnsw.cpp test_hnsw_multi.cpp test_hnsw_tiered.cpp unit_test_utils.cpp) add_executable(test_hnsw_parallel test_hnsw_parallel.cpp ../utils/mock_thread_pool.cpp unit_test_utils.cpp) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp 
index 093f6b724..2cd36cc2c 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -27,11 +27,25 @@ #include "VecSim/spaces/functions/AVX2.h" #include "VecSim/spaces/functions/SSE3.h" #include "VecSim/spaces/functions/F16C.h" +#include "VecSim/spaces/functions/ARMPL_NEON.h" +#include "VecSim/spaces/functions/ARMPL_SVE.h" +#include "VecSim/spaces/functions/ARMPL_SVE2.h" #include "tests_utils.h" using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; +inline auto getCpuOptimizationFeatures() { +#if defined(CPU_FEATURES_ARCH_X86_64) + return cpu_features::GetX86Info().features; +#elif defined(CPU_FEATURES_ARCH_AARCH64) + return cpu_features::GetAarch64Info().features; +#else + // cpuinfo_x86.h is only included on x86_64, so X86Info cannot serve as a fallback here +#error "getCpuOptimizationFeatures: unsupported architecture (expected x86_64 or aarch64)" +#endif +} + class SpacesTest : public ::testing::Test { protected: @@ -380,12 +394,11 @@ TEST_F(SpacesTest, smallDimChooser) { // In this following tests we assume that compiler supports all X86 optimizations, so if we have // some hardware flag enabled, we check that the corresponding optimization function was chosen. 
-#ifdef CPU_FEATURES_ARCH_X86_64 class FP32SpacesOptimizationTest : public testing::TestWithParam {}; TEST_P(FP32SpacesOptimizationTest, FP32L2SqrTest) { - auto optimization = cpu_features::GetX86Info().features; + auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); float v[dim]; float v2[dim]; @@ -401,6 +414,7 @@ TEST_P(FP32SpacesOptimizationTest, FP32L2SqrTest) { dist_func_t arch_opt_func; float baseline = FP32_L2Sqr(v, v2, dim); +// CPU_FEATURES_ARCH_X86_64 #ifdef OPT_AVX512F if (optimization.avx512f) { unsigned char alignment = 0; @@ -437,6 +451,42 @@ TEST_P(FP32SpacesOptimizationTest, FP32L2SqrTest) { optimization.sse = 0; } #endif + +// CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = L2_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_FP32_L2_implementation_ARMPL_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment ARMPL_SVE2 with dim " << dim; + // Unset sve2 flag as well, so we'll choose the next option (default). + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = L2_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_FP32_L2_implementation_ARMPL_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v, v2, dim)) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment ARMPL_SVE with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). 
+ optimization.sve = 0; + } +#endif +#ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + arch_opt_func = L2_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_FP32_L2_implementation_ARMPL_NEON(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment ARMPL_NEON with dim " << dim; + optimization.asimd = 0; + } +#endif + unsigned char alignment = 0; arch_opt_func = L2_FP32_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, FP32_L2Sqr) << "Unexpected distance function chosen for dim " << dim; @@ -445,7 +495,7 @@ TEST_P(FP32SpacesOptimizationTest, FP32L2SqrTest) { } TEST_P(FP32SpacesOptimizationTest, FP32InnerProductTest) { - auto optimization = cpu_features::GetX86Info().features; + auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); float v[dim]; float v2[dim]; @@ -461,6 +511,8 @@ TEST_P(FP32SpacesOptimizationTest, FP32InnerProductTest) { dist_func_t arch_opt_func; float baseline = FP32_InnerProduct(v, v2, dim); + +// CPU_FEATURES_ARCH_X86_64 #ifdef OPT_AVX512F if (optimization.avx512f) { unsigned char alignment = 0; @@ -494,6 +546,43 @@ TEST_P(FP32SpacesOptimizationTest, FP32InnerProductTest) { optimization.sse = 0; } #endif + +// CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = IP_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_FP32_IP_implementation_ARMPL_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v, v2, dim)) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment ARMPL_SVE2 with dim " << dim; + // Unset sve2 flag as well, so we'll choose the next option (default). 
+ optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = IP_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_FP32_IP_implementation_ARMPL_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v, v2, dim)) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment ARMPL_SVE with dim " << dim; + // Unset sve flag, so we'll choose the next option (NEON if available, otherwise default). + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + arch_opt_func = IP_FP32_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_FP32_IP_implementation_ARMPL_NEON(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment ARMPL_NEON with dim " << dim; + optimization.asimd = 0; + } +#endif + unsigned char alignment = 0; arch_opt_func = IP_FP32_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, FP32_InnerProduct) @@ -508,7 +597,7 @@ INSTANTIATE_TEST_SUITE_P(FP32OptFuncs, FP32SpacesOptimizationTest, class FP64SpacesOptimizationTest : public testing::TestWithParam {}; TEST_P(FP64SpacesOptimizationTest, FP64L2SqrTest) { - auto optimization = cpu_features::GetX86Info().features; + auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); double v[dim]; double v2[dim]; @@ -565,7 +654,7 @@ TEST_P(FP64SpacesOptimizationTest, FP64L2SqrTest) { } TEST_P(FP64SpacesOptimizationTest, FP64InnerProductTest) { - auto optimization = cpu_features::GetX86Info().features; + auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); double v[dim]; double v2[dim]; @@ -628,7 +717,7 @@ INSTANTIATE_TEST_SUITE_P(FP64OptFuncs, FP64SpacesOptimizationTest, class BF16SpacesOptimizationTest : public testing::TestWithParam {}; TEST_P(BF16SpacesOptimizationTest, 
BF16InnerProductTest) { - auto optimization = cpu_features::GetX86Info().features; + auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); bfloat16 v[dim]; bfloat16 v2[dim]; @@ -697,7 +786,7 @@ TEST_P(BF16SpacesOptimizationTest, BF16InnerProductTest) { } TEST_P(BF16SpacesOptimizationTest, BF16L2SqrTest) { - auto optimization = cpu_features::GetX86Info().features; + auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); bfloat16 v[dim]; bfloat16 v2[dim]; @@ -760,7 +849,7 @@ INSTANTIATE_TEST_SUITE_P(BF16OptFuncs, BF16SpacesOptimizationTest, class FP16SpacesOptimizationTest : public testing::TestWithParam {}; TEST_P(FP16SpacesOptimizationTest, FP16InnerProductTest) { - auto optimization = cpu_features::GetX86Info().features; + auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); float16 v1[dim], v2[dim]; float v1_fp32[dim], v2_fp32[dim]; @@ -780,6 +869,7 @@ TEST_P(FP16SpacesOptimizationTest, FP16InnerProductTest) { float baseline = FP16_InnerProduct(v1, v2, dim); ASSERT_EQ(baseline, FP32_InnerProduct(v1_fp32, v2_fp32, dim)) << "Baseline check " << dim; // Turn off advanced fp16 flags. They will be tested in the next test. 
+#if defined(CPU_FEATURES_ARCH_X86_64) optimization.avx512_fp16 = optimization.avx512vl = 0; #ifdef OPT_AVX512F if (optimization.avx512f) { @@ -802,6 +892,7 @@ TEST_P(FP16SpacesOptimizationTest, FP16InnerProductTest) { ASSERT_EQ(alignment, expected_alignment(256, dim)) << "F16C with dim " << dim; optimization.f16c = optimization.fma3 = optimization.avx = 0; } +#endif #endif unsigned char alignment = 0; arch_opt_func = IP_FP16_GetDistFunc(dim, &alignment, &optimization); @@ -812,7 +903,7 @@ TEST_P(FP16SpacesOptimizationTest, FP16InnerProductTest) { } TEST_P(FP16SpacesOptimizationTest, FP16L2SqrTest) { - auto optimization = cpu_features::GetX86Info().features; + auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); float16 v1[dim], v2[dim]; float v1_fp32[dim], v2_fp32[dim]; @@ -831,6 +922,7 @@ TEST_P(FP16SpacesOptimizationTest, FP16L2SqrTest) { dist_func_t arch_opt_func; float baseline = FP16_L2Sqr(v1, v2, dim); ASSERT_EQ(baseline, FP32_L2Sqr(v1_fp32, v2_fp32, dim)) << "Baseline check " << dim; +#if defined(CPU_FEATURES_ARCH_X86_64) // Turn off advanced fp16 flags. They will be tested in the next test. 
optimization.avx512_fp16 = optimization.avx512vl = 0; #ifdef OPT_AVX512F @@ -854,6 +946,7 @@ TEST_P(FP16SpacesOptimizationTest, FP16L2SqrTest) { ASSERT_EQ(alignment, expected_alignment(256, dim)) << "F16C with dim " << dim; optimization.f16c = optimization.fma3 = optimization.avx = 0; } +#endif #endif unsigned char alignment = 0; arch_opt_func = L2_FP16_GetDistFunc(dim, &alignment, &optimization); @@ -968,7 +1061,7 @@ INSTANTIATE_TEST_SUITE_P(, FP16SpacesOptimizationTestAdvanced, class INT8SpacesOptimizationTest : public testing::TestWithParam {}; TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { - auto optimization = cpu_features::GetX86Info().features; + auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); int8_t v1[dim]; int8_t v2[dim]; @@ -1004,7 +1097,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) { } TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) { - auto optimization = cpu_features::GetX86Info().features; + auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); int8_t v1[dim]; int8_t v2[dim]; @@ -1041,7 +1134,7 @@ TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) { } TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) { - auto optimization = cpu_features::GetX86Info().features; + auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); int8_t v1[dim + sizeof(float)]; int8_t v2[dim + sizeof(float)]; @@ -1082,7 +1175,7 @@ INSTANTIATE_TEST_SUITE_P(INT8OptFuncs, INT8SpacesOptimizationTest, class UINT8SpacesOptimizationTest : public testing::TestWithParam {}; TEST_P(UINT8SpacesOptimizationTest, UINT8L2SqrTest) { - auto optimization = cpu_features::GetX86Info().features; + auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); uint8_t v1[dim]; uint8_t v2[dim]; @@ -1118,7 +1211,7 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8L2SqrTest) { } TEST_P(UINT8SpacesOptimizationTest, UINT8InnerProductTest) { - auto optimization = cpu_features::GetX86Info().features; + 
auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); uint8_t v1[dim]; uint8_t v2[dim]; @@ -1155,7 +1248,7 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8InnerProductTest) { } TEST_P(UINT8SpacesOptimizationTest, UINT8CosineTest) { - auto optimization = cpu_features::GetX86Info().features; + auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); uint8_t v1[dim + sizeof(float)]; uint8_t v2[dim + sizeof(float)]; @@ -1192,5 +1285,3 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8CosineTest) { INSTANTIATE_TEST_SUITE_P(UINT8OptFuncs, UINT8SpacesOptimizationTest, testing::Range(32UL, 64 * 2UL + 1)); - -#endif // CPU_FEATURES_ARCH_X86_64