diff --git a/cmake/aarch64InstructionFlags.cmake b/cmake/aarch64InstructionFlags.cmake
index 9810485aa..7d6e72165 100644
--- a/cmake/aarch64InstructionFlags.cmake
+++ b/cmake/aarch64InstructionFlags.cmake
@@ -8,6 +8,7 @@
 CHECK_CXX_COMPILER_FLAG("-march=armv7-a+neon" CXX_ARMV7_NEON)
 CHECK_CXX_COMPILER_FLAG("-march=armv8-a" CXX_ARMV8A)
 CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve" CXX_SVE)
 CHECK_CXX_COMPILER_FLAG("-march=armv9-a+sve2" CXX_SVE2)
+CHECK_CXX_COMPILER_FLAG("-march=armv8.2-a+fp16fml" CXX_NEON_HP)
 CHECK_CXX_COMPILER_FLAG("-march=armv8.2-a+bf16" CXX_NEON_BF16)
 CHECK_CXX_COMPILER_FLAG("-march=armv8.2-a+sve+bf16" CXX_SVE_BF16)
@@ -17,7 +18,12 @@ if(CXX_SVE2)
     add_compile_definitions(OPT_SVE2)
 endif()
 if (CXX_ARMV8A OR CXX_ARMV7_NEON)
+    message(STATUS "Using ARMv8.0-a with NEON")
     add_compile_definitions(OPT_NEON)
+endif()
+if (CXX_NEON_HP)
+    message(STATUS "Using ARMv8.2-a with NEON half-precision extension")
+    add_compile_definitions(OPT_NEON_HP)
 endif()
 if (CXX_NEON_BF16)
     add_compile_definitions(OPT_NEON_BF16)
diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt
index f0e354265..2674e711d 100644
--- a/src/VecSim/spaces/CMakeLists.txt
+++ b/src/VecSim/spaces/CMakeLists.txt
@@ -91,6 +91,13 @@ if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)")
         list(APPEND OPTIMIZATIONS functions/NEON.cpp)
     endif()
 
+    # NEON half-precision support
+    if (CXX_NEON_HP AND CXX_ARMV8A)
+        message("Building with NEON + HP")
+        set_source_files_properties(functions/NEON_HP.cpp PROPERTIES COMPILE_FLAGS "-march=armv8.2-a+fp16fml")
+        list(APPEND OPTIMIZATIONS functions/NEON_HP.cpp)
+    endif()
+
     # NEON bfloat16 support
     if (CXX_NEON_BF16)
         message("Building with NEON + BF16")
diff --git a/src/VecSim/spaces/IP/IP_NEON_FP16.h b/src/VecSim/spaces/IP/IP_NEON_FP16.h
new file mode 100644
index 000000000..797b93ae6
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_NEON_FP16.h
@@ -0,0 +1,93 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include <arm_neon.h>
+
+inline void InnerProduct_Step(const float16_t *&vec1, const float16_t *&vec2, float16x8_t &acc) {
+    // Load half-precision vectors
+    float16x8_t v1 = vld1q_f16(vec1);
+    float16x8_t v2 = vld1q_f16(vec2);
+    vec1 += 8;
+    vec2 += 8;
+
+    // Multiply and accumulate
+    acc = vfmaq_f16(acc, v1, v2);
+}
+
+template <unsigned char residual> // 0..31
+float FP16_InnerProduct_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *vec1 = static_cast<const float16_t *>(pVect1v);
+    const auto *vec2 = static_cast<const float16_t *>(pVect2v);
+    const auto *const v1End = vec1 + dimension;
+    float16x8_t acc1 = vdupq_n_f16(0.0f);
+    float16x8_t acc2 = vdupq_n_f16(0.0f);
+    float16x8_t acc3 = vdupq_n_f16(0.0f);
+    float16x8_t acc4 = vdupq_n_f16(0.0f);
+
+    // First, handle the partial chunk residual
+    if constexpr (residual % 8) {
+        auto constexpr chunk_residual = residual % 8;
+        // TODO: special-case some residuals and benchmark whether it's faster
+        constexpr uint16x8_t mask = {
+            0xFFFF,
+            (chunk_residual >= 2) ? 0xFFFF : 0,
+            (chunk_residual >= 3) ? 0xFFFF : 0,
+            (chunk_residual >= 4) ? 0xFFFF : 0,
+            (chunk_residual >= 5) ? 0xFFFF : 0,
+            (chunk_residual >= 6) ? 0xFFFF : 0,
+            (chunk_residual >= 7) ? 0xFFFF : 0,
+            0,
+        };
+
+        // Load partial vectors
+        float16x8_t v1 = vld1q_f16(vec1);
+        float16x8_t v2 = vld1q_f16(vec2);
+
+        // Apply mask to both vectors
+        float16x8_t masked_v1 = vbslq_f16(mask, v1, acc1); // `acc1` should be all zeros here
+        float16x8_t masked_v2 = vbslq_f16(mask, v2, acc2); // `acc2` should be all zeros here
+
+        // Multiply and accumulate
+        acc1 = vfmaq_f16(acc1, masked_v1, masked_v2);
+
+        // Advance pointers
+        vec1 += chunk_residual;
+        vec2 += chunk_residual;
+    }
+
+    // Handle (residual - (residual % 8)) in chunks of 8 float16
+    if constexpr (residual >= 8)
+        InnerProduct_Step(vec1, vec2, acc2);
+    if constexpr (residual >= 16)
+        InnerProduct_Step(vec1, vec2, acc3);
+    if constexpr (residual >= 24)
+        InnerProduct_Step(vec1, vec2, acc4);
+
+    // Process the rest of the vectors (the full chunks part)
+    while (vec1 < v1End) {
+        // TODO: use `vld1q_f16_x4` for quad-loading?
+        InnerProduct_Step(vec1, vec2, acc1);
+        InnerProduct_Step(vec1, vec2, acc2);
+        InnerProduct_Step(vec1, vec2, acc3);
+        InnerProduct_Step(vec1, vec2, acc4);
+    }
+
+    // Accumulate accumulators
+    acc1 = vpaddq_f16(acc1, acc3);
+    acc2 = vpaddq_f16(acc2, acc4);
+    acc1 = vpaddq_f16(acc1, acc2);
+
+    // Horizontal sum of the accumulated values
+    float32x4_t sum_f32 = vcvt_f32_f16(vget_low_f16(acc1));
+    sum_f32 = vaddq_f32(sum_f32, vcvt_f32_f16(vget_high_f16(acc1)));
+
+    // Pairwise add to get horizontal sum
+    float32x2_t sum_2 = vadd_f32(vget_low_f32(sum_f32), vget_high_f32(sum_f32));
+    sum_2 = vpadd_f32(sum_2, sum_2);
+
+    // Extract result
+    return 1.0f - vget_lane_f32(sum_2, 0);
+}
diff --git a/src/VecSim/spaces/IP/IP_SVE_FP16.h b/src/VecSim/spaces/IP/IP_SVE_FP16.h
new file mode 100644
index 000000000..2ac849920
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_SVE_FP16.h
@@ -0,0 +1,72 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include <arm_sve.h>
+
+inline void InnerProduct_Step(const float16_t *vec1, const float16_t *vec2, svfloat16_t &acc,
+                              size_t &offset, const size_t chunk) {
+    svbool_t all = svptrue_b16();
+
+    // Load half-precision vectors.
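+    // `all` is an all-true (ptrue) predicate, so each load fills a whole
+    // vector-length register with svcnth() fp16 lanes.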
+    svfloat16_t v1 = svld1_f16(all, vec1 + offset);
+    svfloat16_t v2 = svld1_f16(all, vec2 + offset);
+    // Compute multiplications and add to the accumulator
+    acc = svmla_f16_x(all, acc, v1, v2);
+
+    // Move to next chunk
+    offset += chunk;
+}
+
+template <bool partial_chunk, unsigned char additional_steps> // [t/f, 0..3]
+float FP16_InnerProduct_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *vec1 = static_cast<const float16_t *>(pVect1v);
+    const auto *vec2 = static_cast<const float16_t *>(pVect2v);
+    const size_t chunk = svcnth(); // number of 16-bit elements in a register
+    svbool_t all = svptrue_b16();
+    svfloat16_t acc1 = svdup_f16(0.0f);
+    svfloat16_t acc2 = svdup_f16(0.0f);
+    svfloat16_t acc3 = svdup_f16(0.0f);
+    svfloat16_t acc4 = svdup_f16(0.0f);
+    size_t offset = 0;
+
+    // Process all full vectors
+    const size_t full_iterations = dimension / chunk / 4;
+    for (size_t iter = 0; iter < full_iterations; iter++) {
+        InnerProduct_Step(vec1, vec2, acc1, offset, chunk);
+        InnerProduct_Step(vec1, vec2, acc2, offset, chunk);
+        InnerProduct_Step(vec1, vec2, acc3, offset, chunk);
+        InnerProduct_Step(vec1, vec2, acc4, offset, chunk);
+    }
+
+    // Perform between 0 and 3 additional steps, according to `additional_steps` value
+    if constexpr (additional_steps >= 1)
+        InnerProduct_Step(vec1, vec2, acc1, offset, chunk);
+    if constexpr (additional_steps >= 2)
+        InnerProduct_Step(vec1, vec2, acc2, offset, chunk);
+    if constexpr (additional_steps >= 3)
+        InnerProduct_Step(vec1, vec2, acc3, offset, chunk);
+
+    // Handle the tail with the residual predicate
+    if constexpr (partial_chunk) {
+        svbool_t pg = svwhilelt_b16_u64(offset, dimension);
+
+        // Load half-precision vectors.
+        svfloat16_t v1 = svld1_f16(pg, vec1 + offset);
+        svfloat16_t v2 = svld1_f16(pg, vec2 + offset);
+        // Compute multiplications and add to the accumulator.
+        // use the existing value of `acc` for the inactive elements (by the `m` suffix)
+        acc4 = svmla_f16_m(pg, acc4, v1, v2);
+    }
+
+    // Accumulate accumulators
+    acc1 = svadd_f16_x(all, acc1, acc3);
+    acc2 = svadd_f16_x(all, acc2, acc4);
+    acc1 = svadd_f16_x(all, acc1, acc2);
+
+    // Reduce the accumulated sum.
+    float result = svaddv_f16(all, acc1);
+    return 1.0f - result;
+}
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index 1d23c587b..e930ee143 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -20,6 +20,7 @@
 #include "VecSim/spaces/functions/AVX2.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/NEON.h"
+#include "VecSim/spaces/functions/NEON_HP.h"
 #include "VecSim/spaces/functions/NEON_BF16.h"
 #include "VecSim/spaces/functions/SVE.h"
 #include "VecSim/spaces/functions/SVE_BF16.h"
@@ -214,14 +215,33 @@ dist_func_t<float> IP_FP16_GetDistFunc(size_t dim, unsigned char *alignment, con
     if (alignment == nullptr) {
         alignment = &dummy_alignment;
     }
+    auto features = getCpuOptimizationFeatures(arch_opt);
     dist_func_t<float> ret_dist_func = FP16_InnerProduct;
+
+#if defined(CPU_FEATURES_ARCH_AARCH64)
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_FP16_IP_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_FP16_IP_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (features.asimdhp && dim >= 8) { // Optimization assumes at least 8 16FPs (full chunk)
+        return Choose_FP16_IP_implementation_NEON_HP(dim);
+    }
+#endif
+#endif
+
+#if defined(CPU_FEATURES_ARCH_X86_64)
     // Optimizations assume at least 32 16FPs. If we have less, we use the naive implementation.
     if (dim < 32) {
         return ret_dist_func;
     }
-#ifdef CPU_FEATURES_ARCH_X86_64
-    auto features = getCpuOptimizationFeatures(arch_opt);
 #ifdef OPT_AVX512_FP16_VL
     // More details about the dimension limitation can be found in this PR's description:
     // https://github.com/RedisAI/VectorSimilarity/pull/477
diff --git a/src/VecSim/spaces/L2/L2_NEON_FP16.h b/src/VecSim/spaces/L2/L2_NEON_FP16.h
new file mode 100644
index 000000000..b2ad8c0ee
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_NEON_FP16.h
@@ -0,0 +1,97 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include <arm_neon.h>
+
+inline void L2Sqr_Step(const float16_t *&vec1, const float16_t *&vec2, float16x8_t &acc) {
+    // Load half-precision vectors
+    float16x8_t v1 = vld1q_f16(vec1);
+    float16x8_t v2 = vld1q_f16(vec2);
+    vec1 += 8;
+    vec2 += 8;
+
+    // Calculate differences
+    float16x8_t diff = vsubq_f16(v1, v2);
+    // Square and accumulate
+    acc = vfmaq_f16(acc, diff, diff);
+}
+
+template <unsigned char residual> // 0..31
+float FP16_L2Sqr_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *vec1 = static_cast<const float16_t *>(pVect1v);
+    const auto *vec2 = static_cast<const float16_t *>(pVect2v);
+    const auto *const v1End = vec1 + dimension;
+    float16x8_t acc1 = vdupq_n_f16(0.0f);
+    float16x8_t acc2 = vdupq_n_f16(0.0f);
+    float16x8_t acc3 = vdupq_n_f16(0.0f);
+    float16x8_t acc4 = vdupq_n_f16(0.0f);
+
+    // First, handle the partial chunk residual
+    if constexpr (residual % 8) {
+        auto constexpr chunk_residual = residual % 8;
+        // TODO: special-case some residuals and benchmark whether it's faster
+        constexpr uint16x8_t mask = {
+            0xFFFF,
+            (chunk_residual >= 2) ? 0xFFFF : 0,
+            (chunk_residual >= 3) ? 0xFFFF : 0,
+            (chunk_residual >= 4) ? 0xFFFF : 0,
+            (chunk_residual >= 5) ? 0xFFFF : 0,
+            (chunk_residual >= 6) ? 0xFFFF : 0,
+            (chunk_residual >= 7) ? 0xFFFF : 0,
+            0,
+        };
+
+        // Load partial vectors
+        float16x8_t v1 = vld1q_f16(vec1);
+        float16x8_t v2 = vld1q_f16(vec2);
+
+        // Apply mask to both vectors
+        float16x8_t masked_v1 = vbslq_f16(mask, v1, acc1); // `acc1` should be all zeros here
+        float16x8_t masked_v2 = vbslq_f16(mask, v2, acc2); // `acc2` should be all zeros here
+
+        // Calculate differences
+        float16x8_t diff = vsubq_f16(masked_v1, masked_v2);
+        // Square and accumulate
+        acc1 = vfmaq_f16(acc1, diff, diff);
+
+        // Advance pointers
+        vec1 += chunk_residual;
+        vec2 += chunk_residual;
+    }
+
+    // Handle (residual - (residual % 8)) in chunks of 8 float16
+    if constexpr (residual >= 8)
+        L2Sqr_Step(vec1, vec2, acc2);
+    if constexpr (residual >= 16)
+        L2Sqr_Step(vec1, vec2, acc3);
+    if constexpr (residual >= 24)
+        L2Sqr_Step(vec1, vec2, acc4);
+
+    // Process the rest of the vectors (the full chunks part)
+    while (vec1 < v1End) {
+        // TODO: use `vld1q_f16_x4` for quad-loading?
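+        // Four independent accumulators break the dependency chain on the FMA,
+        // so each loop iteration consumes 4 x 8 = 32 fp16 elements.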
+        L2Sqr_Step(vec1, vec2, acc1);
+        L2Sqr_Step(vec1, vec2, acc2);
+        L2Sqr_Step(vec1, vec2, acc3);
+        L2Sqr_Step(vec1, vec2, acc4);
+    }
+
+    // Accumulate accumulators
+    acc1 = vpaddq_f16(acc1, acc3);
+    acc2 = vpaddq_f16(acc2, acc4);
+    acc1 = vpaddq_f16(acc1, acc2);
+
+    // Horizontal sum of the accumulated values
+    float32x4_t sum_f32 = vcvt_f32_f16(vget_low_f16(acc1));
+    sum_f32 = vaddq_f32(sum_f32, vcvt_f32_f16(vget_high_f16(acc1)));
+
+    // Pairwise add to get horizontal sum
+    float32x2_t sum_2 = vadd_f32(vget_low_f32(sum_f32), vget_high_f32(sum_f32));
+    sum_2 = vpadd_f32(sum_2, sum_2);
+
+    // Extract result
+    return vget_lane_f32(sum_2, 0);
+}
diff --git a/src/VecSim/spaces/L2/L2_SVE_FP16.h b/src/VecSim/spaces/L2/L2_SVE_FP16.h
new file mode 100644
index 000000000..4ce288d73
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_SVE_FP16.h
@@ -0,0 +1,73 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include <arm_sve.h>
+
+inline void L2Sqr_Step(const float16_t *vec1, const float16_t *vec2, svfloat16_t &acc,
+                       size_t &offset, const size_t chunk) {
+    svbool_t all = svptrue_b16();
+
+    svfloat16_t v1 = svld1_f16(all, vec1 + offset);
+    svfloat16_t v2 = svld1_f16(all, vec2 + offset);
+    // Compute difference in half precision.
+    svfloat16_t diff = svsub_f16_x(all, v1, v2);
+    // Square the differences and accumulate
+    acc = svmla_f16_x(all, acc, diff, diff);
+    offset += chunk;
+}
+
+template <bool partial_chunk, unsigned char additional_steps> // [t/f, 0..3]
+float FP16_L2Sqr_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *vec1 = static_cast<const float16_t *>(pVect1v);
+    const auto *vec2 = static_cast<const float16_t *>(pVect2v);
+    const size_t chunk = svcnth(); // number of 16-bit elements in a register
+    svbool_t all = svptrue_b16();
+    svfloat16_t acc1 = svdup_f16(0.0f);
+    svfloat16_t acc2 = svdup_f16(0.0f);
+    svfloat16_t acc3 = svdup_f16(0.0f);
+    svfloat16_t acc4 = svdup_f16(0.0f);
+    size_t offset = 0;
+
+    // Process all full vectors
+    const size_t full_iterations = dimension / chunk / 4;
+    for (size_t iter = 0; iter < full_iterations; iter++) {
+        L2Sqr_Step(vec1, vec2, acc1, offset, chunk);
+        L2Sqr_Step(vec1, vec2, acc2, offset, chunk);
+        L2Sqr_Step(vec1, vec2, acc3, offset, chunk);
+        L2Sqr_Step(vec1, vec2, acc4, offset, chunk);
+    }
+
+    // Perform between 0 and 3 additional steps, according to `additional_steps` value
+    if constexpr (additional_steps >= 1)
+        L2Sqr_Step(vec1, vec2, acc1, offset, chunk);
+    if constexpr (additional_steps >= 2)
+        L2Sqr_Step(vec1, vec2, acc2, offset, chunk);
+    if constexpr (additional_steps >= 3)
+        L2Sqr_Step(vec1, vec2, acc3, offset, chunk);
+
+    // Handle partial chunk, if needed
+    if constexpr (partial_chunk) {
+        svbool_t pg = svwhilelt_b16_u64(offset, dimension);
+
+        // Load half-precision vectors.
+        svfloat16_t v1 = svld1_f16(pg, vec1 + offset);
+        svfloat16_t v2 = svld1_f16(pg, vec2 + offset);
+        // Compute difference in half precision.
+        svfloat16_t diff = svsub_f16_x(pg, v1, v2);
+        // Square the differences.
+        // Use `m` suffix to keep the inactive elements as they are in `acc`
+        acc4 = svmla_f16_m(pg, acc4, diff, diff);
+    }
+
+    // Accumulate accumulators
+    acc1 = svadd_f16_x(all, acc1, acc3);
+    acc2 = svadd_f16_x(all, acc2, acc4);
+    acc1 = svadd_f16_x(all, acc1, acc2);
+
+    // Reduce the accumulated sum.
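+    // svaddv_f16 horizontally adds all active fp16 lanes into a single scalar,
+    // which is then widened to float on assignment.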
+    float result = svaddv_f16(all, acc1);
+    return result;
+}
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index d1b5306f2..7acf2eb6d 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -19,6 +19,7 @@
 #include "VecSim/spaces/functions/AVX2.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/NEON.h"
+#include "VecSim/spaces/functions/NEON_HP.h"
 #include "VecSim/spaces/functions/NEON_BF16.h"
 #include "VecSim/spaces/functions/SVE.h"
 #include "VecSim/spaces/functions/SVE_BF16.h"
@@ -205,14 +206,33 @@ dist_func_t<float> L2_FP16_GetDistFunc(size_t dim, unsigned char *alignment, con
     if (alignment == nullptr) {
         alignment = &dummy_alignment;
     }
+    auto features = getCpuOptimizationFeatures(arch_opt);
     dist_func_t<float> ret_dist_func = FP16_L2Sqr;
+
+#if defined(CPU_FEATURES_ARCH_AARCH64)
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_FP16_L2_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_FP16_L2_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (features.asimdhp && dim >= 8) { // Optimization assumes at least 8 16FPs (full chunk)
+        return Choose_FP16_L2_implementation_NEON_HP(dim);
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
+
+#if defined(CPU_FEATURES_ARCH_X86_64)
     // Optimizations assume at least 32 16FPs. If we have less, we use the naive implementation.
     if (dim < 32) {
         return ret_dist_func;
     }
-#ifdef CPU_FEATURES_ARCH_X86_64
-    auto features = getCpuOptimizationFeatures(arch_opt);
 #ifdef OPT_AVX512_FP16_VL
     // More details about the dimension limitation can be found in this PR's description:
     // https://github.com/RedisAI/VectorSimilarity/pull/477
diff --git a/src/VecSim/spaces/functions/NEON_HP.cpp b/src/VecSim/spaces/functions/NEON_HP.cpp
new file mode 100644
index 000000000..aed96750e
--- /dev/null
+++ b/src/VecSim/spaces/functions/NEON_HP.cpp
@@ -0,0 +1,30 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "NEON_HP.h"
+
+#include "VecSim/spaces/L2/L2_NEON_FP16.h"
+#include "VecSim/spaces/IP/IP_NEON_FP16.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+
+dist_func_t<float> Choose_FP16_L2_implementation_NEON_HP(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, FP16_L2Sqr_NEON_HP);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_FP16_IP_implementation_NEON_HP(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, FP16_InnerProduct_NEON_HP);
+    return ret_dist_func;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/NEON_HP.h b/src/VecSim/spaces/functions/NEON_HP.h
new file mode 100644
index 000000000..594f653d3
--- /dev/null
+++ b/src/VecSim/spaces/functions/NEON_HP.h
@@ -0,0 +1,17 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */ + +#pragma once + +#include "VecSim/spaces/spaces.h" + +namespace spaces { + +dist_func_t Choose_FP16_IP_implementation_NEON_HP(size_t dim); + +dist_func_t Choose_FP16_L2_implementation_NEON_HP(size_t dim); + +} // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp index cadf622a8..e3ebc5209 100644 --- a/src/VecSim/spaces/functions/SVE.cpp +++ b/src/VecSim/spaces/functions/SVE.cpp @@ -5,9 +5,13 @@ */ #include "SVE.h" + #include "VecSim/spaces/L2/L2_SVE_FP32.h" #include "VecSim/spaces/IP/IP_SVE_FP32.h" +#include "VecSim/spaces/IP/IP_SVE_FP16.h" +#include "VecSim/spaces/L2/L2_SVE_FP16.h" + #include "VecSim/spaces/IP/IP_SVE_FP64.h" #include "VecSim/spaces/L2/L2_SVE_FP64.h" @@ -26,6 +30,17 @@ dist_func_t Choose_FP32_L2_implementation_SVE(size_t dim) { return ret_dist_func; } +dist_func_t Choose_FP16_IP_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, FP16_InnerProduct_SVE, dim, svcnth); + return ret_dist_func; +} +dist_func_t Choose_FP16_L2_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, FP16_L2Sqr_SVE, dim, svcnth); + return ret_dist_func; +} + dist_func_t Choose_FP64_IP_implementation_SVE(size_t dim) { dist_func_t ret_dist_func; CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, FP64_InnerProductSIMD_SVE, dim, svcntd); diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h index 6cc3d5bbf..35d1ffe4c 100644 --- a/src/VecSim/spaces/functions/SVE.h +++ b/src/VecSim/spaces/functions/SVE.h @@ -11,9 +11,12 @@ namespace spaces { dist_func_t Choose_FP32_IP_implementation_SVE(size_t dim); -dist_func_t Choose_FP64_IP_implementation_SVE(size_t dim); - dist_func_t Choose_FP32_L2_implementation_SVE(size_t dim); + +dist_func_t Choose_FP16_IP_implementation_SVE(size_t dim); +dist_func_t Choose_FP16_L2_implementation_SVE(size_t dim); + +dist_func_t Choose_FP64_IP_implementation_SVE(size_t dim); dist_func_t Choose_FP64_L2_implementation_SVE(size_t dim); } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp index 4256af788..c52c99da9 100644 --- a/src/VecSim/spaces/functions/SVE2.cpp +++ b/src/VecSim/spaces/functions/SVE2.cpp @@ -5,9 +5,13 @@ */ #include "SVE2.h" + #include "VecSim/spaces/L2/L2_SVE_FP32.h" #include "VecSim/spaces/IP/IP_SVE_FP32.h" +#include "VecSim/spaces/IP/IP_SVE_FP16.h" // using SVE implementation, but different compilation flags +#include "VecSim/spaces/L2/L2_SVE_FP16.h" // using SVE implementation, but different compilation flags + #include "VecSim/spaces/IP/IP_SVE_FP64.h" #include "VecSim/spaces/L2/L2_SVE_FP64.h" @@ -26,6 +30,17 @@ dist_func_t Choose_FP32_L2_implementation_SVE2(size_t dim) { return ret_dist_func; } +dist_func_t Choose_FP16_IP_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, FP16_InnerProduct_SVE, dim, svcnth); + return ret_dist_func; +} +dist_func_t Choose_FP16_L2_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, FP16_L2Sqr_SVE, dim, svcnth); + return ret_dist_func; +} + dist_func_t Choose_FP64_IP_implementation_SVE2(size_t dim) { dist_func_t ret_dist_func; CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, FP64_InnerProductSIMD_SVE, dim, svcntd); diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h index 84a4b4af8..67e36dc20 100644 --- a/src/VecSim/spaces/functions/SVE2.h +++ 
@@ -11,11 +11,12 @@ namespace spaces {
 
 dist_func_t<float> Choose_FP32_IP_implementation_SVE2(size_t dim);
-
 dist_func_t<float> Choose_FP32_L2_implementation_SVE2(size_t dim);
-dist_func_t<double> Choose_FP64_IP_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_FP16_IP_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_FP16_L2_implementation_SVE2(size_t dim);
+
+dist_func_t<double> Choose_FP64_IP_implementation_SVE2(size_t dim);
 dist_func_t<double> Choose_FP64_L2_implementation_SVE2(size_t dim);
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/implementation_chooser_cleanup.h b/src/VecSim/spaces/functions/implementation_chooser_cleanup.h
index f93438a11..eff414be3 100644
--- a/src/VecSim/spaces/functions/implementation_chooser_cleanup.h
+++ b/src/VecSim/spaces/functions/implementation_chooser_cleanup.h
@@ -23,4 +23,7 @@
 #undef CASES32
 #undef CASES64
 
+#undef SVE_CASE
+
 #undef CHOOSE_IMPLEMENTATION
+#undef CHOOSE_SVE_IMPLEMENTATION
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h
index d986d23f2..b14146d20 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces.h
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h
@@ -25,6 +25,7 @@
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/SSE.h"
 #include "VecSim/spaces/functions/NEON.h"
+#include "VecSim/spaces/functions/NEON_HP.h"
 #include "VecSim/spaces/functions/NEON_BF16.h"
 #include "VecSim/spaces/functions/SVE.h"
 #include "VecSim/spaces/functions/SVE_BF16.h"
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp
index 9457bc77d..096caefff 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_fp16.cpp
@@ -13,6 +13,25 @@ class BM_VecSimSpaces_FP16 : public BM_VecSimSpaces {
     }
 };
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features;
+
+// NEON half-precision implementation for ARMv8.2-a
+#ifdef OPT_NEON_HP
+bool neon_supported = opt.asimdhp; // NEON fp16 arithmetic requires the asimdhp extension
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16, FP16, NEON_HP, 32, neon_supported);
+#endif
+#ifdef OPT_SVE
+bool sve_supported = opt.sve; // Check for SVE support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16, FP16, SVE, 32, sve_supported);
+#endif
+// SVE2 implementation
+#ifdef OPT_SVE2
+bool sve2_supported = opt.sve2; // Check for SVE2 support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_FP16, FP16, SVE2, 32, sve2_supported);
+#endif
+#endif // AARCH64
+
 #ifdef CPU_FEATURES_ARCH_X86_64
 cpu_features::X86Features opt = cpu_features::GetX86Info().features;
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 3e03db6c9..19487bc8d 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -28,6 +28,7 @@
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/F16C.h"
 #include "VecSim/spaces/functions/NEON.h"
+#include "VecSim/spaces/functions/NEON_HP.h"
 #include "VecSim/spaces/functions/NEON_BF16.h"
 #include "VecSim/spaces/functions/SVE.h"
 #include "VecSim/spaces/functions/SVE_BF16.h"
@@ -979,8 +980,8 @@ TEST_P(FP16SpacesOptimizationTest, FP16InnerProductTest) {
     dist_func_t<float> arch_opt_func;
     float baseline = FP16_InnerProduct(v1, v2, dim);
     ASSERT_EQ(baseline, FP32_InnerProduct(v1_fp32, v2_fp32, dim)) << "Baseline check " << dim;
-    // Turn off advanced fp16 flags. They will be tested in the next test.
 #if defined(CPU_FEATURES_ARCH_X86_64)
+    // Turn off advanced fp16 flags. They will be tested in the next test.
     optimization.avx512_fp16 = optimization.avx512vl = 0;
 #ifdef OPT_AVX512F
     if (optimization.avx512f) {
@@ -1004,12 +1005,15 @@
         optimization.f16c = optimization.fma3 = optimization.avx = 0;
     }
 #endif
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+    // Turn off advanced fp16 flags. They will be tested in the next test.
+    optimization.sve = optimization.sve2 = optimization.asimdhp = 0;
 #endif
     unsigned char alignment = 0;
     arch_opt_func = IP_FP16_GetDistFunc(dim, &alignment, &optimization);
     ASSERT_EQ(arch_opt_func, FP16_InnerProduct)
         << "Unexpected distance function chosen for dim " << dim;
-    ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "F16C with dim " << dim;
+    ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim;
     ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
 }
 
@@ -1058,6 +1062,42 @@
         optimization.f16c = optimization.fma3 = optimization.avx = 0;
     }
 #endif
+#elif defined(CPU_FEATURES_ARCH_AARCH64)
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP16_L2_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE2 with dim " << dim;
+        // Unset sve2 flag as well, so we'll choose the next option (default).
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP16_L2_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE with dim " << dim;
+        // Unset sve flag as well, so we'll choose the next option (default).
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (optimization.asimdhp) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP16_L2_implementation_NEON_HP(dim))
+            << "Unexpected distance function chosen for dim OPT_NEON_HP " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "NEON_HP with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment NEON_HP with dim " << dim;
+        optimization.asimdhp = 0;
+    }
+#endif
 #endif
     unsigned char alignment = 0;
     arch_opt_func = L2_FP16_GetDistFunc(dim, &alignment, &optimization);
@@ -1080,35 +1120,43 @@ INSTANTIATE_TEST_SUITE_P(FP16OptFuncs, FP16SpacesOptimizationTest,
  * that has different logic than the float32 and float64 reduce functions.
 * For more info, refer to intel's intrinsics guide.
  */
-#ifdef OPT_AVX512_FP16_VL
+#if defined(OPT_AVX512_FP16_VL) || defined(CPU_FEATURES_ARCH_AARCH64)
 class FP16SpacesOptimizationTestAdvanced : public testing::TestWithParam<size_t> {};
 
 TEST_P(FP16SpacesOptimizationTestAdvanced, FP16InnerProductTestAdv) {
-    auto optimization = cpu_features::GetX86Info().features;
-    if (optimization.avx512_fp16 && optimization.avx512vl) {
-        size_t dim = GetParam();
-        float16 v1[dim], v2[dim];
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = GetParam();
+    float16 v1[dim], v2[dim];
 
-        std::mt19937 gen(42);
-        std::uniform_real_distribution<> dis(-0.99, 0.99);
+    std::mt19937 gen(42);
+    std::uniform_real_distribution<> dis(-0.99, 0.99);
 
-        _Float16 baseline = 0;
-        for (size_t i = 0; i < dim; i++) {
-            float val1 = (dis(gen));
-            float val2 = (dis(gen));
-            v1[i] = vecsim_types::FP32_to_FP16((val1));
-            v2[i] = vecsim_types::FP32_to_FP16((val2));
+#if defined(CPU_FEATURES_ARCH_AARCH64) && defined(__GNUC__) && (__GNUC__ < 13)
+    // https://github.com/pytorch/executorch/issues/6844
+    __fp16 baseline = 0;
+#else
+    _Float16 baseline = 0;
+#endif
 
-            baseline += static_cast<_Float16>(val1) * static_cast<_Float16>(val2);
-        }
-        baseline = _Float16(1) - baseline;
+    for (size_t i = 0; i < dim; i++) {
+        float val1 = (dis(gen));
+        float val2 = (dis(gen));
+        v1[i] = vecsim_types::FP32_to_FP16((val1));
+        v2[i] = vecsim_types::FP32_to_FP16((val2));
 
-        auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
-            size_t elements_in_reg = reg_bit_size / sizeof(float16) / 8;
-            return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(float16) : 0;
-        };
+        baseline += static_cast<decltype(baseline)>(val1) * static_cast<decltype(baseline)>(val2);
+    }
+    baseline = decltype(baseline)(1) - baseline;
 
-        dist_func_t<float> arch_opt_func;
+    auto expected_alignment = [](size_t reg_bit_size, size_t dim) {
+        size_t elements_in_reg = reg_bit_size / sizeof(float16) / 8;
+        return (dim % elements_in_reg == 0) ? elements_in_reg * sizeof(float16) : 0;
+    };
+
+    dist_func_t<float> arch_opt_func;
+
+#ifdef OPT_AVX512_FP16_VL
+    if (optimization.avx512_fp16 && optimization.avx512vl) {
         unsigned char alignment = 0;
         arch_opt_func = IP_FP16_GetDistFunc(dim, &alignment, &optimization);
         ASSERT_EQ(arch_opt_func, Choose_FP16_IP_implementation_AVX512FP16_VL(dim))
@@ -1121,8 +1169,64 @@ TEST_P(FP16SpacesOptimizationTestAdvanced, FP16InnerProductTestAdv) {
             << ", dist: " << dist;
         ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
     }
+#endif
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP16_IP_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        float dist = arch_opt_func(v1, v2, dim);
+        float f_baseline = baseline;
+        float error = std::abs((dist / f_baseline) - 1);
+        // Allow 1% error
+        ASSERT_LE(error, 0.01) << "SVE2 with dim " << dim << ", baseline: " << f_baseline
+                               << ", dist: " << dist;
+        // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "SVE2 with dim " << dim;
+        // Unset sve2 flag as well, so we'll choose the next option (default).
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP16_IP_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        float dist = arch_opt_func(v1, v2, dim);
+        float f_baseline = baseline;
+        float error = std::abs((dist / f_baseline) - 1);
+        // Allow 1% error
+        ASSERT_LE(error, 0.01) << "SVE with dim " << dim << ", baseline: " << f_baseline
+                               << ", dist: " << dist;
+        // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "SVE with dim " << dim;
+        // Unset sve flag as well, so we'll choose the next option (default).
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (optimization.asimdhp) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_FP16_IP_implementation_NEON_HP(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        float dist = arch_opt_func(v1, v2, dim);
+        float f_baseline = baseline;
+        float error = std::abs((dist / f_baseline) - 1);
+        // Allow 1% error
+        ASSERT_LE(error, 0.01) << "NEON_HP with dim " << dim << ", baseline: " << f_baseline
+                               << ", dist: " << dist;
+        // ASSERT_EQ(alignment, expected_alignment(512, dim)) << "NEON_HP with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "NEON_HP with dim " << dim;
+        // Unset asimdhp flag as well, so we'll choose the next option (default).
+        optimization.asimdhp = 0;
+    }
+#endif
 }
 
+#ifdef OPT_AVX512_FP16_VL
 TEST_P(FP16SpacesOptimizationTestAdvanced, FP16L2SqrTestAdv) {
     auto optimization = cpu_features::GetX86Info().features;
     if (optimization.avx512_fp16 && optimization.avx512vl) {
@@ -1162,12 +1266,13 @@ TEST_P(FP16SpacesOptimizationTestAdvanced, FP16L2SqrTestAdv) {
         ASSERT_EQ(alignment, expected_alignment(512, dim)) << "AVX512 with dim " << dim;
     }
 }
+#endif
 
 // Start from a 32 multiplier
 INSTANTIATE_TEST_SUITE_P(, FP16SpacesOptimizationTestAdvanced,
                          testing::Range(512UL, 512 + 32UL + 1));
 
-#endif
+#endif // defined(OPT_AVX512_FP16_VL) || defined(CPU_FEATURES_ARCH_AARCH64)
 
 class INT8SpacesOptimizationTest : public testing::TestWithParam<size_t> {};