diff --git a/cmake/aarch64InstructionFlags.cmake b/cmake/aarch64InstructionFlags.cmake
index c1160a052..9810485aa 100644
--- a/cmake/aarch64InstructionFlags.cmake
+++ b/cmake/aarch64InstructionFlags.cmake
@@ -8,6 +8,8 @@ CHECK_CXX_COMPILER_FLAG("-march=armv7-a+neon" CXX_ARMV7_NEON)
 CHECK_CXX_COMPILER_FLAG("-march=armv8-a" CXX_ARMV8A)
 CHECK_CXX_COMPILER_FLAG("-march=armv8-a+sve" CXX_SVE)
 CHECK_CXX_COMPILER_FLAG("-march=armv9-a+sve2" CXX_SVE2)
+CHECK_CXX_COMPILER_FLAG("-march=armv8.2-a+bf16" CXX_NEON_BF16)
+CHECK_CXX_COMPILER_FLAG("-march=armv8.2-a+sve+bf16" CXX_SVE_BF16)
 
 # Only use ARMv9 if both compiler and CPU support it
 if(CXX_SVE2)
@@ -17,6 +19,12 @@ endif()
 if (CXX_ARMV8A OR CXX_ARMV7_NEON)
     add_compile_definitions(OPT_NEON)
 endif()
+if (CXX_NEON_BF16)
+    add_compile_definitions(OPT_NEON_BF16)
+endif()
 if (CXX_SVE)
     add_compile_definitions(OPT_SVE)
 endif()
+if (CXX_SVE_BF16)
+    add_compile_definitions(OPT_SVE_BF16)
+endif()
diff --git a/src/VecSim/spaces/CMakeLists.txt b/src/VecSim/spaces/CMakeLists.txt
index 790464900..f0e354265 100644
--- a/src/VecSim/spaces/CMakeLists.txt
+++ b/src/VecSim/spaces/CMakeLists.txt
@@ -91,6 +91,13 @@ if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)")
         list(APPEND OPTIMIZATIONS functions/NEON.cpp)
     endif()
 
+    # NEON bfloat16 support
+    if (CXX_NEON_BF16)
+        message("Building with NEON + BF16")
+        set_source_files_properties(functions/NEON_BF16.cpp PROPERTIES COMPILE_FLAGS "-march=armv8.2-a+bf16")
+        list(APPEND OPTIMIZATIONS functions/NEON_BF16.cpp)
+    endif()
+
     # SVE support
     if (CXX_SVE)
         message("Building with SVE")
@@ -98,6 +105,13 @@ if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "(aarch64)|(arm64)|(ARM64)|(armv.*)")
         list(APPEND OPTIMIZATIONS functions/SVE.cpp)
     endif()
 
+    # SVE with BF16 support
+    if (CXX_SVE_BF16)
+        message("Building with SVE + BF16")
+        set_source_files_properties(functions/SVE_BF16.cpp PROPERTIES COMPILE_FLAGS "-march=armv8.2-a+sve+bf16")
+        list(APPEND OPTIMIZATIONS functions/SVE_BF16.cpp)
+    endif()
+
     # SVE2 support
     if (CXX_SVE2)
         message("Building with ARMV9A and SVE2")
diff --git a/src/VecSim/spaces/IP/IP_NEON_BF16.h b/src/VecSim/spaces/IP/IP_NEON_BF16.h
new file mode 100644
index 000000000..4decc0a37
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_NEON_BF16.h
@@ -0,0 +1,89 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include <arm_neon.h>
+
+inline void InnerProduct_Step(const bfloat16_t *&vec1, const bfloat16_t *&vec2, float32x4_t &acc) {
+    // Load brain-half-precision vectors
+    bfloat16x8_t v1 = vld1q_bf16(vec1);
+    bfloat16x8_t v2 = vld1q_bf16(vec2);
+    vec1 += 8;
+    vec2 += 8;
+    // Compute multiplications and add to the accumulator
+    acc = vbfdotq_f32(acc, v1, v2);
+}
+
+template <unsigned char residual> // 0..31
+float BF16_InnerProduct_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *vec1 = static_cast<const bfloat16_t *>(pVect1v);
+    const auto *vec2 = static_cast<const bfloat16_t *>(pVect2v);
+    const auto *const v1End = vec1 + dimension;
+    float32x4_t acc1 = vdupq_n_f32(0.0f);
+    float32x4_t acc2 = vdupq_n_f32(0.0f);
+    float32x4_t acc3 = vdupq_n_f32(0.0f);
+    float32x4_t acc4 = vdupq_n_f32(0.0f);
+
+    // First, handle the partial chunk residual
+    if constexpr (residual % 8) {
+        auto constexpr chunk_residual = residual % 8;
+        // TODO: special-case some residuals and benchmark whether it's better
+        constexpr uint16x8_t mask = {
+            0xFFFF,
+            (chunk_residual >= 2) ? 0xFFFF : 0,
+            (chunk_residual >= 3) ? 0xFFFF : 0,
+            (chunk_residual >= 4) ? 0xFFFF : 0,
+            (chunk_residual >= 5) ? 0xFFFF : 0,
+            (chunk_residual >= 6) ? 0xFFFF : 0,
+            (chunk_residual >= 7) ? 0xFFFF : 0,
+            0,
+        };
+
+        // Load partial vectors
+        bfloat16x8_t v1 = vld1q_bf16(vec1);
+        bfloat16x8_t v2 = vld1q_bf16(vec2);
+
+        // Apply mask to both vectors
+        bfloat16x8_t masked_v1 =
+            vreinterpretq_bf16_u16(vandq_u16(vreinterpretq_u16_bf16(v1), mask));
+        bfloat16x8_t masked_v2 =
+            vreinterpretq_bf16_u16(vandq_u16(vreinterpretq_u16_bf16(v2), mask));
+
+        acc1 = vbfdotq_f32(acc1, masked_v1, masked_v2);
+
+        // Advance pointers
+        vec1 += chunk_residual;
+        vec2 += chunk_residual;
+    }
+
+    // Handle (residual - (residual % 8)) in chunks of 8 bfloat16
+    if constexpr (residual >= 8)
+        InnerProduct_Step(vec1, vec2, acc2);
+    if constexpr (residual >= 16)
+        InnerProduct_Step(vec1, vec2, acc3);
+    if constexpr (residual >= 24)
+        InnerProduct_Step(vec1, vec2, acc4);
+
+    // Process the rest of the vectors (the full chunks part)
+    while (vec1 < v1End) {
+        // TODO: use `vld1q_bf16_x4` for quad-loading?
+        InnerProduct_Step(vec1, vec2, acc1);
+        InnerProduct_Step(vec1, vec2, acc2);
+        InnerProduct_Step(vec1, vec2, acc3);
+        InnerProduct_Step(vec1, vec2, acc4);
+    }
+
+    // Combine the accumulators
+    acc1 = vpaddq_f32(acc1, acc3);
+    acc2 = vpaddq_f32(acc2, acc4);
+    acc1 = vpaddq_f32(acc1, acc2);
+
+    // Pairwise add to get the horizontal sum
+    float32x2_t folded = vadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
+    folded = vpadd_f32(folded, folded);
+
+    // Extract result
+    return 1.0f - vget_lane_f32(folded, 0);
+}
diff --git a/src/VecSim/spaces/IP/IP_SVE_BF16.h b/src/VecSim/spaces/IP/IP_SVE_BF16.h
new file mode 100644
index 000000000..e3980422d
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_SVE_BF16.h
@@ -0,0 +1,71 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include <arm_sve.h>
+
+inline void InnerProduct_Step(const bfloat16_t *vec1, const bfloat16_t *vec2, svfloat32_t &acc,
+                              size_t &offset, const size_t chunk) {
+    svbool_t all = svptrue_b16();
+
+    // Load brain-half-precision vectors.
+    svbfloat16_t v1 = svld1_bf16(all, vec1 + offset);
+    svbfloat16_t v2 = svld1_bf16(all, vec2 + offset);
+    // Compute multiplications and add to the accumulator
+    acc = svbfdot(acc, v1, v2);
+
+    // Move to the next chunk
+    offset += chunk;
+}
+
+template <bool partial_chunk, unsigned char additional_steps> // [t/f, 0..3]
+float BF16_InnerProduct_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *vec1 = static_cast<const bfloat16_t *>(pVect1v);
+    const auto *vec2 = static_cast<const bfloat16_t *>(pVect2v);
+    const size_t chunk = svcnth(); // number of 16-bit elements in a register
+    svfloat32_t acc1 = svdup_f32(0.0f);
+    svfloat32_t acc2 = svdup_f32(0.0f);
+    svfloat32_t acc3 = svdup_f32(0.0f);
+    svfloat32_t acc4 = svdup_f32(0.0f);
+    size_t offset = 0;
+
+    // Process all full vectors
+    const size_t full_iterations = dimension / chunk / 4;
+    for (size_t iter = 0; iter < full_iterations; iter++) {
+        InnerProduct_Step(vec1, vec2, acc1, offset, chunk);
+        InnerProduct_Step(vec1, vec2, acc2, offset, chunk);
+        InnerProduct_Step(vec1, vec2, acc3, offset, chunk);
+        InnerProduct_Step(vec1, vec2, acc4, offset, chunk);
+    }
+
+    // Perform between 0 and 3 additional steps, according to the `additional_steps` value
+    if constexpr (additional_steps >= 1)
+        InnerProduct_Step(vec1, vec2, acc1, offset, chunk);
+    if constexpr (additional_steps >= 2)
+        InnerProduct_Step(vec1, vec2, acc2, offset, chunk);
+    if constexpr (additional_steps >= 3)
+        InnerProduct_Step(vec1, vec2, acc3, offset, chunk);
+
+    // Handle the tail with the residual predicate
+    if constexpr (partial_chunk) {
+        svbool_t pg = svwhilelt_b16_u64(offset, dimension);
+
+        // Load brain-half-precision vectors.
+        // Inactive elements are zeros, according to the docs
+        svbfloat16_t v1 = svld1_bf16(pg, vec1 + offset);
+        svbfloat16_t v2 = svld1_bf16(pg, vec2 + offset);
+        // Compute multiplications and add to the accumulator.
+        acc4 = svbfdot(acc4, v1, v2);
+    }
+
+    // Combine the accumulators
+    acc1 = svadd_f32_x(svptrue_b32(), acc1, acc3);
+    acc2 = svadd_f32_x(svptrue_b32(), acc2, acc4);
+    acc1 = svadd_f32_x(svptrue_b32(), acc1, acc2);
+
+    // Reduce the accumulated sum.
+    float result = svaddv_f32(svptrue_b32(), acc1);
+    return 1.0f - result;
+}
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index 56aa35258..1d23c587b 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -20,7 +20,9 @@
 #include "VecSim/spaces/functions/AVX2.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/NEON.h"
+#include "VecSim/spaces/functions/NEON_BF16.h"
 #include "VecSim/spaces/functions/SVE.h"
+#include "VecSim/spaces/functions/SVE_BF16.h"
 #include "VecSim/spaces/functions/SVE2.h"
 
 using bfloat16 = vecsim_types::bfloat16;
@@ -154,13 +156,27 @@ dist_func_t<float> IP_BF16_GetDistFunc(size_t dim, unsigned char *alignment, con
     if (!is_little_endian()) {
         return BF16_InnerProduct_BigEndian;
     }
+    auto features = getCpuOptimizationFeatures(arch_opt);
+
+#if defined(CPU_FEATURES_ARCH_AARCH64)
+#ifdef OPT_SVE_BF16
+    if (features.svebf16) {
+        return Choose_BF16_IP_implementation_SVE_BF16(dim);
+    }
+#endif
+#ifdef OPT_NEON_BF16
+    if (features.bf16 && dim >= 8) { // Optimization assumes at least 8 BF16s (a full chunk)
+        return Choose_BF16_IP_implementation_NEON_BF16(dim);
+    }
+#endif
+#endif // AARCH64
+
+#if defined(CPU_FEATURES_ARCH_X86_64)
     // Optimizations assume at least 32 bfloats. If we have less, we use the naive implementation.
     if (dim < 32) {
         return ret_dist_func;
     }
-#ifdef CPU_FEATURES_ARCH_X86_64
-    auto features = getCpuOptimizationFeatures(arch_opt);
 #ifdef OPT_AVX512_BF16_VL
     if (features.avx512_bf16 && features.avx512vl) {
         if (dim % 32 == 0) // no point in aligning if we have an offsetting residual
diff --git a/src/VecSim/spaces/L2/L2_NEON_BF16.h b/src/VecSim/spaces/L2/L2_NEON_BF16.h
new file mode 100644
index 000000000..34ba96b2b
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_NEON_BF16.h
@@ -0,0 +1,103 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include <arm_neon.h>
+
+// Assumes little-endianness
+inline void L2Sqr_Op(float32x4_t &acc, bfloat16x8_t &v1, bfloat16x8_t &v2) {
+    float32x4_t v1_lo = vcvtq_low_f32_bf16(v1);
+    float32x4_t v2_lo = vcvtq_low_f32_bf16(v2);
+    float32x4_t diff_lo = vsubq_f32(v1_lo, v2_lo);
+
+    acc = vfmaq_f32(acc, diff_lo, diff_lo);
+
+    float32x4_t v1_hi = vcvtq_high_f32_bf16(v1);
+    float32x4_t v2_hi = vcvtq_high_f32_bf16(v2);
+    float32x4_t diff_hi = vsubq_f32(v1_hi, v2_hi);
+
+    acc = vfmaq_f32(acc, diff_hi, diff_hi);
+}
+
+inline void L2Sqr_Step(const bfloat16_t *&vec1, const bfloat16_t *&vec2, float32x4_t &acc) {
+    // Load brain-half-precision vectors
+    bfloat16x8_t v1 = vld1q_bf16(vec1);
+    bfloat16x8_t v2 = vld1q_bf16(vec2);
+    vec1 += 8;
+    vec2 += 8;
+    L2Sqr_Op(acc, v1, v2);
+}
+
+template <unsigned char residual> // 0..31
+float BF16_L2Sqr_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *vec1 = static_cast<const bfloat16_t *>(pVect1v);
+    const auto *vec2 = static_cast<const bfloat16_t *>(pVect2v);
+    const auto *const v1End = vec1 + dimension;
+    float32x4_t acc1 = vdupq_n_f32(0.0f);
+    float32x4_t acc2 = vdupq_n_f32(0.0f);
+    float32x4_t acc3 = vdupq_n_f32(0.0f);
+    float32x4_t acc4 = vdupq_n_f32(0.0f);
+
+    // First, handle the partial chunk residual
+    if constexpr (residual % 8) {
+        auto constexpr chunk_residual = residual % 8;
+        // TODO: special-case some residuals and benchmark whether it's better
+        constexpr uint16x8_t mask = {
+            0xFFFF,
+            (chunk_residual >= 2) ? 0xFFFF : 0,
+            (chunk_residual >= 3) ? 0xFFFF : 0,
+            (chunk_residual >= 4) ? 0xFFFF : 0,
+            (chunk_residual >= 5) ? 0xFFFF : 0,
+            (chunk_residual >= 6) ? 0xFFFF : 0,
+            (chunk_residual >= 7) ? 0xFFFF : 0,
+            0,
+        };
+
+        // Load partial vectors
+        bfloat16x8_t v1 = vld1q_bf16(vec1);
+        bfloat16x8_t v2 = vld1q_bf16(vec2);
+
+        // Apply mask to both vectors
+        bfloat16x8_t masked_v1 =
+            vreinterpretq_bf16_u16(vandq_u16(vreinterpretq_u16_bf16(v1), mask));
+        bfloat16x8_t masked_v2 =
+            vreinterpretq_bf16_u16(vandq_u16(vreinterpretq_u16_bf16(v2), mask));
+
+        L2Sqr_Op(acc1, masked_v1, masked_v2);
+
+        // Advance pointers
+        vec1 += chunk_residual;
+        vec2 += chunk_residual;
+    }
+
+    // Handle (residual - (residual % 8)) in chunks of 8 bfloat16
+    if constexpr (residual >= 8)
+        L2Sqr_Step(vec1, vec2, acc2);
+    if constexpr (residual >= 16)
+        L2Sqr_Step(vec1, vec2, acc3);
+    if constexpr (residual >= 24)
+        L2Sqr_Step(vec1, vec2, acc4);
+
+    // Process the rest of the vectors (the full chunks part)
+    while (vec1 < v1End) {
+        // TODO: use `vld1q_bf16_x4` for quad-loading?
+        L2Sqr_Step(vec1, vec2, acc1);
+        L2Sqr_Step(vec1, vec2, acc2);
+        L2Sqr_Step(vec1, vec2, acc3);
+        L2Sqr_Step(vec1, vec2, acc4);
+    }
+
+    // Combine the accumulators
+    acc1 = vpaddq_f32(acc1, acc3);
+    acc2 = vpaddq_f32(acc2, acc4);
+    acc1 = vpaddq_f32(acc1, acc2);
+
+    // Pairwise add to get the horizontal sum
+    float32x2_t folded = vadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
+    folded = vpadd_f32(folded, folded);
+
+    // Extract result
+    return vget_lane_f32(folded, 0);
+}
diff --git a/src/VecSim/spaces/L2/L2_SVE_BF16.h b/src/VecSim/spaces/L2/L2_SVE_BF16.h
new file mode 100644
index 000000000..d3326a19f
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_SVE_BF16.h
@@ -0,0 +1,86 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include <arm_sve.h>
+
+// Assumes little-endianness
+inline void L2Sqr_Op(svfloat32_t &acc, svbfloat16_t &v1, svbfloat16_t &v2) {
+    svfloat32_t v1_lo = svreinterpret_f32(svzip1(svdup_u16(0), svreinterpret_u16(v1)));
+    svfloat32_t v2_lo = svreinterpret_f32(svzip1(svdup_u16(0), svreinterpret_u16(v2)));
+    svfloat32_t diff_lo = svsub_f32_x(svptrue_b32(), v1_lo, v2_lo);
+
+    acc = svmla_f32_x(svptrue_b32(), acc, diff_lo, diff_lo);
+
+    svfloat32_t v1_hi = svreinterpret_f32(svzip2(svdup_u16(0), svreinterpret_u16(v1)));
+    svfloat32_t v2_hi = svreinterpret_f32(svzip2(svdup_u16(0), svreinterpret_u16(v2)));
+    svfloat32_t diff_hi = svsub_f32_x(svptrue_b32(), v1_hi, v2_hi);
+
+    acc = svmla_f32_x(svptrue_b32(), acc, diff_hi, diff_hi);
+}
+
+inline void L2Sqr_Step(const bfloat16_t *vec1, const bfloat16_t *vec2, svfloat32_t &acc,
+                       size_t &offset, const size_t chunk) {
+    svbool_t all = svptrue_b16();
+
+    // Load brain-half-precision vectors.
+    svbfloat16_t v1 = svld1_bf16(all, vec1 + offset);
+    svbfloat16_t v2 = svld1_bf16(all, vec2 + offset);
+    // Compute the squared differences and add them to the accumulator
+    L2Sqr_Op(acc, v1, v2);
+
+    // Move to the next chunk
+    offset += chunk;
+}
+
+template <bool partial_chunk, unsigned char additional_steps> // [t/f, 0..3]
+float BF16_L2Sqr_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *vec1 = static_cast<const bfloat16_t *>(pVect1v);
+    const auto *vec2 = static_cast<const bfloat16_t *>(pVect2v);
+    const size_t chunk = svcnth(); // number of 16-bit elements in a register
+    svfloat32_t acc1 = svdup_f32(0.0f);
+    svfloat32_t acc2 = svdup_f32(0.0f);
+    svfloat32_t acc3 = svdup_f32(0.0f);
+    svfloat32_t acc4 = svdup_f32(0.0f);
+    size_t offset = 0;
+
+    // Process all full vectors
+    const size_t full_iterations = dimension / chunk / 4;
+    for (size_t iter = 0; iter < full_iterations; iter++) {
+        L2Sqr_Step(vec1, vec2, acc1, offset, chunk);
+        L2Sqr_Step(vec1, vec2, acc2, offset, chunk);
+        L2Sqr_Step(vec1, vec2, acc3, offset, chunk);
+        L2Sqr_Step(vec1, vec2, acc4, offset, chunk);
+    }
+
+    // Perform between 0 and 3 additional steps, according to the `additional_steps` value
+    if constexpr (additional_steps >= 1)
+        L2Sqr_Step(vec1, vec2, acc1, offset, chunk);
+    if constexpr (additional_steps >= 2)
+        L2Sqr_Step(vec1, vec2, acc2, offset, chunk);
+    if constexpr (additional_steps >= 3)
+        L2Sqr_Step(vec1, vec2, acc3, offset, chunk);
+
+    // Handle the tail with the residual predicate
+    if constexpr (partial_chunk) {
+        svbool_t pg = svwhilelt_b16_u64(offset, dimension);
+
+        // Load brain-half-precision vectors.
+        // Inactive elements are zeros, according to the docs
+        svbfloat16_t v1 = svld1_bf16(pg, vec1 + offset);
+        svbfloat16_t v2 = svld1_bf16(pg, vec2 + offset);
+        // Compute the squared differences and add them to the accumulator.
+        L2Sqr_Op(acc4, v1, v2);
+    }
+
+    // Combine the accumulators
+    acc1 = svadd_f32_x(svptrue_b32(), acc1, acc3);
+    acc2 = svadd_f32_x(svptrue_b32(), acc2, acc4);
+    acc1 = svadd_f32_x(svptrue_b32(), acc1, acc2);
+
+    // Reduce the accumulated sum.
+    float result = svaddv_f32(svptrue_b32(), acc1);
+    return result;
+}
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index 3532ae7a6..d1b5306f2 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -19,7 +19,9 @@
 #include "VecSim/spaces/functions/AVX2.h"
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/NEON.h"
+#include "VecSim/spaces/functions/NEON_BF16.h"
 #include "VecSim/spaces/functions/SVE.h"
+#include "VecSim/spaces/functions/SVE_BF16.h"
 #include "VecSim/spaces/functions/SVE2.h"
 
 using bfloat16 = vecsim_types::bfloat16;
@@ -153,12 +155,26 @@ dist_func_t<float> L2_BF16_GetDistFunc(size_t dim, unsigned char *alignment, con
     if (!is_little_endian()) {
         return BF16_L2Sqr_BigEndian;
     }
+    auto features = getCpuOptimizationFeatures(arch_opt);
+
+#if defined(CPU_FEATURES_ARCH_AARCH64)
+#ifdef OPT_SVE_BF16
+    if (features.svebf16) {
+        return Choose_BF16_L2_implementation_SVE_BF16(dim);
+    }
+#endif
+#ifdef OPT_NEON_BF16
+    if (features.bf16 && dim >= 8) { // Optimization assumes at least 8 BF16s (a full chunk)
+        return Choose_BF16_L2_implementation_NEON_BF16(dim);
+    }
+#endif
+#endif // AARCH64
+
+#if defined(CPU_FEATURES_ARCH_X86_64)
     // Optimizations assume at least 32 bfloats. If we have less, we use the naive implementation.
     if (dim < 32) {
         return ret_dist_func;
     }
-#ifdef CPU_FEATURES_ARCH_X86_64
-    auto features = getCpuOptimizationFeatures(arch_opt);
 #ifdef OPT_AVX512_BW_VBMI2
     if (features.avx512bw && features.avx512vbmi2) {
         if (dim % 32 == 0) // no point in aligning if we have an offsetting residual
diff --git a/src/VecSim/spaces/functions/NEON_BF16.cpp b/src/VecSim/spaces/functions/NEON_BF16.cpp
new file mode 100644
index 000000000..e47c619ae
--- /dev/null
+++ b/src/VecSim/spaces/functions/NEON_BF16.cpp
@@ -0,0 +1,30 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "NEON_BF16.h"
+
+#include "VecSim/spaces/L2/L2_NEON_BF16.h"
+#include "VecSim/spaces/IP/IP_NEON_BF16.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+
+dist_func_t<float> Choose_BF16_L2_implementation_NEON_BF16(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, BF16_L2Sqr_NEON);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_BF16_IP_implementation_NEON_BF16(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 32, BF16_InnerProduct_NEON);
+    return ret_dist_func;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/NEON_BF16.h b/src/VecSim/spaces/functions/NEON_BF16.h
new file mode 100644
index 000000000..a7271150a
--- /dev/null
+++ b/src/VecSim/spaces/functions/NEON_BF16.h
@@ -0,0 +1,17 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#pragma once
+
+#include "VecSim/spaces/spaces.h"
+
+namespace spaces {
+
+dist_func_t<float> Choose_BF16_IP_implementation_NEON_BF16(size_t dim);
+
+dist_func_t<float> Choose_BF16_L2_implementation_NEON_BF16(size_t dim);
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE_BF16.cpp b/src/VecSim/spaces/functions/SVE_BF16.cpp
new file mode 100644
index 000000000..c881eca38
--- /dev/null
+++ b/src/VecSim/spaces/functions/SVE_BF16.cpp
@@ -0,0 +1,29 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#include "SVE_BF16.h"
+
+#include "VecSim/spaces/IP/IP_SVE_BF16.h"
+#include "VecSim/spaces/L2/L2_SVE_BF16.h"
+
+namespace spaces {
+
+#include "implementation_chooser.h"
+
+dist_func_t<float> Choose_BF16_IP_implementation_SVE_BF16(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, BF16_InnerProduct_SVE, dim, svcnth);
+    return ret_dist_func;
+}
+dist_func_t<float> Choose_BF16_L2_implementation_SVE_BF16(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, BF16_L2Sqr_SVE, dim, svcnth);
+    return ret_dist_func;
+}
+
+#include "implementation_chooser_cleanup.h"
+
+} // namespace spaces
diff --git a/src/VecSim/spaces/functions/SVE_BF16.h b/src/VecSim/spaces/functions/SVE_BF16.h
new file mode 100644
index 000000000..99bc3deb8
--- /dev/null
+++ b/src/VecSim/spaces/functions/SVE_BF16.h
@@ -0,0 +1,16 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+
+#pragma once
+
+#include "VecSim/spaces/spaces.h"
+
+namespace spaces {
+
+dist_func_t<float> Choose_BF16_IP_implementation_SVE_BF16(size_t dim);
+dist_func_t<float> Choose_BF16_L2_implementation_SVE_BF16(size_t dim);
+
+} // namespace spaces
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces.h b/tests/benchmark/spaces_benchmarks/bm_spaces.h
index 72f4d9eb2..d986d23f2 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces.h
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces.h
@@ -25,7 +25,9 @@
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/SSE.h"
 #include "VecSim/spaces/functions/NEON.h"
+#include "VecSim/spaces/functions/NEON_BF16.h"
 #include "VecSim/spaces/functions/SVE.h"
+#include "VecSim/spaces/functions/SVE_BF16.h"
 #include "VecSim/spaces/functions/SVE2.h"
 #include "bm_macros.h"
 #include "bm_spaces_class.h"
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp
index 27fe82a3d..5457cf128 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_bf16.cpp
@@ -13,6 +13,20 @@ class BM_VecSimSpaces_BF16 : public BM_VecSimSpaces<bfloat16> {
     }
 };
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features;
+
+// NEON BF16 implementation (requires the ARMv8.2-a BF16 extension)
+#ifdef OPT_NEON_BF16
+bool neon_supported = opt.bf16;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_BF16, BF16, NEON_BF16, 32, neon_supported);
+#endif
+#ifdef OPT_SVE_BF16
+bool sve_supported = opt.svebf16; // Check for SVE BF16 support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_BF16, BF16, SVE_BF16, 32, sve_supported);
+#endif
+#endif // AARCH64
+
 #ifdef CPU_FEATURES_ARCH_X86_64
 cpu_features::X86Features opt = cpu_features::GetX86Info().features;
 
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index db122d544..3e03db6c9 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -28,7 +28,9 @@
 #include "VecSim/spaces/functions/SSE3.h"
 #include "VecSim/spaces/functions/F16C.h"
 #include "VecSim/spaces/functions/NEON.h"
+#include "VecSim/spaces/functions/NEON_BF16.h"
 #include "VecSim/spaces/functions/SVE.h"
+#include "VecSim/spaces/functions/SVE_BF16.h"
 #include "VecSim/spaces/functions/SVE2.h"
 
 #include "tests_utils.h"
@@ -841,6 +843,28 @@ TEST_P(BF16SpacesOptimizationTest, BF16InnerProductTest) {
         ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim;
         optimization.sse3 = 0;
     }
+#endif
+#ifdef OPT_SVE_BF16
+    if (optimization.svebf16) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_BF16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_BF16_IP_implementation_SVE_BF16(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v, v2, dim)) << "SVE_BF16 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "SVE_BF16 with dim " << dim;
+        optimization.svebf16 = 0;
+    }
+#endif
+#ifdef OPT_NEON_BF16
+    if (optimization.bf16) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_BF16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_BF16_IP_implementation_NEON_BF16(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v, v2, dim)) << "NEON_BF16 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "NEON_BF16 with dim " << dim;
+        optimization.bf16 = 0;
+    }
 #endif
     unsigned char alignment = 0;
     arch_opt_func = IP_BF16_GetDistFunc(dim, &alignment, &optimization);
@@ -899,6 +923,28 @@ TEST_P(BF16SpacesOptimizationTest, BF16L2SqrTest) {
         ASSERT_EQ(alignment, expected_alignment(128, dim)) << "SSE with dim " << dim;
         optimization.sse3 = 0;
     }
+#endif
+#ifdef OPT_SVE_BF16
+    if (optimization.svebf16) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_BF16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_BF16_L2_implementation_SVE_BF16(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v, v2, dim)) << "SVE_BF16 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "SVE_BF16 with dim " << dim;
+        optimization.svebf16 = 0;
+    }
+#endif
+#ifdef OPT_NEON_BF16
+    if (optimization.bf16) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_BF16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_BF16_L2_implementation_NEON_BF16(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_EQ(baseline, arch_opt_func(v, v2, dim)) << "NEON_BF16 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "NEON_BF16 with dim " << dim;
+        optimization.bf16 = 0;
+    }
 #endif
     unsigned char alignment = 0;
     arch_opt_func = L2_BF16_GetDistFunc(dim, &alignment, &optimization);
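
Reviewer note (not part of the patch): the SIMD kernels above should agree with a plain scalar reference up to floating-point summation order, since every bf16 x bf16 product is exactly representable in float32 (8-bit significands). The sketch below is a hypothetical sanity-check helper, not a VecSim API; `bf16_to_float`, `bf16_ip_distance`, and `bf16_l2sqr` are made-up names, and it assumes the same little-endian bfloat16 layout the kernels do.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical scalar reference: a bfloat16 is the high 16 bits of an
// IEEE-754 float32 (little-endian assumption, as in the kernels above).
static float bf16_to_float(uint16_t b) {
    uint32_t bits = static_cast<uint32_t>(b) << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}

// Scalar counterpart of BF16_InnerProduct_NEON/SVE: both return 1 - <v1, v2>.
static float bf16_ip_distance(const uint16_t *v1, const uint16_t *v2, size_t dim) {
    float ip = 0.0f;
    for (size_t i = 0; i < dim; i++)
        ip += bf16_to_float(v1[i]) * bf16_to_float(v2[i]);
    return 1.0f - ip;
}

// Scalar counterpart of BF16_L2Sqr_NEON/SVE: the sum of squared differences.
static float bf16_l2sqr(const uint16_t *v1, const uint16_t *v2, size_t dim) {
    float sum = 0.0f;
    for (size_t i = 0; i < dim; i++) {
        float d = bf16_to_float(v1[i]) - bf16_to_float(v2[i]);
        sum += d * d;
    }
    return sum;
}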