|
/*
 * Copyright Redis Ltd. 2021 - present
 * Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
 * the Server Side Public License v1 (SSPLv1).
 */

#include <arm_neon.h>

inline void InnerProduct_Step(const float16_t *&vec1, const float16_t *&vec2, float16x8_t &acc) {
    // Load half-precision vectors and advance both pointers by one 8-lane chunk
    float16x8_t v1 = vld1q_f16(vec1);
    float16x8_t v2 = vld1q_f16(vec2);
    vec1 += 8;
    vec2 += 8;

    // Multiply and accumulate: acc += v1 * v2 (element-wise, fused)
    acc = vfmaq_f16(acc, v1, v2);
}

template <unsigned char residual> // 0..31
float FP16_InnerProduct_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const auto *vec1 = static_cast<const float16_t *>(pVect1v);
    const auto *vec2 = static_cast<const float16_t *>(pVect2v);
    const auto *const v1End = vec1 + dimension;
    float16x8_t acc1 = vdupq_n_f16(0.0f);
    float16x8_t acc2 = vdupq_n_f16(0.0f);
    float16x8_t acc3 = vdupq_n_f16(0.0f);
    float16x8_t acc4 = vdupq_n_f16(0.0f);

    // First, handle the partial chunk residual
    if constexpr (residual % 8) {
        auto constexpr chunk_residual = residual % 8;
        // TODO: special-case some residuals and benchmark whether it's faster
        constexpr uint16x8_t mask = {
            0xFFFF,
            (chunk_residual >= 2) ? 0xFFFF : 0,
            (chunk_residual >= 3) ? 0xFFFF : 0,
            (chunk_residual >= 4) ? 0xFFFF : 0,
            (chunk_residual >= 5) ? 0xFFFF : 0,
            (chunk_residual >= 6) ? 0xFFFF : 0,
            (chunk_residual >= 7) ? 0xFFFF : 0,
            0,
        };

        // Load partial vectors
        float16x8_t v1 = vld1q_f16(vec1);
        float16x8_t v2 = vld1q_f16(vec2);

        // Apply mask to both vectors, zeroing the lanes beyond the residual
        float16x8_t masked_v1 = vbslq_f16(mask, v1, acc1); // `acc1` is still all zeros here
        float16x8_t masked_v2 = vbslq_f16(mask, v2, acc2); // `acc2` is still all zeros here

        // Multiply and accumulate
        acc1 = vfmaq_f16(acc1, masked_v1, masked_v2);

        // Advance pointers
        vec1 += chunk_residual;
        vec2 += chunk_residual;
    }

    // Handle (residual - (residual % 8)) elements in chunks of 8 float16
    if constexpr (residual >= 8)
        InnerProduct_Step(vec1, vec2, acc2);
    if constexpr (residual >= 16)
        InnerProduct_Step(vec1, vec2, acc3);
    if constexpr (residual >= 24)
        InnerProduct_Step(vec1, vec2, acc4);

    // Process the rest of the vectors (the full 32-element chunks)
    while (vec1 < v1End) {
        // TODO: use `vld1q_f16_x4` for quad-loading?
        InnerProduct_Step(vec1, vec2, acc1);
        InnerProduct_Step(vec1, vec2, acc2);
        InnerProduct_Step(vec1, vec2, acc3);
        InnerProduct_Step(vec1, vec2, acc4);
    }

    // Combine the four accumulators pairwise into one
    acc1 = vpaddq_f16(acc1, acc3);
    acc2 = vpaddq_f16(acc2, acc4);
    acc1 = vpaddq_f16(acc1, acc2);

    // Horizontal sum of the accumulated values, widened to float32
    float32x4_t sum_f32 = vcvt_f32_f16(vget_low_f16(acc1));
    sum_f32 = vaddq_f32(sum_f32, vcvt_f32_f16(vget_high_f16(acc1)));

    // Pairwise add to get the horizontal sum
    float32x2_t sum_2 = vadd_f32(vget_low_f32(sum_f32), vget_high_f32(sum_f32));
    sum_2 = vpadd_f32(sum_2, sum_2);

    // Return the inner-product distance: 1 - dot(vec1, vec2)
    return 1.0f - vget_lane_f32(sum_2, 0);
}
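
// --- Usage sketch (illustration only, not part of the original change) ---
// The `residual` template parameter is expected to be `dimension % 32`, so a caller
// would typically instantiate all 32 variants once and select one when the index is
// configured. The table builder and dispatcher below are hypothetical helpers, shown
// only to illustrate how the templated kernel might be chosen at runtime.

#include <array>
#include <cstddef>
#include <utility>

using FP16DistFn = float (*)(const void *, const void *, size_t);

template <std::size_t... Is>
static constexpr std::array<FP16DistFn, sizeof...(Is)>
FP16_IP_BuildTable(std::index_sequence<Is...>) {
    // One entry per possible residual value (0..31)
    return {{FP16_InnerProduct_NEON_HP<Is>...}};
}

static inline float FP16_InnerProduct_NEON_HP_AnyDim(const void *pVect1v, const void *pVect2v,
                                                     size_t dimension) {
    static constexpr auto table = FP16_IP_BuildTable(std::make_index_sequence<32>{});
    return table[dimension % 32](pVect1v, pVect2v, dimension);
}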