Merged

Changes from all commits
102 changes: 102 additions & 0 deletions src/VecSim/spaces/IP/IP_SVE_INT8.h
@@ -0,0 +1,102 @@
/*
*Copyright Redis Ltd. 2021 - present
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
*the Server Side Public License v1 (SSPLv1).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include <arm_sve.h>

inline void InnerProductStep(const int8_t *&pVect1, const int8_t *&pVect2, size_t &offset,
svint32_t &sum, const size_t chunk) {
svbool_t pg = svptrue_b8();

// Load int8 vectors
svint8_t v1_i8 = svld1_s8(pg, pVect1 + offset);
svint8_t v2_i8 = svld1_s8(pg, pVect2 + offset);

sum = svdot_s32(sum, v1_i8, v2_i8);

offset += chunk; // Move to the next set of int8 elements
}
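
// For intuition: with a 128-bit register (vl = 16 bytes), the svdot above computes,
// for each of the four int32 lanes j:
//   sum[j] += v1[4j]*v2[4j] + v1[4j+1]*v2[4j+1] + v1[4j+2]*v2[4j+2] + v1[4j+3]*v2[4j+3]
// i.e., one call folds 16 int8 products into 4 int32 lanes.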

template <bool partial_chunk, unsigned char additional_steps>
float INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
const int8_t *pVect1 = reinterpret_cast<const int8_t *>(pVect1v);
const int8_t *pVect2 = reinterpret_cast<const int8_t *>(pVect2v);

size_t offset = 0;
const size_t vl = svcntb();
const size_t chunk_size = 4 * vl;

// Each InnerProductStep adds at most 4 * (2^7 * 2^7) = 2^16 to every int32 lane
// (svdot accumulates 4 adjacent int8 products, each bounded by 2^7 * 2^7 = 2^14)
// Therefore, a single accumulator can take 2^15 steps before overflowing
// That scenario would require a vector dimension larger than 16*4*2^15 = 2^21
// (16 int8 in one 128-bit SVE register) * (4 accumulators) * (2^15 steps)
// We can safely assume that the dimension is smaller than that
// So using int32_t is safe

svint32_t sum0 = svdup_s32(0);
svint32_t sum1 = svdup_s32(0);
svint32_t sum2 = svdup_s32(0);
svint32_t sum3 = svdup_s32(0);

size_t num_chunks = dimension / chunk_size;

for (size_t i = 0; i < num_chunks; ++i) {
InnerProductStep(pVect1, pVect2, offset, sum0, vl);
InnerProductStep(pVect1, pVect2, offset, sum1, vl);
InnerProductStep(pVect1, pVect2, offset, sum2, vl);
InnerProductStep(pVect1, pVect2, offset, sum3, vl);
}

// Process the remaining 0-3 complete SVE vectors that didn't fit into the 4-wide main loop,
// one full-vector step per leftover accumulator
if constexpr (additional_steps > 0) {
if constexpr (additional_steps >= 1) {
InnerProductStep(pVect1, pVect2, offset, sum0, vl);
}
if constexpr (additional_steps >= 2) {
InnerProductStep(pVect1, pVect2, offset, sum1, vl);
}
if constexpr (additional_steps >= 3) {
InnerProductStep(pVect1, pVect2, offset, sum2, vl);
}
}

if constexpr (partial_chunk) {
svbool_t pg = svwhilelt_b8_u64(offset, dimension);
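// svwhilelt_b8 activates only the first (dimension - offset) byte lanes; predicated
// svld1 zeroes the inactive lanes, so they contribute nothing to the dot product.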

svint8_t v1_i8 = svld1_s8(pg, pVect1 + offset); // Load int8 vectors
svint8_t v2_i8 = svld1_s8(pg, pVect2 + offset); // Load int8 vectors

sum3 = svdot_s32(sum3, v1_i8, v2_i8);

pVect1 += vl;
pVect2 += vl;
}

// Pairwise-reduce the four accumulators, then horizontally sum across lanes
sum0 = svadd_s32_x(svptrue_b32(), sum0, sum1);
sum2 = svadd_s32_x(svptrue_b32(), sum2, sum3);

int32_t sum_all = svaddv_s32(svptrue_b32(), svadd_s32_x(svptrue_b32(), sum0, sum2));

return sum_all;
}
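
// Note: VecSim distances are "smaller is more similar", so the wrappers below return
// 1 - ip rather than the raw inner product.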

template <bool partial_chunk, unsigned char additional_steps>
float INT8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f -
INT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
}

template <bool partial_chunk, unsigned char additional_steps>
float INT8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
float ip = INT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
float norm_v1 =
*reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect1v) + dimension);
float norm_v2 =
*reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect2v) + dimension);
return 1.0f - ip / (norm_v1 * norm_v2);
}
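
Aside (not part of the diff): the Choose_INT8_IP_implementation_SVE selector referenced in IP_space.cpp below is not shown in this section. Here is a minimal sketch of how such a chooser could map a runtime dimension onto the two template parameters of the 4-wide loop above; the function name suffix and switch layout are illustrative assumptions, not the repository's actual dispatcher:

static dist_func_t<float> Choose_INT8_IP_implementation_SVE_sketch(size_t dim) {
    const size_t vl = svcntb();             // int8 lanes per SVE register
    const size_t residual = dim % (4 * vl); // left over after the 4-accumulator loop
    const unsigned char steps = residual / vl;  // 0-3 full-register tail steps
    const bool partial = (residual % vl) != 0;  // tail shorter than one register
    switch (steps) {
    case 0:  return partial ? INT8_InnerProductSIMD_SVE<true, 0> : INT8_InnerProductSIMD_SVE<false, 0>;
    case 1:  return partial ? INT8_InnerProductSIMD_SVE<true, 1> : INT8_InnerProductSIMD_SVE<false, 1>;
    case 2:  return partial ? INT8_InnerProductSIMD_SVE<true, 2> : INT8_InnerProductSIMD_SVE<false, 2>;
    default: return partial ? INT8_InnerProductSIMD_SVE<true, 3> : INT8_InnerProductSIMD_SVE<false, 3>;
    }
}
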
102 changes: 102 additions & 0 deletions src/VecSim/spaces/IP/IP_SVE_UINT8.h
@@ -0,0 +1,102 @@
/*
*Copyright Redis Ltd. 2021 - present
*Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
*the Server Side Public License v1 (SSPLv1).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include <arm_sve.h>

inline void InnerProductStep(const uint8_t *&pVect1, const uint8_t *&pVect2, size_t &offset,
svuint32_t &sum, const size_t chunk) {
svbool_t pg = svptrue_b8();

// Load uint8 vectors
svuint8_t v1_ui8 = svld1_u8(pg, pVect1 + offset);
svuint8_t v2_ui8 = svld1_u8(pg, pVect2 + offset);

sum = svdot_u32(sum, v1_ui8, v2_ui8);

offset += chunk; // Move to the next set of uint8 elements
}

template <bool partial_chunk, unsigned char additional_steps>
float UINT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
const uint8_t *pVect1 = reinterpret_cast<const uint8_t *>(pVect1v);
const uint8_t *pVect2 = reinterpret_cast<const uint8_t *>(pVect2v);

size_t offset = 0;
const size_t vl = svcntb();
const size_t chunk_size = 4 * vl;

// Each InnerProductStep adds at most 4 * (2^8 * 2^8) = 2^18 to every uint32 lane
// (svdot accumulates 4 adjacent uint8 products, each bounded by 2^8 * 2^8 = 2^16)
// Therefore, a single accumulator can take 2^14 steps before overflowing
// That scenario would require a vector dimension larger than 16*4*2^14 = 2^20
// (16 uint8 in one 128-bit SVE register) * (4 accumulators) * (2^14 steps)
// We can safely assume that the dimension is smaller than that
// So using uint32_t is safe

svuint32_t sum0 = svdup_u32(0);
svuint32_t sum1 = svdup_u32(0);
svuint32_t sum2 = svdup_u32(0);
svuint32_t sum3 = svdup_u32(0);

size_t num_chunks = dimension / chunk_size;

for (size_t i = 0; i < num_chunks; ++i) {
InnerProductStep(pVect1, pVect2, offset, sum0, vl);
InnerProductStep(pVect1, pVect2, offset, sum1, vl);
InnerProductStep(pVect1, pVect2, offset, sum2, vl);
InnerProductStep(pVect1, pVect2, offset, sum3, vl);
}

// Process the remaining 0-3 complete SVE vectors that didn't fit into the 4-wide main loop,
// one full-vector step per leftover accumulator
if constexpr (additional_steps > 0) {
if constexpr (additional_steps >= 1) {
InnerProductStep(pVect1, pVect2, offset, sum0, vl);
}
if constexpr (additional_steps >= 2) {
InnerProductStep(pVect1, pVect2, offset, sum1, vl);
}
if constexpr (additional_steps >= 3) {
InnerProductStep(pVect1, pVect2, offset, sum2, vl);
}
}

if constexpr (partial_chunk) {
svbool_t pg = svwhilelt_b8_u64(offset, dimension);

svuint8_t v1_ui8 = svld1_u8(pg, pVect1 + offset); // Load uint8 vectors
svuint8_t v2_ui8 = svld1_u8(pg, pVect2 + offset); // Load uint8 vectors

sum3 = svdot_u32(sum3, v1_ui8, v2_ui8);

pVect1 += vl;
pVect2 += vl;
}

// Pairwise-reduce the four accumulators, then horizontally sum across lanes
sum0 = svadd_u32_x(svptrue_b32(), sum0, sum1);
sum2 = svadd_u32_x(svptrue_b32(), sum2, sum3);

uint32_t sum_all = svaddv_u32(svptrue_b32(), svadd_u32_x(svptrue_b32(), sum0, sum2));

return sum_all;
}

template <bool partial_chunk, unsigned char additional_steps>
float UINT8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f -
UINT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
}

template <bool partial_chunk, unsigned char additional_steps>
float UINT8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
float ip = UINT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
float norm_v1 =
*reinterpret_cast<const float *>(static_cast<const uint8_t *>(pVect1v) + dimension);
float norm_v2 =
*reinterpret_cast<const float *>(static_cast<const uint8_t *>(pVect2v) + dimension);
return 1.0f - ip / (norm_v1 * norm_v2);
}
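
Aside (not part of the diff): a plain scalar reference for what the cosine path above computes is handy for testing. This sketch assumes, as the pointer arithmetic above implies, that each vector stores its precomputed norm as a float immediately after its dim uint8 elements; the function name is hypothetical:

#include <cstddef>
#include <cstdint>
#include <cstring>

static float UINT8_Cosine_reference(const void *a, const void *b, size_t dim) {
    const uint8_t *va = static_cast<const uint8_t *>(a);
    const uint8_t *vb = static_cast<const uint8_t *>(b);
    int64_t ip = 0; // wide accumulator; no overflow concern in scalar code
    for (size_t i = 0; i < dim; ++i)
        ip += static_cast<int64_t>(va[i]) * vb[i];
    float na, nb; // the norms stored right after the elements
    std::memcpy(&na, va + dim, sizeof(float));
    std::memcpy(&nb, vb + dim, sizeof(float));
    return 1.0f - static_cast<float>(ip) / (na * nb);
}
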
73 changes: 65 additions & 8 deletions src/VecSim/spaces/IP_space.cpp
@@ -276,12 +276,27 @@ dist_func_t<float> IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment, con
}

dist_func_t<float> ret_dist_func = INT8_InnerProduct;

auto features = getCpuOptimizationFeatures(arch_opt);

#ifdef CPU_FEATURES_ARCH_AARCH64
#ifdef OPT_SVE2
if (features.sve2) {
return Choose_INT8_IP_implementation_SVE2(dim);
}
#endif
#ifdef OPT_SVE
if (features.sve) {
return Choose_INT8_IP_implementation_SVE(dim);
}
#endif
#endif
#ifdef CPU_FEATURES_ARCH_X86_64
// Optimizations assume at least 32 int8. If we have less, we use the naive implementation.
if (dim < 32) {
return ret_dist_func;
}

#ifdef OPT_AVX512_F_BW_VL_VNNI
if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
if (dim % 32 == 0) // no point in aligning if we have an offsetting residual
@@ -301,12 +316,26 @@ dist_func_t<float> Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment,
}

dist_func_t<float> ret_dist_func = INT8_Cosine;

auto features = getCpuOptimizationFeatures(arch_opt);

#ifdef CPU_FEATURES_ARCH_AARCH64
#ifdef OPT_SVE2
if (features.sve2) {
return Choose_INT8_Cosine_implementation_SVE2(dim);
}
#endif
#ifdef OPT_SVE
if (features.sve) {
return Choose_INT8_Cosine_implementation_SVE(dim);
}
#endif
#endif
#ifdef CPU_FEATURES_ARCH_X86_64
// Optimizations assume at least 32 int8. If we have less, we use the naive implementation.
if (dim < 32) {
return ret_dist_func;
}
#ifdef OPT_AVX512_F_BW_VL_VNNI
if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
// For int8 vectors with cosine distance, the extra float for the norm shifts alignment to
@@ -329,12 +358,26 @@ dist_func_t<float> IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment,
}

dist_func_t<float> ret_dist_func = UINT8_InnerProduct;

auto features = getCpuOptimizationFeatures(arch_opt);

#ifdef CPU_FEATURES_ARCH_AARCH64
#ifdef OPT_SVE2
if (features.sve2) {
return Choose_UINT8_IP_implementation_SVE2(dim);
}
#endif
#ifdef OPT_SVE
if (features.sve) {
return Choose_UINT8_IP_implementation_SVE(dim);
}
#endif
#endif
#ifdef CPU_FEATURES_ARCH_X86_64
// Optimizations assume at least 32 uint8. If we have less, we use the naive implementation.
if (dim < 32) {
return ret_dist_func;
}
#ifdef OPT_AVX512_F_BW_VL_VNNI
if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
if (dim % 32 == 0) // no point in aligning if we have an offsetting residual
@@ -354,12 +397,26 @@ dist_func_t<float> Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment
}

dist_func_t<float> ret_dist_func = UINT8_Cosine;

auto features = getCpuOptimizationFeatures(arch_opt);

#ifdef CPU_FEATURES_ARCH_AARCH64
#ifdef OPT_SVE2
if (features.sve2) {
return Choose_UINT8_Cosine_implementation_SVE2(dim);
}
#endif
#ifdef OPT_SVE
if (features.sve) {
return Choose_UINT8_Cosine_implementation_SVE(dim);
}
#endif
#endif
#ifdef CPU_FEATURES_ARCH_X86_64
// Optimizations assume at least 32 uint8. If we have less, we use the naive implementation.
if (dim < 32) {
return ret_dist_func;
}
#ifdef OPT_AVX512_F_BW_VL_VNNI
if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) {
// For uint8 vectors with cosine distance, the extra float for the norm shifts alignment to
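
Aside (not part of the diff): a hypothetical call site for the selectors above. The third argument's type is truncated in this view and is assumed here to be the architecture preference that getCpuOptimizationFeatures() consumes; dim, arch_opt, query_blob, and stored_blob are placeholders.

unsigned char alignment = 0;
dist_func_t<float> dist = IP_UINT8_GetDistFunc(dim, &alignment, arch_opt);
float d = dist(query_blob, stored_blob, dim); // distance = 1 - <query, stored>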