diff --git a/src/VecSim/spaces/IP/IP_SVE_INT8.h b/src/VecSim/spaces/IP/IP_SVE_INT8.h
new file mode 100644
index 000000000..8dc1df313
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_SVE_INT8.h
@@ -0,0 +1,102 @@
+/*
+ *Copyright Redis Ltd. 2021 - present
+ *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or
+ *the Server Side Public License v1 (SSPLv1).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include <arm_sve.h>
+
+inline void InnerProductStep(const int8_t *&pVect1, const int8_t *&pVect2, size_t &offset,
+                             svint32_t &sum, const size_t chunk) {
+    svbool_t pg = svptrue_b8();
+
+    // Load int8 vectors
+    svint8_t v1_i8 = svld1_s8(pg, pVect1 + offset);
+    svint8_t v2_i8 = svld1_s8(pg, pVect2 + offset);
+
+    sum = svdot_s32(sum, v1_i8, v2_i8);
+
+    offset += chunk; // Move to the next set of int8 elements
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float INT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const int8_t *pVect1 = reinterpret_cast<const int8_t *>(pVect1v);
+    const int8_t *pVect2 = reinterpret_cast<const int8_t *>(pVect2v);
+
+    size_t offset = 0;
+    const size_t vl = svcntb();
+    const size_t chunk_size = 4 * vl;
+
+    // Each InnerProductStep adds at most 4 * 2^7 * 2^7 = 2^16 to a 32-bit lane
+    // (svdot sums four int8 * int8 products per lane).
+    // Therefore, a single accumulator can take 2^15 steps before overflowing.
+    // That can happen only if the vector dimension exceeds 16 * 4 * 2^15 = 2^21:
+    // (16 int8 in 1 SVE register) * (4 accumulators) * (2^15 steps).
+    // We can safely assume the dimension is smaller than that, so int32_t accumulators are safe.
+
+    svint32_t sum0 = svdup_s32(0);
+    svint32_t sum1 = svdup_s32(0);
+    svint32_t sum2 = svdup_s32(0);
+    svint32_t sum3 = svdup_s32(0);
+
+    size_t num_chunks = dimension / chunk_size;
+
+    for (size_t i = 0; i < num_chunks; ++i) {
+        InnerProductStep(pVect1, pVect2, offset, sum0, vl);
+        InnerProductStep(pVect1, pVect2, offset, sum1, vl);
+        InnerProductStep(pVect1, pVect2, offset, sum2, vl);
+        InnerProductStep(pVect1, pVect2, offset, sum3, vl);
+    }
+
+    // Process the remaining complete SVE vectors that didn't fit into the main loop.
+    // These are full-vector operations (0-3 of them).
+    if constexpr (additional_steps > 0) {
+        if constexpr (additional_steps >= 1) {
+            InnerProductStep(pVect1, pVect2, offset, sum0, vl);
+        }
+        if constexpr (additional_steps >= 2) {
+            InnerProductStep(pVect1, pVect2, offset, sum1, vl);
+        }
+        if constexpr (additional_steps >= 3) {
+            InnerProductStep(pVect1, pVect2, offset, sum2, vl);
+        }
+    }
+
+    if constexpr (partial_chunk) {
+        svbool_t pg = svwhilelt_b8_u64(offset, dimension);
+
+        svint8_t v1_i8 = svld1_s8(pg, pVect1 + offset); // Load the int8 tail (inactive lanes are zero)
+        svint8_t v2_i8 = svld1_s8(pg, pVect2 + offset); // Load the int8 tail (inactive lanes are zero)
+
+        sum3 = svdot_s32(sum3, v1_i8, v2_i8);
+
+        pVect1 += vl;
+        pVect2 += vl;
+    }
+
+    sum0 = svadd_s32_x(svptrue_b32(), sum0, sum1);
+    sum2 = svadd_s32_x(svptrue_b32(), sum2, sum3);
+
+    // Pairwise vector additions followed by a horizontal sum
+    int32_t sum_all = svaddv_s32(svptrue_b32(), svadd_s32_x(svptrue_b32(), sum0, sum2));
+
+    return sum_all;
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float INT8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return 1.0f -
+           INT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float INT8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    float ip = INT8_InnerProductImp<partial_chunk, additional_steps>(pVect1v, pVect2v, dimension);
+    float norm_v1 =
+        *reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect1v) + dimension);
+    float norm_v2 =
+        *reinterpret_cast<const float *>(static_cast<const int8_t *>(pVect2v) + dimension);
+    return
1.0f - ip / (norm_v1 * norm_v2); +} diff --git a/src/VecSim/spaces/IP/IP_SVE_UINT8.h b/src/VecSim/spaces/IP/IP_SVE_UINT8.h new file mode 100644 index 000000000..daaa8267a --- /dev/null +++ b/src/VecSim/spaces/IP/IP_SVE_UINT8.h @@ -0,0 +1,102 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include + +inline void InnerProductStep(const uint8_t *&pVect1, const uint8_t *&pVect2, size_t &offset, + svuint32_t &sum, const size_t chunk) { + svbool_t pg = svptrue_b8(); + + // Load uint8 vectors + svuint8_t v1_ui8 = svld1_u8(pg, pVect1 + offset); + svuint8_t v2_ui8 = svld1_u8(pg, pVect2 + offset); + + sum = svdot_u32(sum, v1_ui8, v2_ui8); + + offset += chunk; // Move to the next set of uint8 elements +} + +template +float UINT8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { + const uint8_t *pVect1 = reinterpret_cast(pVect1v); + const uint8_t *pVect2 = reinterpret_cast(pVect2v); + + size_t offset = 0; + const size_t vl = svcntb(); + const size_t chunk_size = 4 * vl; + + // Each innerProductStep adds maximum 2^8 & 2^8 = 2^16 + // Therefore, on a single accumulator, we can perform 2^16 steps before overflowing + // That scenario will happen only is the dimension of the vector is larger than 16*4*2^16 = 2^22 + // (16 uint8 in 1 SVE register) * (4 accumulators) * (2^16 steps) + // We can safely assume that the dimension is smaller than that + // So using int32_t is safe + + svuint32_t sum0 = svdup_u32(0); + svuint32_t sum1 = svdup_u32(0); + svuint32_t sum2 = svdup_u32(0); + svuint32_t sum3 = svdup_u32(0); + + size_t num_chunks = dimension / chunk_size; + + for (size_t i = 0; i < num_chunks; ++i) { + InnerProductStep(pVect1, pVect2, offset, sum0, vl); + InnerProductStep(pVect1, pVect2, offset, sum1, vl); + InnerProductStep(pVect1, pVect2, offset, sum2, vl); + InnerProductStep(pVect1, pVect2, offset, sum3, vl); + } + + // Process remaining complete SVE vectors that didn't fit into the main loop + // These are full vector operations (0-3 elements) + if constexpr (additional_steps > 0) { + if constexpr (additional_steps >= 1) { + InnerProductStep(pVect1, pVect2, offset, sum0, vl); + } + if constexpr (additional_steps >= 2) { + InnerProductStep(pVect1, pVect2, offset, sum1, vl); + } + if constexpr (additional_steps >= 3) { + InnerProductStep(pVect1, pVect2, offset, sum2, vl); + } + } + + if constexpr (partial_chunk) { + svbool_t pg = svwhilelt_b8_u64(offset, dimension); + + svuint8_t v1_ui8 = svld1_u8(pg, pVect1 + offset); // Load uint8 vectors + svuint8_t v2_ui8 = svld1_u8(pg, pVect2 + offset); // Load uint8 vectors + + sum3 = svdot_u32(sum3, v1_ui8, v2_ui8); + + pVect1 += vl; + pVect2 += vl; + } + + sum0 = svadd_u32_x(svptrue_b32(), sum0, sum1); + sum2 = svadd_u32_x(svptrue_b32(), sum2, sum3); + + // Perform vector addition in parallel and Horizontal sum + int32_t sum_all = svaddv_u32(svptrue_b32(), svadd_u32_x(svptrue_b32(), sum0, sum2)); + + return sum_all; +} + +template +float UINT8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + return 1.0f - + UINT8_InnerProductImp(pVect1v, pVect2v, dimension); +} + +template +float UINT8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + float ip = UINT8_InnerProductImp(pVect1v, pVect2v, dimension); + float norm_v1 = + *reinterpret_cast(static_cast(pVect1v) + dimension); + float 
norm_v2 = + *reinterpret_cast(static_cast(pVect2v) + dimension); + return 1.0f - ip / (norm_v1 * norm_v2); +} diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index e930ee143..df1f656b5 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -276,12 +276,27 @@ dist_func_t IP_INT8_GetDistFunc(size_t dim, unsigned char *alignment, con } dist_func_t ret_dist_func = INT8_InnerProduct; + + auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_INT8_IP_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_INT8_IP_implementation_SVE(dim); + } +#endif +#endif +#ifdef CPU_FEATURES_ARCH_X86_64 // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. if (dim < 32) { return ret_dist_func; } -#ifdef CPU_FEATURES_ARCH_X86_64 - auto features = getCpuOptimizationFeatures(arch_opt); + #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { if (dim % 32 == 0) // no point in aligning if we have an offsetting residual @@ -301,12 +316,26 @@ dist_func_t Cosine_INT8_GetDistFunc(size_t dim, unsigned char *alignment, } dist_func_t ret_dist_func = INT8_Cosine; + + auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_INT8_Cosine_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_INT8_Cosine_implementation_SVE(dim); + } +#endif +#endif +#ifdef CPU_FEATURES_ARCH_X86_64 // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. if (dim < 32) { return ret_dist_func; } -#ifdef CPU_FEATURES_ARCH_X86_64 - auto features = getCpuOptimizationFeatures(arch_opt); #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { // For int8 vectors with cosine distance, the extra float for the norm shifts alignment to @@ -329,12 +358,26 @@ dist_func_t IP_UINT8_GetDistFunc(size_t dim, unsigned char *alignment, } dist_func_t ret_dist_func = UINT8_InnerProduct; + + auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_UINT8_IP_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_UINT8_IP_implementation_SVE(dim); + } +#endif +#endif +#ifdef CPU_FEATURES_ARCH_X86_64 // Optimizations assume at least 32 uint8. If we have less, we use the naive implementation. if (dim < 32) { return ret_dist_func; } -#ifdef CPU_FEATURES_ARCH_X86_64 - auto features = getCpuOptimizationFeatures(arch_opt); #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { if (dim % 32 == 0) // no point in aligning if we have an offsetting residual @@ -354,12 +397,26 @@ dist_func_t Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment } dist_func_t ret_dist_func = UINT8_Cosine; + + auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_UINT8_Cosine_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_UINT8_Cosine_implementation_SVE(dim); + } +#endif +#endif +#ifdef CPU_FEATURES_ARCH_X86_64 // Optimizations assume at least 32 uint8. 
If we have less, we use the naive implementation. if (dim < 32) { return ret_dist_func; } -#ifdef CPU_FEATURES_ARCH_X86_64 - auto features = getCpuOptimizationFeatures(arch_opt); #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { // For uint8 vectors with cosine distance, the extra float for the norm shifts alignment to diff --git a/src/VecSim/spaces/L2/L2_SVE_INT8.h b/src/VecSim/spaces/L2/L2_SVE_INT8.h new file mode 100644 index 000000000..eaad44a53 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_SVE_INT8.h @@ -0,0 +1,89 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "VecSim/spaces/space_includes.h" +#include + +// Aligned step using svptrue_b8() +inline void L2SquareStep(const int8_t *&pVect1, const int8_t *&pVect2, size_t &offset, + svuint32_t &sum, const size_t chunk) { + svbool_t pg = svptrue_b8(); + // Note: Because all the bits are 1, the extention to 16 and 32 bits does not make a difference + // Otherwise, pg should be recalculated for 16 and 32 operations + + svint8_t v1_i8 = svld1_s8(pg, pVect1 + offset); // Load int8 vectors from pVect1 + svint8_t v2_i8 = svld1_s8(pg, pVect2 + offset); // Load int8 vectors from pVect2 + + // The result of svabd can be reinterpreted as uint8 + svuint8_t abs_diff = svreinterpret_u8_s8(svabd_s8_x(pg, v1_i8, v2_i8)); + + sum = svdot_u32(sum, abs_diff, abs_diff); + offset += chunk; // Move to the next set of int8 elements +} + +template +float INT8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + const int8_t *pVect1 = reinterpret_cast(pVect1v); + const int8_t *pVect2 = reinterpret_cast(pVect2v); + + // number of uint8 per SVE register (we use uint accumulators) + const size_t vl = svcntb(); + const size_t chunk_size = 4 * vl; + svbool_t all = svptrue_b8(); + + // Each L2SquareStep adds maximum (2^8)^2 = 2^16 + // Therefor, on a single accumulator, we can perform 2^16 steps before overflowing + // That scenario will happen only is the dimension of the vector is larger than 16*4*2^16 = 2^22 + // (16 uint8 in 1 SVE register) * (4 accumulators) * (2^16 steps) + // We can safely assume that the dimension is smaller than that + // So using uint32_t is safe + + svuint32_t sum0 = svdup_u32(0); + svuint32_t sum1 = svdup_u32(0); + svuint32_t sum2 = svdup_u32(0); + svuint32_t sum3 = svdup_u32(0); + + size_t offset = 0; + size_t num_main_blocks = dimension / chunk_size; + + for (size_t i = 0; i < num_main_blocks; ++i) { + L2SquareStep(pVect1, pVect2, offset, sum0, vl); + L2SquareStep(pVect1, pVect2, offset, sum1, vl); + L2SquareStep(pVect1, pVect2, offset, sum2, vl); + L2SquareStep(pVect1, pVect2, offset, sum3, vl); + } + + if constexpr (additional_steps > 0) { + if constexpr (additional_steps >= 1) { + L2SquareStep(pVect1, pVect2, offset, sum0, vl); + } + if constexpr (additional_steps >= 2) { + L2SquareStep(pVect1, pVect2, offset, sum1, vl); + } + if constexpr (additional_steps >= 3) { + L2SquareStep(pVect1, pVect2, offset, sum2, vl); + } + } + + if constexpr (partial_chunk) { + + svbool_t pg = svwhilelt_b8_u64(offset, dimension); + + svint8_t v1_i8 = svld1_s8(pg, pVect1 + offset); // Load int8 vectors from pVect1 + svint8_t v2_i8 = svld1_s8(pg, pVect2 + offset); // Load int8 vectors from pVect2 + + // The result of svabd can be reinterpreted as uint8 + svuint8_t abs_diff = svreinterpret_u8_s8(svabd_s8_x(pg, v1_i8, 
v2_i8)); + + // Can sum with taking into account pg because svld1 will set inactive lanes to 0 + sum3 = svdot_u32(sum3, abs_diff, abs_diff); + } + + sum0 = svadd_u32_x(all, sum0, sum1); + sum2 = svadd_u32_x(all, sum2, sum3); + svuint32_t sum_all = svadd_u32_x(all, sum0, sum2); + return svaddv_u32(svptrue_b32(), sum_all); +} diff --git a/src/VecSim/spaces/L2/L2_SVE_UINT8.h b/src/VecSim/spaces/L2/L2_SVE_UINT8.h new file mode 100644 index 000000000..bf56d9ad1 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_SVE_UINT8.h @@ -0,0 +1,86 @@ +/* + *Copyright Redis Ltd. 2021 - present + *Licensed under your choice of the Redis Source Available License 2.0 (RSALv2) or + *the Server Side Public License v1 (SSPLv1). + */ + +#include "VecSim/spaces/space_includes.h" +#include + +// Aligned step using svptrue_b8() +inline void L2SquareStep(const uint8_t *&pVect1, const uint8_t *&pVect2, size_t &offset, + svuint32_t &sum, const size_t chunk) { + svbool_t pg = svptrue_b8(); + // Note: Because all the bits are 1, the extention to 16 and 32 bits does not make a difference + // Otherwise, pg should be recalculated for 16 and 32 operations + + svuint8_t v1_ui8 = svld1_u8(pg, pVect1 + offset); // Load uint8 vectors from pVect1 + svuint8_t v2_ui8 = svld1_u8(pg, pVect2 + offset); // Load uint8 vectors from pVect2 + + svuint8_t abs_diff = svabd_u8_x(pg, v1_ui8, v2_ui8); + + sum = svdot_u32(sum, abs_diff, abs_diff); + + offset += chunk; // Move to the next set of uint8 elements +} + +template +float UINT8_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + const uint8_t *pVect1 = reinterpret_cast(pVect1v); + const uint8_t *pVect2 = reinterpret_cast(pVect2v); + + // number of uint8 per SVE register + const size_t vl = svcntb(); + const size_t chunk_size = 4 * vl; + svbool_t all = svptrue_b8(); + + // Each L2SquareStep adds maximum (2^8)^2 = 2^16 + // Therefor, on a single accumulator, we can perform 2^16 steps before overflowing + // That scenario will happen only is the dimension of the vector is larger than 16*4*2^16 = 2^22 + // (16 uint8 in 1 SVE register) * (4 accumulators) * (2^16 steps) + // We can safely assume that the dimension is smaller than that + // So using uint32_t is safe + + svuint32_t sum0 = svdup_u32(0); + svuint32_t sum1 = svdup_u32(0); + svuint32_t sum2 = svdup_u32(0); + svuint32_t sum3 = svdup_u32(0); + + size_t offset = 0; + size_t num_main_blocks = dimension / chunk_size; + + for (size_t i = 0; i < num_main_blocks; ++i) { + L2SquareStep(pVect1, pVect2, offset, sum0, vl); + L2SquareStep(pVect1, pVect2, offset, sum1, vl); + L2SquareStep(pVect1, pVect2, offset, sum2, vl); + L2SquareStep(pVect1, pVect2, offset, sum3, vl); + } + + if constexpr (additional_steps > 0) { + if constexpr (additional_steps >= 1) { + L2SquareStep(pVect1, pVect2, offset, sum0, vl); + } + if constexpr (additional_steps >= 2) { + L2SquareStep(pVect1, pVect2, offset, sum1, vl); + } + if constexpr (additional_steps >= 3) { + L2SquareStep(pVect1, pVect2, offset, sum2, vl); + } + } + + if constexpr (partial_chunk) { + + svbool_t pg = svwhilelt_b8_u64(offset, dimension); + svuint8_t v1_ui8 = svld1_u8(pg, pVect1 + offset); // Load uint8 vectors from pVect1 + svuint8_t v2_ui8 = svld1_u8(pg, pVect2 + offset); // Load uint8 vectors from pVect2 + + svuint8_t abs_diff = svabd_u8_x(pg, v1_ui8, v2_ui8); + + sum3 = svdot_u32(sum3, abs_diff, abs_diff); + } + + sum0 = svadd_u32_x(all, sum0, sum1); + sum2 = svadd_u32_x(all, sum2, sum3); + svuint32_t sum_all = svadd_u32_x(all, sum0, sum2); + return 
svaddv_u32(svptrue_b32(), sum_all); +} diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 7acf2eb6d..56bee3216 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -267,12 +267,26 @@ dist_func_t L2_INT8_GetDistFunc(size_t dim, unsigned char *alignment, con } dist_func_t ret_dist_func = INT8_L2Sqr; + + auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_INT8_L2_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_INT8_L2_implementation_SVE(dim); + } +#endif +#endif +#ifdef CPU_FEATURES_ARCH_X86_64 // Optimizations assume at least 32 int8. If we have less, we use the naive implementation. if (dim < 32) { return ret_dist_func; } -#ifdef CPU_FEATURES_ARCH_X86_64 - auto features = getCpuOptimizationFeatures(arch_opt); #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { if (dim % 32 == 0) // no point in aligning if we have an offsetting residual @@ -296,8 +310,22 @@ dist_func_t L2_UINT8_GetDistFunc(size_t dim, unsigned char *alignment, if (dim < 32) { return ret_dist_func; } -#ifdef CPU_FEATURES_ARCH_X86_64 + auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_UINT8_L2_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_UINT8_L2_implementation_SVE(dim); + } +#endif +#endif +#ifdef CPU_FEATURES_ARCH_X86_64 #ifdef OPT_AVX512_F_BW_VL_VNNI if (features.avx512f && features.avx512bw && features.avx512vl && features.avx512vnni) { if (dim % 32 == 0) // no point in aligning if we have an offsetting residual diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp index e3ebc5209..d0ec5e514 100644 --- a/src/VecSim/spaces/functions/SVE.cpp +++ b/src/VecSim/spaces/functions/SVE.cpp @@ -15,6 +15,12 @@ #include "VecSim/spaces/IP/IP_SVE_FP64.h" #include "VecSim/spaces/L2/L2_SVE_FP64.h" +#include "VecSim/spaces/L2/L2_SVE_INT8.h" +#include "VecSim/spaces/IP/IP_SVE_INT8.h" + +#include "VecSim/spaces/L2/L2_SVE_UINT8.h" +#include "VecSim/spaces/IP/IP_SVE_UINT8.h" + namespace spaces { #include "implementation_chooser.h" @@ -52,6 +58,42 @@ dist_func_t Choose_FP64_L2_implementation_SVE(size_t dim) { return ret_dist_func; } +dist_func_t Choose_INT8_L2_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, INT8_L2SqrSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_INT8_IP_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, INT8_InnerProductSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_INT8_Cosine_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, INT8_CosineSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_UINT8_L2_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, UINT8_L2SqrSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_UINT8_IP_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, UINT8_InnerProductSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_UINT8_Cosine_implementation_SVE(size_t dim) { + dist_func_t 
ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, UINT8_CosineSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h index 35d1ffe4c..e06214a11 100644 --- a/src/VecSim/spaces/functions/SVE.h +++ b/src/VecSim/spaces/functions/SVE.h @@ -19,4 +19,16 @@ dist_func_t Choose_FP16_L2_implementation_SVE(size_t dim); dist_func_t Choose_FP64_IP_implementation_SVE(size_t dim); dist_func_t Choose_FP64_L2_implementation_SVE(size_t dim); +dist_func_t Choose_INT8_L2_implementation_SVE(size_t dim); + +dist_func_t Choose_INT8_IP_implementation_SVE(size_t dim); + +dist_func_t Choose_INT8_Cosine_implementation_SVE(size_t dim); + +dist_func_t Choose_UINT8_L2_implementation_SVE(size_t dim); + +dist_func_t Choose_UINT8_IP_implementation_SVE(size_t dim); + +dist_func_t Choose_UINT8_Cosine_implementation_SVE(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp index c52c99da9..57f649a6c 100644 --- a/src/VecSim/spaces/functions/SVE2.cpp +++ b/src/VecSim/spaces/functions/SVE2.cpp @@ -14,6 +14,10 @@ #include "VecSim/spaces/IP/IP_SVE_FP64.h" #include "VecSim/spaces/L2/L2_SVE_FP64.h" +#include "VecSim/spaces/L2/L2_SVE_INT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_INT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE namespace spaces { @@ -52,6 +56,42 @@ dist_func_t Choose_FP64_L2_implementation_SVE2(size_t dim) { return ret_dist_func; } +dist_func_t Choose_INT8_L2_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, INT8_L2SqrSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_INT8_IP_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, INT8_InnerProductSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_INT8_Cosine_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, INT8_CosineSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_UINT8_L2_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, UINT8_L2SqrSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_UINT8_IP_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, UINT8_InnerProductSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_UINT8_Cosine_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, UINT8_CosineSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h index 67e36dc20..d9f89dfd0 100644 --- a/src/VecSim/spaces/functions/SVE2.h +++ b/src/VecSim/spaces/functions/SVE2.h @@ -19,4 +19,16 @@ dist_func_t Choose_FP16_L2_implementation_SVE2(size_t dim); dist_func_t Choose_FP64_IP_implementation_SVE2(size_t dim); dist_func_t Choose_FP64_L2_implementation_SVE2(size_t dim); +dist_func_t Choose_INT8_L2_implementation_SVE2(size_t dim); + +dist_func_t 
Choose_INT8_IP_implementation_SVE2(size_t dim);
+
+dist_func_t<float> Choose_INT8_Cosine_implementation_SVE2(size_t dim);
+
+dist_func_t<float> Choose_UINT8_L2_implementation_SVE2(size_t dim);
+
+dist_func_t<float> Choose_UINT8_IP_implementation_SVE2(size_t dim);
+
+dist_func_t<float> Choose_UINT8_Cosine_implementation_SVE2(size_t dim);
+
 } // namespace spaces
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp
index 11eb0908b..f39ce1100 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_int8.cpp
@@ -51,6 +51,20 @@ INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, AVX512F_BW
 #endif // x86_64
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features;
+#ifdef OPT_SVE2
+bool sve2_supported = opt.sve2; // Check for SVE2 support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, SVE2, 32, sve2_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, SVE2, 32, sve2_supported);
+#endif
+#ifdef OPT_SVE
+bool sve_supported = opt.sve; // Check for SVE support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_INT8, INT8, SVE, 32, sve_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_INT8, INT8, SVE, 32, sve_supported);
+#endif
+#endif
+
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, InnerProduct, 32);
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, Cosine, 32);
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_INT8, INT8, L2Sqr, 32);
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_uint8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_uint8.cpp
index 248259165..71ab1d6e8 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_uint8.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_uint8.cpp
@@ -51,6 +51,20 @@ INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_UINT8, UINT8, AVX512F_
 #endif // x86_64
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features;
+#ifdef OPT_SVE2
+bool sve2_supported = opt.sve2; // Check for SVE2 support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_UINT8, UINT8, SVE2, 32, sve2_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_UINT8, UINT8, SVE2, 32, sve2_supported);
+#endif
+#ifdef OPT_SVE
+bool sve_supported = opt.sve; // Check for SVE support
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_Integers_UINT8, UINT8, SVE, 32, sve_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_Integers_UINT8, UINT8, SVE, 32, sve_supported);
+#endif
+#endif
+
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_UINT8, UINT8, InnerProduct, 32);
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_UINT8, UINT8, Cosine, 32);
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_Integers_UINT8, UINT8, L2Sqr, 32);
diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 19487bc8d..c809e14ca 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -1304,6 +1304,30 @@ TEST_P(INT8SpacesOptimizationTest, INT8L2SqrTest) {
         optimization.avx512f = optimization.avx512bw = optimization.avx512vl =
             optimization.avx512vnni = 0;
     }
+#endif
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+
ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve2 flag as well, so we'll choose the next option (default). + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_L2_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). + optimization.sve = 0; + } #endif unsigned char alignment = 0; arch_opt_func = L2_INT8_GetDistFunc(dim, &alignment, &optimization); @@ -1340,6 +1364,30 @@ TEST_P(INT8SpacesOptimizationTest, INT8InnerProductTest) { optimization.avx512f = optimization.avx512bw = optimization.avx512vl = optimization.avx512vnni = 0; } +#endif +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_IP_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_IP_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). + optimization.sve = 0; + } #endif unsigned char alignment = 0; arch_opt_func = IP_INT8_GetDistFunc(dim, &alignment, &optimization); @@ -1377,6 +1425,30 @@ TEST_P(INT8SpacesOptimizationTest, INT8CosineTest) { optimization.avx512f = optimization.avx512bw = optimization.avx512vl = optimization.avx512vnni = 0; } +#endif +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_Cosine_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_INT8_Cosine_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). 
+ optimization.sve = 0; + } #endif unsigned char alignment = 0; arch_opt_func = Cosine_INT8_GetDistFunc(dim, &alignment, &optimization); @@ -1418,6 +1490,30 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8L2SqrTest) { optimization.avx512f = optimization.avx512bw = optimization.avx512vl = optimization.avx512vnni = 0; } +#endif +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = L2_UINT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_UINT8_L2_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve2 flag as well, so we'll choose the next option (default). + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = L2_UINT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_UINT8_L2_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). + optimization.sve = 0; + } #endif unsigned char alignment = 0; arch_opt_func = L2_UINT8_GetDistFunc(dim, &alignment, &optimization); @@ -1454,6 +1550,30 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8InnerProductTest) { optimization.avx512f = optimization.avx512bw = optimization.avx512vl = optimization.avx512vnni = 0; } +#endif +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = IP_UINT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_UINT8_IP_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve2 flag as well, so we'll choose the next option (default). + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = IP_UINT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_UINT8_IP_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). + optimization.sve = 0; + } #endif unsigned char alignment = 0; arch_opt_func = IP_UINT8_GetDistFunc(dim, &alignment, &optimization); @@ -1477,6 +1597,30 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8CosineTest) { dist_func_t arch_opt_func; float baseline = UINT8_Cosine(v1, v2, dim); +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = Cosine_UINT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_UINT8_Cosine_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve2 flag as well, so we'll choose the next option (default). 
+ optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = Cosine_UINT8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_UINT8_Cosine_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; + // Unset sve flag as well, so we'll choose the next option (default). + optimization.sve = 0; + } +#endif #ifdef OPT_AVX512_F_BW_VL_VNNI if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && optimization.avx512vnni) { @@ -1492,12 +1636,90 @@ TEST_P(UINT8SpacesOptimizationTest, UINT8CosineTest) { optimization.avx512vnni = 0; } #endif + unsigned char alignment = 0; arch_opt_func = Cosine_UINT8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, UINT8_Cosine) << "Unexpected distance function chosen for dim " << dim; ASSERT_EQ(baseline, arch_opt_func(v1, v2, dim)) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } +TEST_P(UINT8SpacesOptimizationTest, UINT8_full_range_test) { + auto optimization = getCpuOptimizationFeatures(); + constexpr size_t dim = 512; + + uint8_t v1[dim + sizeof(float)]; + uint8_t v2[dim + sizeof(float)]; + + // v1: 0..255 followed by 255..0 + for (size_t i = 0; i < 256; i++) { + v1[i] = static_cast(i); + v1[256 + i] = static_cast(255 - i); + } + + // v2: 255..0 followed by 0..255 + for (size_t i = 0; i < 256; i++) { + v2[i] = static_cast(255 - i); + v2[256 + i] = static_cast(i); + } + + // write the norm at the end of the vector + *(float *)(v1 + dim) = test_utils::integral_compute_norm(v1, dim); + *(float *)(v2 + dim) = test_utils::integral_compute_norm(v2, dim); + + float baseline_l2 = UINT8_L2Sqr(v1, v2, dim); + float baseline_ip = UINT8_InnerProduct(v1, v2, dim); + float baseline_cosine = UINT8_Cosine(v1, v2, dim); + + dist_func_t arch_opt_func; + +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + + arch_opt_func = Choose_UINT8_L2_implementation_SVE2(dim); + ASSERT_EQ(baseline_l2, arch_opt_func(v1, v2, dim)) << "L2 SVE2 with dim " << dim; + arch_opt_func = Choose_UINT8_IP_implementation_SVE2(dim); + ASSERT_EQ(baseline_ip, arch_opt_func(v1, v2, dim)) << "IP SVE2 with dim " << dim; + arch_opt_func = Choose_UINT8_Cosine_implementation_SVE2(dim); + ASSERT_EQ(baseline_cosine, arch_opt_func(v1, v2, dim)) << "Cosine SVE2 with dim " << dim; + + // Unset sve2 flag as well, so we'll choose the next option (default). + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + + arch_opt_func = Choose_UINT8_L2_implementation_SVE(dim); + ASSERT_EQ(baseline_l2, arch_opt_func(v1, v2, dim)) << "L2 SVE with dim " << dim; + arch_opt_func = Choose_UINT8_IP_implementation_SVE(dim); + ASSERT_EQ(baseline_ip, arch_opt_func(v1, v2, dim)) << "IP SVE with dim " << dim; + arch_opt_func = Choose_UINT8_Cosine_implementation_SVE(dim); + ASSERT_EQ(baseline_cosine, arch_opt_func(v1, v2, dim)) << "Cosine SVE with dim " << dim; + + // Unset sve flag as well, so we'll choose the next option (default). 
+ optimization.sve = 0; + } +#endif +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vl && + optimization.avx512vnni) { + unsigned char alignment = 0; + + arch_opt_func = Choose_UINT8_L2_implementation_AVX512F_BW_VL_VNNI(dim); + ASSERT_EQ(baseline_l2, arch_opt_func(v1, v2, dim)) << "L2 AVX512 with dim " << dim; + arch_opt_func = Choose_UINT8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + ASSERT_EQ(baseline_ip, arch_opt_func(v1, v2, dim)) << "IP AVX512 with dim " << dim; + arch_opt_func = Choose_UINT8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + ASSERT_EQ(baseline_cosine, arch_opt_func(v1, v2, dim)) << "Cosine AVX512 with dim " << dim; + + // Unset optimizations flag, so we'll choose the next optimization. + optimization.avx512f = optimization.avx512bw = optimization.avx512vl = + optimization.avx512vnni = 0; + } +#endif +} INSTANTIATE_TEST_SUITE_P(UINT8OptFuncs, UINT8SpacesOptimizationTest, testing::Range(32UL, 64 * 2UL + 1));
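
Reviewer note: for sanity-checking the new kernels on machines without SVE, below is a minimal scalar sketch of what the int8 inner-product and cosine distances are expected to compute, assuming the layout the tests use (dim int8 elements followed by a float holding the vector's L2 norm). The helper names (`ref_int8_ip_distance`, `ref_int8_cosine_distance`) are hypothetical and not part of this PR.

```cpp
// Scalar reference for the int8 SVE kernels above (hypothetical helper names).
#include <cstddef>
#include <cstdint>
#include <cstring>

static float ref_int8_ip_distance(const void *a, const void *b, size_t dim) {
    const int8_t *va = static_cast<const int8_t *>(a);
    const int8_t *vb = static_cast<const int8_t *>(b);
    int32_t dot = 0; // each |a_i * b_i| <= 2^14, so int32 is safe for any realistic dim
    for (size_t i = 0; i < dim; i++) {
        dot += static_cast<int32_t>(va[i]) * static_cast<int32_t>(vb[i]);
    }
    return 1.0f - static_cast<float>(dot); // same convention as INT8_InnerProductSIMD_SVE
}

static float ref_int8_cosine_distance(const void *a, const void *b, size_t dim) {
    const int8_t *va = static_cast<const int8_t *>(a);
    const int8_t *vb = static_cast<const int8_t *>(b);
    int32_t dot = 0;
    for (size_t i = 0; i < dim; i++) {
        dot += static_cast<int32_t>(va[i]) * static_cast<int32_t>(vb[i]);
    }
    float norm_a, norm_b; // the norms are stored as floats right after the int8 payload
    std::memcpy(&norm_a, va + dim, sizeof(float));
    std::memcpy(&norm_b, vb + dim, sizeof(float));
    return 1.0f - static_cast<float>(dot) / (norm_a * norm_b);
}
```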
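Reviewer note: the template parameters `<partial_chunk, additional_steps>` are assumed to be derived from `dim` and `svcntb()` by the `CHOOSE_SVE_IMPLEMENTATION` macro roughly as sketched here; this only illustrates the arithmetic, it is not the macro itself, and `sve_dispatch` is a hypothetical name.

```cpp
// How dim is assumed to map onto the main loop, the 0-3 extra full vectors,
// and the predicated tail, given vl = svcntb() (int8 lanes per SVE register).
#include <cstddef>
#include <cstdio>

struct SveDispatch {
    size_t main_chunks;             // iterations of the 4-accumulator main loop
    unsigned char additional_steps; // 0-3 remaining full SVE vectors
    bool partial_chunk;             // true when a predicated tail load is needed
};

static SveDispatch sve_dispatch(size_t dim, size_t vl) {
    const size_t chunk = 4 * vl;
    SveDispatch d;
    d.main_chunks = dim / chunk;
    d.additional_steps = static_cast<unsigned char>((dim % chunk) / vl);
    d.partial_chunk = (dim % vl) != 0;
    return d;
}

int main() {
    // Example: dim = 100 with 256-bit SVE (vl = 32 int8 lanes):
    // 0 main chunks, 3 full vectors (96 elements), plus a 4-element predicated tail.
    SveDispatch d = sve_dispatch(100, 32);
    std::printf("%zu %u %d\n", d.main_chunks, static_cast<unsigned>(d.additional_steps),
                static_cast<int>(d.partial_chunk));
    return 0;
}
```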
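Reviewer note: the L2 kernels rely on |a - b| fitting in a uint8 even for int8 inputs (at most 255), which is why `svabd` followed by an unsigned dot product is used; the squared difference then fits comfortably in 32-bit lanes for realistic dimensions. A scalar sketch of the same computation, with a hypothetical helper name:

```cpp
// Scalar reference mirroring the abs-diff-then-square trick of the SVE L2 kernels.
#include <cstddef>
#include <cstdint>
#include <cstdlib>

static float ref_int8_l2_sqr(const void *a, const void *b, size_t dim) {
    const int8_t *va = static_cast<const int8_t *>(a);
    const int8_t *vb = static_cast<const int8_t *>(b);
    uint32_t sum = 0;
    for (size_t i = 0; i < dim; i++) {
        // |va[i] - vb[i]| <= 255 (e.g. 127 - (-128)), so it fits in a uint8
        uint32_t diff = static_cast<uint32_t>(std::abs(int(va[i]) - int(vb[i])));
        sum += diff * diff; // diff * diff <= 65025
    }
    return static_cast<float>(sum);
}
```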