diff --git a/src/VecSim/spaces/IP/IP.cpp b/src/VecSim/spaces/IP/IP.cpp
index 1b0d2db24..7aeee42d7 100644
--- a/src/VecSim/spaces/IP/IP.cpp
+++ b/src/VecSim/spaces/IP/IP.cpp
@@ -16,39 +16,57 @@ using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
 using sq8 = vecsim_types::sq8;

-float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension,
-                                 float min_val, float delta) {
-    float res = 0;
-    for (size_t i = 0; i < dimension; i++) {
-        float dequantized_V2 = (pVect2v[i] * delta + min_val);
-        res += pVect1v[i] * dequantized_V2;
-    }
-    return res;
-}
-
+/*
+ * Optimized asymmetric SQ8 inner product using algebraic identity:
+ * IP(x, y) = Σ(x_i * y_i)
+ *          ≈ Σ((min + delta * q_i) * y_i)
+ *          = min * Σy_i + delta * Σ(q_i * y_i)
+ *          = min * y_sum + delta * quantized_dot_product
+ *
+ * Uses 4x loop unrolling with multiple accumulators for ILP.
+ * pVect1 is a vector of float32, pVect2 is a quantized uint8_t vector.
+ */
 float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const auto *pVect1 = static_cast<const float *>(pVect1v);
     const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
-    // pVect2 is a vector of uint8_t, so we need to de-quantize it, normalize it and then multiply
-    // it. it is structured as [quantized values (int8_t * dim)][min_val (float)][delta
-    // (float)]] The last two values are used to dequantize the vector.
-    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
-    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    // Compute inner product with dequantization
-    const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta);
-    return 1.0f - res;
+
+    // Use 4 accumulators for instruction-level parallelism
+    float sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
+
+    // Main loop: process 4 elements per iteration
+    size_t i = 0;
+    size_t dim4 = dimension & ~size_t(3); // dim4 is a multiple of 4
+    for (; i < dim4; i += 4) {
+        sum0 += pVect1[i + 0] * static_cast<float>(pVect2[i + 0]);
+        sum1 += pVect1[i + 1] * static_cast<float>(pVect2[i + 1]);
+        sum2 += pVect1[i + 2] * static_cast<float>(pVect2[i + 2]);
+        sum3 += pVect1[i + 3] * static_cast<float>(pVect2[i + 3]);
+    }
+
+    // Handle remainder (0-3 elements)
+    for (; i < dimension; i++) {
+        sum0 += pVect1[i] * static_cast<float>(pVect2[i]);
+    }
+
+    // Combine accumulators
+    float quantized_dot = (sum0 + sum1) + (sum2 + sum3);
+
+    // Get quantization parameters from stored vector
+    const float *params = reinterpret_cast<const float *>(pVect2 + dimension);
+    const float min_val = params[sq8::MIN_VAL];
+    const float delta = params[sq8::DELTA];
+
+    // Get precomputed y_sum from query blob (stored after the dim floats)
+    const float y_sum = pVect1[dimension + sq8::SUM_QUERY];
+
+    // Apply formula: IP = min * y_sum + delta * Σ(q_i * y_i)
+    const float ip = min_val * y_sum + delta * quantized_dot;
+    return 1.0f - ip;
 }

 float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    const auto *pVect1 = static_cast<const float *>(pVect1v);
-    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
-
-    // Get quantization parameters
-    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
-    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    // Compute inner product with dequantization
-    const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta);
-    return 1.0f - res;
+    return SQ8_InnerProduct(pVect1v, pVect2v, dimension);
 }

 // SQ8-to-SQ8: Common inner product implementation that returns the raw inner product value
diff --git
a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h
index f8333e6e8..0aae41a04 100644
--- a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h
+++ b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h
@@ -6,91 +6,96 @@
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
  */
+#pragma once
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
+#include "VecSim/types/sq8.h"
+using sq8 = vecsim_types::sq8;
+/*
+ * Optimized asymmetric SQ8 inner product using algebraic identity:
+ *
+ * IP(x, y) = Σ(x_i * y_i)
+ *          ≈ Σ((min + delta * q_i) * y_i)
+ *          = min * Σy_i + delta * Σ(q_i * y_i)
+ *          = min * y_sum + delta * quantized_dot_product
+ *
+ * where y_sum = Σy_i is precomputed and stored in the query blob.
+ * This avoids dequantization in the hot loop - we only compute Σ(q_i * y_i).
+ *
+ * This version uses FMA instructions for better performance.
+ */
+
+// Helper: compute Σ(q_i * y_i) for 8 elements using FMA (no dequantization)
 static inline void InnerProductStepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2,
-                                           __m256 &sum256, const __m256 &min_val_vec,
-                                           const __m256 &delta_vec) {
-    // Load 8 float elements from pVect1
+                                           __m256 &sum256) {
+    // Load 8 float elements from query
     __m256 v1 = _mm256_loadu_ps(pVect1);
     pVect1 += 8;

-    // Load 8 uint8 elements from pVect2, convert to int32, then to float
-    __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+    // Load 8 uint8 elements and convert to float
+    __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2));
     pVect2 += 8;

-    // Zero-extend uint8 to int32
     __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
-
-    // Convert int32 to float
     __m256 v2_f = _mm256_cvtepi32_ps(v2_256);

-    // Dequantize and compute dot product in one step using FMA
-    // (val * delta) + min_val -> v2_dequant
-    // sum256 += v1 * v2_dequant
-    // Using FMA: sum256 = v1 * v2_dequant + sum256
-
-    // First, compute v2_dequant = v2_f * delta_vec + min_val_vec
-    __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);
-
-    // Then, compute sum256 += v1 * v2_dequant using FMA
-    sum256 = _mm256_fmadd_ps(v1, v2_dequant, sum256);
+    // Accumulate q_i * y_i using FMA (no dequantization!)
+    sum256 = _mm256_fmadd_ps(v2_f, v1, sum256);
 }

 template <unsigned char residual> // 0..15
 float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
     const float *pVect1 = static_cast<const float *>(pVect1v);
-    // pVect2 is a quantized uint8_t vector
     const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);

     const float *pEnd1 = pVect1 + dimension;

-    // Get dequantization parameters from the end of quantized vector
-    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
-    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    // Create broadcast vectors for SIMD operations
-    __m256 min_val_vec = _mm256_set1_ps(min_val);
-    __m256 delta_vec = _mm256_set1_ps(delta);
-
+    // Initialize sum accumulator for Σ(q_i * y_i)
     __m256 sum256 = _mm256_setzero_ps();

-    // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one
-    // 16-float block, so mask loading is guaranteed to be safe.
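The scalar kernel in IP.cpp above and the SIMD kernels in this and the following headers all rest on the same algebraic identity, so it is worth sanity-checking it once in isolation. The following standalone sketch is not part of this diff and uses illustrative values only: it computes the inner product the old way (dequantize every element) and the new way (min * y_sum + delta * Σ(q_i * y_i)) and prints both; in the real kernels min/delta live at the end of the stored blob and y_sum at the end of the query blob, but here they are plain locals for clarity.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Standalone sketch (illustrative only): verifies the SQ8 identity used by the kernels.
int main() {
    const size_t dim = 8;
    std::vector<float> query = {0.5f, -1.0f, 2.0f, 0.25f, 1.5f, -0.75f, 0.0f, 3.0f};
    std::vector<uint8_t> codes = {0, 17, 255, 42, 128, 3, 200, 77};
    const float min_val = -1.0f;
    const float delta = 2.0f / 255.0f;

    // Old path: dequantize every element, then accumulate the dot product.
    float reference = 0.0f;
    for (size_t i = 0; i < dim; i++)
        reference += query[i] * (min_val + delta * static_cast<float>(codes[i]));

    // New path: accumulate only q_i * y_i, fold min/delta in once at the end.
    float y_sum = 0.0f, quantized_dot = 0.0f;
    for (size_t i = 0; i < dim; i++) {
        y_sum += query[i];
        quantized_dot += static_cast<float>(codes[i]) * query[i];
    }
    const float optimized = min_val * y_sum + delta * quantized_dot;

    std::printf("reference=%f optimized=%f\n", reference, optimized);
    return 0;
}

Distributing y_i over (min + delta * q_i) is exact algebraically, so the only difference between the two paths is floating-point rounding order, which is what the relaxed ASSERT_NEAR tolerances in the updated tests below allow for.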
+ // Handle residual elements first (0-7 elements) if constexpr (residual % 8) { __mmask8 constexpr mask = (1 << (residual % 8)) - 1; __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); pVect1 += residual % 8; - // Load quantized values and dequantize - __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + // Load uint8 elements and convert to float + __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast(pVect2)); pVect2 += residual % 8; - // Zero-extend uint8 to int32 __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); - - // Convert int32 to float __m256 v2_f = _mm256_cvtepi32_ps(v2_256); - // Dequantize using FMA: (val * delta) + min_val - __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec); - - // Compute dot product with masking - sum256 = _mm256_mul_ps(v1, v2_dequant); + // Compute q_i * y_i (no dequantization) + sum256 = _mm256_mul_ps(v1, v2_f); } - // If the reminder is >=8, have another step of 8 floats + // If the residual is >=8, have another step of 8 floats if constexpr (residual >= 8) { - InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + InnerProductStepSQ8_FMA(pVect1, pVect2, sum256); } - // We dealt with the residual part. We are left with some multiple of 16 floats. - // In each iteration we calculate 16 floats = 512 bits. + // Process remaining full chunks of 16 elements (2x8) + // Using do-while since dim > 16 guarantees at least one iteration do { - InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); - InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec); + InnerProductStepSQ8_FMA(pVect1, pVect2, sum256); + InnerProductStepSQ8_FMA(pVect1, pVect2, sum256); } while (pVect1 < pEnd1); - return my_mm256_reduce_add_ps(sum256); + // Reduce to get Σ(q_i * y_i) + float quantized_dot = my_mm256_reduce_add_ps(sum256); + + // Get quantization parameters from stored vector (after quantized data) + const uint8_t *pVect2Base = static_cast(pVect2v); + const float *params2 = reinterpret_cast(pVect2Base + dimension); + const float min_val = params2[sq8::MIN_VAL]; + const float delta = params2[sq8::DELTA]; + + // Get precomputed y_sum from query blob (stored after the dim floats) + const float y_sum = static_cast(pVect1v)[dimension + sq8::SUM_QUERY]; + + // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i) + return min_val * y_sum + delta * quantized_dot; } template // 0..15 @@ -100,7 +105,6 @@ float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, template // 0..15 float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { - // Calculate inner product using common implementation with normalization - float ip = SQ8_InnerProductImp_FMA(pVect1v, pVect2v, dimension); - return 1.0f - ip; + // Cosine distance = 1 - IP (vectors are pre-normalized) + return SQ8_InnerProductSIMD16_AVX2_FMA(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h index 203e32fad..dca88696d 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h @@ -6,85 +6,96 @@ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). 
*/ +#pragma once #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/AVX_utils.h" +#include "VecSim/types/sq8.h" -static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256, - const __m256 &min_val_vec, const __m256 &delta_vec) { - // Load 8 float elements from pVect1 +using sq8 = vecsim_types::sq8; + +/* + * Optimized asymmetric SQ8 inner product using algebraic identity: + * + * IP(x, y) = Σ(x_i * y_i) + * ≈ Σ((min + delta * q_i) * y_i) + * = min * Σy_i + delta * Σ(q_i * y_i) + * = min * y_sum + delta * quantized_dot_product + * + * where y_sum = Σy_i is precomputed and stored in the query blob. + * This avoids dequantization in the hot loop - we only compute Σ(q_i * y_i). + */ + +// Helper: compute Σ(q_i * y_i) for 8 elements (no dequantization) +static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, + __m256 &sum256) { + // Load 8 float elements from query __m256 v1 = _mm256_loadu_ps(pVect1); pVect1 += 8; - // Load 8 uint8 elements from pVect2, convert to int32, then to float - __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + // Load 8 uint8 elements and convert to float + __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast(pVect2)); pVect2 += 8; - // Zero-extend uint8 to int32 __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); - - // Convert int32 to float __m256 v2_f = _mm256_cvtepi32_ps(v2_256); - // Dequantize: (val * delta) + min_val - __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec); - - // Compute dot product and add to sum - sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2_dequant)); + // Accumulate q_i * y_i (no dequantization!) + // Using mul + add since this is the non-FMA version + sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v2_f, v1)); } template // 0..15 float SQ8_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { const float *pVect1 = static_cast(pVect1v); - // pVect2 is a quantized uint8_t vector const uint8_t *pVect2 = static_cast(pVect2v); const float *pEnd1 = pVect1 + dimension; - // Get dequantization parameters from the end of quantized vector - const float min_val = *reinterpret_cast(pVect2 + dimension); - const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); - // Create broadcast vectors for SIMD operations - __m256 min_val_vec = _mm256_set1_ps(min_val); - __m256 delta_vec = _mm256_set1_ps(delta); - + // Initialize sum accumulator for Σ(q_i * y_i) __m256 sum256 = _mm256_setzero_ps(); - // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one - // 16-float block, so mask loading is guaranteed to be safe. 
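Both AVX2 variants (and the SSE4 kernel further down) share the same control flow: the compile-time residual (dimension % 16) is consumed first, and the main loop is a do-while because, per the comments in the diff, these kernels are only selected for dim > 16, so at least one full 16-element block always remains. Below is a scalar sketch of that structure, with illustrative names and no SIMD; it is not part of the diff and only returns the raw Σ(q_i * y_i), leaving min/delta/y_sum to the caller as shown above.

#include <cstddef>
#include <cstdint>

// Scalar sketch (illustrative only) of the residual-first / do-while structure used by
// the SIMD kernels. `residual` mirrors the template parameter: dimension % 16, known at
// compile time, with dimension > 16 guaranteed by the dispatcher.
template <unsigned char residual> // 0..15
float dot_residual_first(const float *query, const uint8_t *codes, size_t dimension) {
    const float *end = query + dimension;
    float acc = 0.0f;

    // Consume the odd 1..15 leading elements once, so the main loop only ever
    // sees whole 16-element blocks.
    if constexpr (residual > 0) {
        for (size_t i = 0; i < residual; i++)
            acc += query[i] * static_cast<float>(codes[i]);
        query += residual;
        codes += residual;
    }

    // At least one full block remains, so a do-while skips the first bounds check.
    do {
        for (size_t i = 0; i < 16; i++)
            acc += query[i] * static_cast<float>(codes[i]);
        query += 16;
        codes += 16;
    } while (query < end);

    return acc; // Σ(q_i * y_i); min/delta/y_sum are applied afterwards
}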
+ // Handle residual elements first (0-7 elements) if constexpr (residual % 8) { __mmask8 constexpr mask = (1 << (residual % 8)) - 1; __m256 v1 = my_mm256_maskz_loadu_ps(pVect1); pVect1 += residual % 8; - // Load quantized values and dequantize - __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2); + // Load uint8 elements and convert to float + __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast(pVect2)); pVect2 += residual % 8; - // Zero-extend uint8 to int32 __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128); - - // Convert int32 to float __m256 v2_f = _mm256_cvtepi32_ps(v2_256); - // Dequantize: (val * delta) + min_val - __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec); - - // Compute dot product with masking - sum256 = _mm256_mul_ps(v1, v2_dequant); + // Compute q_i * y_i (no dequantization) + sum256 = _mm256_mul_ps(v1, v2_f); } - // If the reminder is >=8, have another step of 8 floats + // If the residual is >=8, have another step of 8 floats if constexpr (residual >= 8) { - InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec); + InnerProductStepSQ8(pVect1, pVect2, sum256); } - // We dealt with the residual part. We are left with some multiple of 16 floats. - // In each iteration we calculate 16 floats = 512 bits. + // Process remaining full chunks of 16 elements (2x8) + // Using do-while since dim > 16 guarantees at least one iteration do { - InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec); - InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec); + InnerProductStepSQ8(pVect1, pVect2, sum256); + InnerProductStepSQ8(pVect1, pVect2, sum256); } while (pVect1 < pEnd1); - return my_mm256_reduce_add_ps(sum256); + // Reduce to get Σ(q_i * y_i) + float quantized_dot = my_mm256_reduce_add_ps(sum256); + + // Get quantization parameters from stored vector (after quantized data) + const uint8_t *pVect2Base = static_cast(pVect2v); + const float *params2 = reinterpret_cast(pVect2Base + dimension); + const float min_val = params2[sq8::MIN_VAL]; + const float delta = params2[sq8::DELTA]; + + // Get precomputed y_sum from query blob (stored after the dim floats) + const float y_sum = static_cast(pVect1v)[dimension + sq8::SUM_QUERY]; + + // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i) + return min_val * y_sum + delta * quantized_dot; } template // 0..15 @@ -95,6 +106,5 @@ float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size template // 0..15 float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { // Calculate inner product using common implementation with normalization - float ip = SQ8_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); - return 1.0f - ip; + return SQ8_InnerProductSIMD16_AVX2(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h new file mode 100644 index 000000000..13dce0c0a --- /dev/null +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). 
+ */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/types/sq8.h" +#include + +using sq8 = vecsim_types::sq8; +/* + * Optimized asymmetric SQ8 inner product using algebraic identity: + * + * IP(x, y) = Σ(x_i * y_i) + * ≈ Σ((min + delta * q_i) * y_i) + * = min * Σy_i + delta * Σ(q_i * y_i) + * = min * y_sum + delta * quantized_dot_product + * + * where y_sum = Σy_i is precomputed and stored in the query blob. + * This avoids dequantization in the hot loop - we only compute Σ(q_i * y_i). + */ + +// Helper: compute Σ(q_i * y_i) for 16 elements +static inline void SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum) { + // Load 16 float elements from query (pVec1) + __m512 v1 = _mm512_loadu_ps(pVec1); + + // Load 16 uint8 elements from quantized vector and convert to float + __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + + // Accumulate q_i * y_i (no dequantization!) + sum = _mm512_fmadd_ps(v2_f, v1, sum); + + pVec1 += 16; + pVec2 += 16; +} + +// Common implementation for both inner product and cosine similarity +template // 0..15 +float SQ8_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) { + const float *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + const float *pEnd1 = pVec1 + dimension; + + // Initialize sum accumulator for Σ(q_i * y_i) + __m512 sum = _mm512_setzero_ps(); + + // Handle residual elements first (0 to 15) + if constexpr (residual > 0) { + __mmask16 mask = (1U << residual) - 1; + + // Load masked float elements from query + __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); + + // Load uint8 elements (safe to load 16 bytes due to padding) + __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); + __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); + __m512 v2_f = _mm512_cvtepi32_ps(v2_512); + + // Compute q_i * y_i with mask (no dequantization) + sum = _mm512_maskz_mul_ps(mask, v2_f, v1); + + pVec1 += residual; + pVec2 += residual; + } + + // Process full chunks of 16 elements + // Using do-while since dim > 16 guarantees at least one iteration + do { + SQ8_InnerProductStep(pVec1, pVec2, sum); + } while (pVec1 < pEnd1); + + // Reduce to get Σ(q_i * y_i) + float quantized_dot = _mm512_reduce_add_ps(sum); + + // Get quantization parameters from stored vector (after quantized data) + // Use the original base pointer since pVec2 has been advanced + const uint8_t *pVec2Base = static_cast(pVec2v); + const float *params2 = reinterpret_cast(pVec2Base + dimension); + const float min_val = params2[sq8::MIN_VAL]; + const float delta = params2[sq8::DELTA]; + + // Get precomputed y_sum from query blob (stored after the dim floats) + // Use the original base pointer since pVec1 has been advanced + const float y_sum = static_cast(pVec1v)[dimension + sq8::SUM_QUERY]; + + // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i) + return min_val * y_sum + delta * quantized_dot; +} + +template // 0..15 +float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, + size_t dimension) { + // The inner product similarity is 1 - ip + return 1.0f - SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); +} + +template // 0..15 +float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, + size_t dimension) { + // Cosine distance = 1 - IP (vectors are pre-normalized) + return SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(pVec1v, 
pVec2v, dimension); +} diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h deleted file mode 100644 index 35ea482fa..000000000 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2006-Present, Redis Ltd. - * All rights reserved. - * - * Licensed under your choice of the Redis Source Available License 2.0 - * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the - * GNU Affero General Public License v3 (AGPLv3). - */ -#pragma once -#include "VecSim/spaces/space_includes.h" -#include -#include - -static inline void SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVec2, __m512 &sum, - const __m512 &min_val_vec, const __m512 &delta_vec) { - // Load 16 float elements from pVec1 - __m512 v1 = _mm512_loadu_ps(pVec1); - - // Load 16 uint8 elements from pVec2 and convert to __m512i - __m128i v2_128 = _mm_loadu_si128((__m128i *)pVec2); - __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); - - // Convert uint8 to float - __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - - // Dequantize: (val * delta) + min_val - __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); - - // Compute dot product and add to sum - sum = _mm512_fmadd_ps(v1, dequantized, sum); - - // Advance pointers - pVec1 += 16; - pVec2 += 16; -} - -// Common implementation for both inner product and cosine similarity -template // 0..15 -float SQ8_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) { - const float *pVec1 = static_cast(pVec1v); - const uint8_t *pVec2 = static_cast(pVec2v); - const float *pEnd1 = pVec1 + dimension; - - // Get dequantization parameters from the end of pVec2 - const float min_val = *reinterpret_cast(pVec2 + dimension); - const float delta = *reinterpret_cast(pVec2 + dimension + sizeof(float)); - - // Create broadcast vectors for SIMD operations - __m512 min_val_vec = _mm512_set1_ps(min_val); - __m512 delta_vec = _mm512_set1_ps(delta); - - // Initialize sum accumulator - __m512 sum = _mm512_setzero_ps(); - - // Deal with remainder first - if constexpr (residual > 0) { - // Handle less than 16 elements - __mmask16 mask = (1U << residual) - 1; - - // Load masked float elements - __m512 v1 = _mm512_maskz_loadu_ps(mask, pVec1); - - // Load full uint8 elements - we know that the first 16 elements are safe to load - __m128i v2_128 = _mm_loadu_si128(reinterpret_cast(pVec2)); - __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128); - __m512 v2_f = _mm512_cvtepi32_ps(v2_512); - - // Dequantize - __m512 dequantized = _mm512_fmadd_ps(v2_f, delta_vec, min_val_vec); - - // Compute dot product - __m512 product = _mm512_mul_ps(v1, dequantized); - - // Apply mask to product and add to sum - sum = _mm512_fmadd_ps(sum, sum, product); - - pVec1 += residual; - pVec2 += residual; - } - - // Process remaining full chunks of 16 elements - do { - SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec, delta_vec); - } while (pVec1 < pEnd1); - - // Return the raw inner product result - return _mm512_reduce_add_ps(sum); -} - -template // 0..15 -float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, - size_t dimension) { - // Calculate inner product using common implementation - float ip = SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); - - // The inner product similarity is 1 - ip - return 1.0f - ip; -} - -template // 0..15 -float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, - size_t dimension) { - // 
Calculate inner product using common implementation with normalization - float ip = SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); - - // The cosine similarity is 1 - ip - return 1.0f - ip; -} diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h index 7c3f27e10..495608762 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h @@ -6,30 +6,40 @@ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). */ +#pragma once #include "VecSim/spaces/space_includes.h" +#include "VecSim/types/sq8.h" #include +using sq8 = vecsim_types::sq8; -static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, float32x4_t &sum, - const float32x4_t &min_val_vec, const float32x4_t &delta_vec) { - // Load 4 float elements from pVect1 +/* + * Optimized asymmetric SQ8 inner product using algebraic identity: + * + * IP(x, y) = Σ(x_i * y_i) + * ≈ Σ((min + delta * q_i) * y_i) + * = min * Σy_i + delta * Σ(q_i * y_i) + * = min * y_sum + delta * quantized_dot_product + * + * where y_sum = Σy_i is precomputed and stored in the query blob. + * This avoids dequantization in the hot loop - we only compute Σ(q_i * y_i). + */ + +// Helper: compute Σ(q_i * y_i) for 4 elements (no dequantization) +static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, + float32x4_t &sum) { + // Load 4 float elements from query float32x4_t v1 = vld1q_f32(pVect1); pVect1 += 4; - // Load 4 uint8 elements from pVect2 + // Load 4 uint8 elements and convert to float uint8x8_t v2_u8 = vld1_u8(pVect2); pVect2 += 4; - // Convert uint8 to uint32 uint32x4_t v2_u32 = vmovl_u16(vget_low_u16(vmovl_u8(v2_u8))); - - // Convert uint32 to float32 float32x4_t v2_f = vcvtq_f32_u32(v2_u32); - // Dequantize: (val * delta) + min_val - float32x4_t v2_dequant = vmlaq_f32(min_val_vec, v2_f, delta_vec); - - // Compute dot product and add to sum - sum = vmlaq_f32(sum, v1, v2_dequant); + // Accumulate q_i * y_i (no dequantization!) 
+ sum = vmlaq_f32(sum, v2_f, v1); } template // 0..15 @@ -37,14 +47,7 @@ float SQ8_InnerProductSIMD16_NEON_IMP(const void *pVect1v, const void *pVect2v, const float *pVect1 = static_cast(pVect1v); const uint8_t *pVect2 = static_cast(pVect2v); - // Get dequantization parameters from the end of quantized vector - const float min_val = *reinterpret_cast(pVect2 + dimension); - const float delta = *reinterpret_cast(pVect2 + dimension + sizeof(float)); - - // Create broadcast vectors for SIMD operations - float32x4_t min_val_vec = vdupq_n_f32(min_val); - float32x4_t delta_vec = vdupq_n_f32(delta); - + // Multiple accumulators for ILP float32x4_t sum0 = vdupq_n_f32(0.0f); float32x4_t sum1 = vdupq_n_f32(0.0f); float32x4_t sum2 = vdupq_n_f32(0.0f); @@ -54,57 +57,68 @@ float SQ8_InnerProductSIMD16_NEON_IMP(const void *pVect1v, const void *pVect2v, // Process 16 elements at a time in the main loop for (size_t i = 0; i < num_of_chunks; i++) { - InnerProductStep(pVect1, pVect2, sum0, min_val_vec, delta_vec); - InnerProductStep(pVect1, pVect2, sum1, min_val_vec, delta_vec); - InnerProductStep(pVect1, pVect2, sum2, min_val_vec, delta_vec); - InnerProductStep(pVect1, pVect2, sum3, min_val_vec, delta_vec); + InnerProductStepSQ8(pVect1, pVect2, sum0); + InnerProductStepSQ8(pVect1, pVect2, sum1); + InnerProductStepSQ8(pVect1, pVect2, sum2); + InnerProductStepSQ8(pVect1, pVect2, sum3); } - // Handle remaining complete 4-float blocks within residual + // Handle remaining complete 4-element blocks within residual if constexpr (residual >= 4) { - InnerProductStep(pVect1, pVect2, sum0, min_val_vec, delta_vec); + InnerProductStepSQ8(pVect1, pVect2, sum0); } if constexpr (residual >= 8) { - InnerProductStep(pVect1, pVect2, sum1, min_val_vec, delta_vec); + InnerProductStepSQ8(pVect1, pVect2, sum1); } if constexpr (residual >= 12) { - InnerProductStep(pVect1, pVect2, sum2, min_val_vec, delta_vec); + InnerProductStepSQ8(pVect1, pVect2, sum2); } // Handle final residual elements (0-3 elements) constexpr size_t final_residual = residual % 4; if constexpr (final_residual > 0) { float32x4_t v1 = vdupq_n_f32(0.0f); - float32x4_t v2_dequant = vdupq_n_f32(0.0f); + float32x4_t v2_f = vdupq_n_f32(0.0f); if constexpr (final_residual >= 1) { v1 = vld1q_lane_f32(pVect1, v1, 0); - float dequant0 = pVect2[0] * delta + min_val; - v2_dequant = vld1q_lane_f32(&dequant0, v2_dequant, 0); + float q0 = static_cast(pVect2[0]); + v2_f = vld1q_lane_f32(&q0, v2_f, 0); } if constexpr (final_residual >= 2) { v1 = vld1q_lane_f32(pVect1 + 1, v1, 1); - float dequant1 = pVect2[1] * delta + min_val; - v2_dequant = vld1q_lane_f32(&dequant1, v2_dequant, 1); + float q1 = static_cast(pVect2[1]); + v2_f = vld1q_lane_f32(&q1, v2_f, 1); } if constexpr (final_residual >= 3) { v1 = vld1q_lane_f32(pVect1 + 2, v1, 2); - float dequant2 = pVect2[2] * delta + min_val; - v2_dequant = vld1q_lane_f32(&dequant2, v2_dequant, 2); + float q2 = static_cast(pVect2[2]); + v2_f = vld1q_lane_f32(&q2, v2_f, 2); } - sum3 = vmlaq_f32(sum3, v1, v2_dequant); + // Compute q_i * y_i (no dequantization) + sum3 = vmlaq_f32(sum3, v2_f, v1); } // Combine all four sum accumulators float32x4_t sum_combined = vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3)); - // Horizontal sum of the 4 elements in the combined NEON register + // Horizontal sum to get Σ(q_i * y_i) float32x2_t sum_halves = vadd_f32(vget_low_f32(sum_combined), vget_high_f32(sum_combined)); float32x2_t summed = vpadd_f32(sum_halves, sum_halves); - float sum = vget_lane_f32(summed, 0); + float quantized_dot = 
vget_lane_f32(summed, 0); + + // Get quantization parameters from stored vector (after quantized data) + const uint8_t *pVect2Base = static_cast(pVect2v); + const float *params2 = reinterpret_cast(pVect2Base + dimension); + const float min_val = params2[sq8::MIN_VAL]; + const float delta = params2[sq8::DELTA]; + + // Get precomputed y_sum from query blob (stored after the dim floats) + const float y_sum = static_cast(pVect1v)[dimension + sq8::SUM_QUERY]; - return sum; + // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i) + return min_val * y_sum + delta * quantized_dot; } template // 0..15 @@ -114,7 +128,6 @@ float SQ8_InnerProductSIMD16_NEON(const void *pVect1v, const void *pVect2v, size template // 0..15 float SQ8_CosineSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { - // Compute inner product with dequantization using the common function - const float res = SQ8_InnerProductSIMD16_NEON_IMP(pVect1v, pVect2v, dimension); - return 1.0f - res; + // Cosine distance = 1 - IP (vectors are pre-normalized) + return SQ8_InnerProductSIMD16_NEON(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h index d5dfc4e80..302c902b4 100644 --- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h @@ -6,95 +6,108 @@ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). */ +#pragma once #include "VecSim/spaces/space_includes.h" -#include -#include +#include "VecSim/types/sq8.h" +using sq8 = vecsim_types::sq8; -static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum_prod, - const __m128 &min_val_vec, const __m128 &delta_vec) { - // Load 4 float elements from pVect1 +/* + * Optimized asymmetric SQ8 inner product using algebraic identity: + * + * IP(x, y) = Σ(x_i * y_i) + * ≈ Σ((min + delta * q_i) * y_i) + * = min * Σy_i + delta * Σ(q_i * y_i) + * = min * y_sum + delta * quantized_dot_product + * + * where y_sum = Σy_i is precomputed and stored in the query blob. + * This avoids dequantization in the hot loop - we only compute Σ(q_i * y_i). + */ + +// Helper: compute Σ(q_i * y_i) for 4 elements (no dequantization) +static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m128 &sum) { + // Load 4 float elements from query __m128 v1 = _mm_loadu_ps(pVect1); pVect1 += 4; - // Load 4 uint8 elements from pVect2, convert to int32, then to float - __m128i v2_i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*(int32_t *)pVect2)); + // Load 4 uint8 elements and convert to float + __m128i v2_i = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*reinterpret_cast(pVect2))); pVect2 += 4; - // Convert int32 to float __m128 v2_f = _mm_cvtepi32_ps(v2_i); - // Dequantize: (val * delta) + min_val - __m128 v2_dequant = _mm_add_ps(_mm_mul_ps(v2_f, delta_vec), min_val_vec); - - // Compute dot product and add to sum - sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2_dequant)); + // Accumulate q_i * y_i (no dequantization!) 
+ // SSE doesn't have FMA, so use mul + add + sum = _mm_add_ps(sum, _mm_mul_ps(v2_f, v1)); } template // 0..15 float SQ8_InnerProductSIMD16_SSE4_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) { const float *pVect1 = static_cast(pVect1v); - const uint8_t *quantized = static_cast(pVect2v); - - // Get dequantization parameters from the end of quantized vector - float min = *(float *)(quantized + dimension); - float delta = *(float *)(quantized + dimension + sizeof(float)); - - // Create broadcast vectors for SIMD operations - __m128 min_val_vec = _mm_set1_ps(min); - __m128 delta_vec = _mm_set1_ps(delta); - + const uint8_t *pVect2 = static_cast(pVect2v); const float *pEnd1 = pVect1 + dimension; + // Initialize sum accumulator for Σ(q_i * y_i) __m128 sum = _mm_setzero_ps(); - // Process residual elements if needed - if constexpr (residual) { - // Handle residual elements (1-3) - if constexpr (residual % 4) { - __m128 v1; - __m128 v2_dequant; - - if constexpr (residual % 4 == 3) { - // Set 3 floats and the last one to 0 - v1 = _mm_set_ps(0.0f, pVect1[2], pVect1[1], pVect1[0]); - - // Dequantize and set 3 values - v2_dequant = _mm_set_ps(0.0f, quantized[2] * delta + min, - quantized[1] * delta + min, quantized[0] * delta + min); - - } else if constexpr (residual % 4 == 2) { - // Set 2 floats and the last two to 0 - v1 = _mm_set_ps(0.0f, 0.0f, pVect1[1], pVect1[0]); - - // Dequantize and set 2 values - v2_dequant = - _mm_set_ps(0.0f, 0.0f, quantized[1] * delta + min, quantized[0] * delta + min); - - } else if constexpr (residual % 4 == 1) { - // Set 1 float and the last three to 0 - v1 = _mm_set_ps(0.0f, 0.0f, 0.0f, pVect1[0]); + // Process residual elements first (1-3 elements) + if constexpr (residual % 4) { + __m128 v1; + __m128 v2_f; + + if constexpr (residual % 4 == 3) { + v1 = _mm_set_ps(0.0f, pVect1[2], pVect1[1], pVect1[0]); + v2_f = _mm_set_ps(0.0f, static_cast(pVect2[2]), static_cast(pVect2[1]), + static_cast(pVect2[0])); + } else if constexpr (residual % 4 == 2) { + v1 = _mm_set_ps(0.0f, 0.0f, pVect1[1], pVect1[0]); + v2_f = _mm_set_ps(0.0f, 0.0f, static_cast(pVect2[1]), + static_cast(pVect2[0])); + } else if constexpr (residual % 4 == 1) { + v1 = _mm_set_ps(0.0f, 0.0f, 0.0f, pVect1[0]); + v2_f = _mm_set_ps(0.0f, 0.0f, 0.0f, static_cast(pVect2[0])); + } - // Dequantize and set 1 value - v2_dequant = _mm_set_ps(0.0f, 0.0f, 0.0f, quantized[0] * delta + min); - } + pVect1 += residual % 4; + pVect2 += residual % 4; - pVect1 += residual % 4; - quantized += residual % 4; - sum = _mm_mul_ps(v1, v2_dequant); - } + // Compute q_i * y_i (no dequantization) + sum = _mm_mul_ps(v1, v2_f); } - // Process 4 elements at a time - while (pVect1 < pEnd1) { - InnerProductStep(pVect1, quantized, sum, min_val_vec, delta_vec); + // Handle remaining residual in chunks of 4 (for residual 4-15) + if constexpr (residual >= 4) { + InnerProductStepSQ8(pVect1, pVect2, sum); + } + if constexpr (residual >= 8) { + InnerProductStepSQ8(pVect1, pVect2, sum); } + if constexpr (residual >= 12) { + InnerProductStepSQ8(pVect1, pVect2, sum); + } + + // Process remaining full chunks of 4 elements + // Using do-while since dim > 16 guarantees at least one iteration + do { + InnerProductStepSQ8(pVect1, pVect2, sum); + } while (pVect1 < pEnd1); - // TmpRes must be 16 bytes aligned. 
+ // Horizontal sum to get Σ(q_i * y_i) float PORTABLE_ALIGN16 TmpRes[4]; _mm_store_ps(TmpRes, sum); - float result = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + float quantized_dot = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3]; + + // Get quantization parameters from stored vector (after quantized data) + const uint8_t *pVect2Base = static_cast(pVect2v); + const float *params2 = reinterpret_cast(pVect2Base + dimension); + const float min_val = params2[sq8::MIN_VAL]; + const float delta = params2[sq8::DELTA]; - return result; + // Get precomputed y_sum from query blob (stored after the dim floats) + const float *pVect1Base = static_cast(pVect1v); + const float y_sum = pVect1Base[dimension + sq8::SUM_QUERY]; + + // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i) + return min_val * y_sum + delta * quantized_dot; } template // 0..15 @@ -104,10 +117,6 @@ float SQ8_InnerProductSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size template // 0..15 float SQ8_CosineSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) { - // Compute inner product with dequantization using the common function - // We need to cast away const for the inner product function, but it doesn't modify the vectors - const float res = SQ8_InnerProductSIMD16_SSE4_IMP(pVect1v, pVect2v, dimension); - - // For cosine, we need to account for the vector norms - return 1.0f - res; + // Cosine distance = 1 - IP (vectors are pre-normalized) + return SQ8_InnerProductSIMD16_SSE4(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index 825e9c501..735f36906 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -6,32 +6,41 @@ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). */ +#pragma once #include "VecSim/spaces/space_includes.h" +#include "VecSim/types/sq8.h" #include -#include -#include -static inline void InnerProductStep(const float *&pVect1, const uint8_t *&pVect2, size_t &offset, - svfloat32_t &sum, const svfloat32_t &min_val_vec, - const svfloat32_t &delta_vec, const size_t chunk) { +using sq8 = vecsim_types::sq8; +/* + * Optimized asymmetric SQ8 inner product using algebraic identity: + * + * IP(x, y) = Σ(x_i * y_i) + * ≈ Σ((min + delta * q_i) * y_i) + * = min * Σy_i + delta * Σ(q_i * y_i) + * = min * y_sum + delta * quantized_dot_product + * + * where y_sum = Σy_i is precomputed and stored in the query blob. + * This avoids dequantization in the hot loop - we only compute Σ(q_i * y_i). + */ + +// Helper: compute Σ(q_i * y_i) for one SVE vector width (no dequantization) +static inline void InnerProductStepSQ8(const float *pVect1, const uint8_t *pVect2, size_t &offset, + svfloat32_t &sum, const size_t chunk) { svbool_t pg = svptrue_b32(); - // Load float elements from pVect1 + // Load float elements from query svfloat32_t v1 = svld1_f32(pg, pVect1 + offset); - // Convert uint8 to uint32 - svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); // LD1UB: loa + // Load uint8 elements and zero-extend to uint32 + svuint32_t v2_u32 = svld1ub_u32(pg, pVect2 + offset); // Convert uint32 to float32 svfloat32_t v2_f = svcvt_f32_u32_x(pg, v2_u32); - // Dequantize: (val * delta) + min_val - svfloat32_t v2_dequant = svmla_f32_x(pg, min_val_vec, v2_f, delta_vec); - - // Compute dot product and add to sum - sum = svmla_f32_x(pg, sum, v1, v2_dequant); + // Accumulate q_i * y_i (no dequantization!) 
+ sum = svmla_f32_x(pg, sum, v2_f, v1); - // Move to the next set of elements offset += chunk; } @@ -41,19 +50,12 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz const uint8_t *pVect2 = static_cast(pVect2v); size_t offset = 0; - // Get dequantization parameters from the end of quantized vector - float min = *(float *)(pVect2 + dimension); - float delta = *(float *)(pVect2 + dimension + sizeof(float)); - - // Create broadcast vectors for SIMD operations svbool_t pg = svptrue_b32(); - svfloat32_t min_val_vec = svdup_f32(min); - svfloat32_t delta_vec = svdup_f32(delta); // Get the number of 32-bit elements per vector at runtime uint64_t chunk = svcntw(); - // Multiple accumulators to increase instruction-level parallelism + // Multiple accumulators for ILP svfloat32_t sum0 = svdup_f32(0.0f); svfloat32_t sum1 = svdup_f32(0.0f); svfloat32_t sum2 = svdup_f32(0.0f); @@ -67,24 +69,18 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz svbool_t pg_partial = svwhilelt_b32(static_cast(0), static_cast(remaining)); - // Load float elements from pVect1 with predicate + // Load float elements from query with predicate svfloat32_t v1 = svld1_f32(pg_partial, pVect1); - // load 8-bit bytes from pVect2+offset and zero-extend each into a 32-bit lane - svuint32_t v2_u32 = svld1ub_u32( - pg_partial, pVect2 + offset); // LD1UB: load 8-bit, zero-extend to 32-bit - // :contentReference[oaicite:0]{index=0} + // Load uint8 elements and zero-extend to uint32 + svuint32_t v2_u32 = svld1ub_u32(pg_partial, pVect2 + offset); // Convert uint32 to float32 svfloat32_t v2_f = svcvt_f32_u32_z(pg_partial, v2_u32); - // Dequantize: (val * delta) + min_val - svfloat32_t v2_dequant = svmla_f32_z(pg_partial, min_val_vec, v2_f, delta_vec); - - // Compute dot product and add to sum - sum0 = svmla_f32_z(pg_partial, sum0, v1, v2_dequant); + // Compute q_i * y_i (no dequantization) + sum0 = svmla_f32_z(pg_partial, sum0, v2_f, v1); - // Move pointers past the partial chunk offset += remaining; } } @@ -95,21 +91,21 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz (dimension - (partial_chunk ? 
dimension % chunk : 0)) / chunk_size; for (size_t i = 0; i < number_of_chunks; i++) { - InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk); - InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk); - InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk); - InnerProductStep(pVect1, pVect2, offset, sum3, min_val_vec, delta_vec, chunk); + InnerProductStepSQ8(pVect1, pVect2, offset, sum0, chunk); + InnerProductStepSQ8(pVect1, pVect2, offset, sum1, chunk); + InnerProductStepSQ8(pVect1, pVect2, offset, sum2, chunk); + InnerProductStepSQ8(pVect1, pVect2, offset, sum3, chunk); } // Handle remaining steps (0-3) if constexpr (additional_steps > 0) { - InnerProductStep(pVect1, pVect2, offset, sum0, min_val_vec, delta_vec, chunk); + InnerProductStepSQ8(pVect1, pVect2, offset, sum0, chunk); } if constexpr (additional_steps > 1) { - InnerProductStep(pVect1, pVect2, offset, sum1, min_val_vec, delta_vec, chunk); + InnerProductStepSQ8(pVect1, pVect2, offset, sum1, chunk); } if constexpr (additional_steps > 2) { - InnerProductStep(pVect1, pVect2, offset, sum2, min_val_vec, delta_vec, chunk); + InnerProductStepSQ8(pVect1, pVect2, offset, sum2, chunk); } // Combine the accumulators @@ -117,10 +113,19 @@ float SQ8_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, siz sum = svadd_f32_z(pg, sum, sum2); sum = svadd_f32_z(pg, sum, sum3); - // Horizontal sum of all elements in the vector - float result = svaddv_f32(pg, sum); + // Horizontal sum to get Σ(q_i * y_i) + float quantized_dot = svaddv_f32(pg, sum); + + // Get quantization parameters from stored vector (after quantized data) + const float *params2 = reinterpret_cast(pVect2 + dimension); + const float min_val = params2[sq8::MIN_VAL]; + const float delta = params2[sq8::DELTA]; - return result; + // Get precomputed y_sum from query blob (stored after the dim floats) + const float y_sum = pVect1[dimension + sq8::SUM_QUERY]; + + // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i) + return min_val * y_sum + delta * quantized_dot; } template @@ -131,10 +136,6 @@ float SQ8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t template float SQ8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { - // Compute inner product with dequantization using the common function - const float res = - SQ8_InnerProductSIMD_SVE_IMP(pVect1v, pVect2v, dimension); - - // For cosine, we need to account for the vector norms - return 1.0f - res; + // Cosine distance = 1 - IP (vectors are pre-normalized) + return SQ8_InnerProductSIMD_SVE(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index 798773a9b..204edd700 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -14,7 +14,7 @@ #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_UINT8.h" #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h" -#include "VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h" +#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8.h" #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h" #include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h" diff --git a/src/VecSim/types/sq8.h b/src/VecSim/types/sq8.h index 7d67928d5..dfe296321 100644 --- a/src/VecSim/types/sq8.h +++ b/src/VecSim/types/sq8.h @@ -26,7 +26,6 @@ struct sq8 { SUM_SQUARES = 3 // Only for L2 }; - // TODO: re-order metadata and 
merge with the above enum enum QueryMetadataIndex : size_t { SUM_QUERY = 0, SUM_SQUARES_QUERY = 1 // Only for L2 diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index dcb2171d6..05f4b2fa7 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -310,121 +310,29 @@ TEST_F(SpacesTest, uint8_Cosine_no_optimization_func_test) { ASSERT_NEAR(dist, 0.0, 0.000001); } -void common_ip_sq8(bool should_normalize, float expected_dist) { - - size_t dim = 5; - - // Create original vectors - float v1_orig[dim], v2_orig[dim]; - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = float(i + 1.5); - v2_orig[i] = float(i + 1.5); - } - - // Create SQ8 compressed version of v2 - // Size: dim (uint8_t) + min_val (float) + delta (float) + sum (float) + sum_squares (float) - size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); - if (should_normalize) { - spaces::GetNormalizeFunc()(v1_orig, dim); - spaces::GetNormalizeFunc()(v2_orig, dim); - } - - // Find min and max for quantization - float min_val = v2_orig[0]; - float max_val = v2_orig[0]; - float sum = v2_orig[0]; - for (size_t i = 1; i < dim; i++) { - min_val = std::min(min_val, v2_orig[i]); - max_val = std::max(max_val, v2_orig[i]); - sum += v2_orig[i]; - } - - // Calculate delta - float delta = (max_val - min_val) / 255.0f; - if (delta == 0) - delta = 1.0f; // Avoid division by zero - - std::vector v2_compressed(compressed_size); - - // Quantize v2 - uint8_t *quant_values = reinterpret_cast(v2_compressed.data()); - float *params = reinterpret_cast(quant_values + dim); - - // Store parameters - params[0] = min_val; - params[1] = delta; - params[2] = sum; - - // Quantize each value - for (size_t i = 0; i < dim; i++) { - float quantized = (v2_orig[i] - min_val) / delta; - quantized = std::max(0.0f, std::min(255.0f, quantized)); - quant_values[i] = static_cast(std::round(quantized)); - } - - float dist = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_compressed.data(), dim); - - // Since we're comparing identical vectors, the inner product distance should be close to - // expected - ASSERT_NEAR(dist, expected_dist, 0.01) << "SQ8_InnerProduct failed to match expected distance"; -} - /* ======================== Tests SQ8 ========================= */ -TEST_F(SpacesTest, SQ8_ip_no_optimization_func_test) { - float expected_dist = -70.2147f; // Expected distance for identical vectors - common_ip_sq8(false, expected_dist); -} -TEST_F(SpacesTest, SQ8_ip_no_optimization_norm_func_test) { common_ip_sq8(true, 0.0f); } - -TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { - // create a vector with extra space for the norm +TEST_F(SpacesTest, SQ8_ip_no_optimization_norm_func_test) { size_t dim = 5; - // Create original vectors - float v1_orig[dim], v2_orig[dim]; - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = float(i + 1.5); - v2_orig[i] = float(i + 1.5); - } + // Create V1 fp32 query with precomputed sum + std::vector v1_orig(dim + 1); + test_utils::populate_fp32_sq8_ip_query(v1_orig.data(), dim, 1234); - // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) - size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); - spaces::GetNormalizeFunc()(v1_orig, dim); - spaces::GetNormalizeFunc()(v2_orig, dim); - // Find min and max for quantization - float min_val = v2_orig[0]; - float max_val = v2_orig[0]; - float sum = v2_orig[0]; - for (size_t i = 1; i < dim; i++) { - min_val = std::min(min_val, v2_orig[i]); - max_val = std::max(max_val, v2_orig[i]); - sum += v2_orig[i]; - 
} - // Calculate delta and inverse norm - float delta = (max_val - min_val) / 255.0f; - if (delta == 0) - delta = 1.0f; // Avoid division by zero + // Create V2 as SQ8 quantized vector with different seed + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v2_compressed(quantized_size); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_compressed.data(), dim, true, 5678); - // Compress v2 - std::vector v2_compressed(compressed_size); - uint8_t *quant_values = reinterpret_cast(v2_compressed.data()); - float *params = reinterpret_cast(quant_values + dim); + float baseline = + test_utils::SQ8_NotOptimized_InnerProduct(v1_orig.data(), v2_compressed.data(), dim); - // Quantize each value - for (size_t i = 0; i < dim; i++) { - float normalized = (v2_orig[i] - min_val) / delta; - normalized = std::max(0.0f, std::min(255.0f, normalized)); - quant_values[i] = static_cast(std::round(normalized)); - } - // Store parameters - params[0] = min_val; - params[1] = delta; - params[2] = sum; + float dist = + SQ8_InnerProduct((const void *)v1_orig.data(), (const void *)v2_compressed.data(), dim); - float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), dim); - ASSERT_NEAR(dist, 0.0f, 0.001f) << "SQ8_Cosine failed to match expected distance"; + ASSERT_NEAR(dist, baseline, 0.01) << "SQ8_InnerProduct failed to match expected distance"; } + TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { // create a vector with extra space for the norm size_t dim = 5; @@ -2273,18 +2181,11 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8InnerProductTest) { size_t dim = GetParam(); // Create original vectors - std::vector v1_orig(dim); - std::vector v2_orig(dim); - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = float(i + 1.5); - v2_orig[i] = float(i * 0.75 + 1.0); - } - spaces::GetNormalizeFunc()(v1_orig.data(), dim); - - // Create SQ8 compressed version of v2 - std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); - // print min and delta - float *params = reinterpret_cast(v2_compressed.data() + dim); + std::vector v1_orig(dim + 1); + test_utils::populate_fp32_sq8_ip_query(v1_orig.data(), dim, 1234); + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v2_compressed(quantized_size); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_compressed.data(), dim, true, 456); auto expected_alignment = [](size_t reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; @@ -2391,20 +2292,13 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { auto optimization = getCpuOptimizationFeatures(); size_t dim = GetParam(); - // Create original vectors - std::vector v1_orig(dim); - std::vector v2_orig(dim); - for (size_t i = 0; i < dim; i++) { - v1_orig[i] = float(i + 1.5); - v2_orig[i] = float(i * 0.75 + 1.0); - } - - // Normalize v1 - spaces::GetNormalizeFunc()(v1_orig.data(), dim); - spaces::GetNormalizeFunc()(v2_orig.data(), dim); + // Create original vectors - v1 needs extra space for precomputed sum + std::vector v1_orig(dim + 1); + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v2_quantized(quantized_size); - // Create SQ8 compressed version of v2 (with normalization) - std::vector v2_compressed = CreateSQ8CompressedVector(v2_orig.data(), dim); + test_utils::populate_fp32_sq8_ip_query(v1_orig.data(), dim, 1234); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, false, 456); auto expected_alignment = [](size_t 
reg_bit_size, size_t dim) { size_t elements_in_reg = reg_bit_size / sizeof(uint8_t) / 8; @@ -2412,7 +2306,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { }; dist_func_t arch_opt_func; - float baseline = SQ8_Cosine(v1_orig.data(), v2_compressed.data(), dim); + float baseline = SQ8_Cosine(v1_orig.data(), v2_quantized.data(), dim); #ifdef OPT_SVE2 if (optimization.sve2) { @@ -2420,7 +2314,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "SVE2 with dim " << dim; optimization.sve2 = 0; } @@ -2431,7 +2325,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SVE(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "SVE with dim " << dim; optimization.sve = 0; } @@ -2442,7 +2336,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_NEON(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "NEON with dim " << dim; optimization.asimd = 0; } @@ -2455,7 +2349,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "AVX512 with dim " << dim; optimization.avx512f = 0; } @@ -2466,7 +2360,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX2_FMA(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "AVX with dim " << dim; optimization.fma3 = 0; } @@ -2477,7 +2371,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_AVX2(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "AVX with dim " << dim; optimization.avx2 = 0; } @@ -2489,7 +2383,7 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); 
ASSERT_EQ(arch_opt_func, Choose_SQ8_Cosine_implementation_SSE4(dim)) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "SSE with dim " << dim; optimization.sse4_1 = 0; } @@ -2499,11 +2393,374 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_Cosine) << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_compressed.data(), dim), 0.01) + ASSERT_NEAR(baseline, arch_opt_func(v1_orig.data(), v2_quantized.data(), dim), 0.01) << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } +// Test self-distance: distance to itself should be 0 for cosine (normalized vectors) +TEST(SQ8_EdgeCases, SelfDistanceCosine) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = 128; + std::vector v_orig(dim + 1); + test_utils::populate_fp32_sq8_ip_query(v_orig.data(), dim, 1234); + + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_metadata(v_quantized.data(), dim, true, 1234); + + float baseline = SQ8_Cosine(v_orig.data(), v_quantized.data(), dim); + + // Self-distance for cosine should be close to 0 + ASSERT_NEAR(baseline, 0.0f, 0.001f) << "Self-distance should be ~0 for cosine"; + +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON_DOTPROD + if (optimization.asimddp) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + optimization.asimddp = 0; + } +#endif +#ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + optimization.asimd = 0; + } +#endif +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization); + float result = arch_opt_func(v_orig.data(), v_quantized.data(), dim); + ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline"; + optimization.avx512f = 0; + } +#endif + + unsigned char alignment = 0; + auto arch_opt_func = 
+        Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+    auto result = arch_opt_func(v_orig.data(), v_quantized.data(), dim);
+    ASSERT_NEAR(baseline, result, 0.00001) << "No optimization self-distance should match baseline";
+    ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+}
+
+// Test symmetry: dist(v1, v2) == dist(v2, v1)
+TEST(SQ8_EdgeCases, CosineSymmetryTest) {
+    size_t dim = 128;
+    auto optimization = getCpuOptimizationFeatures();
+    std::vector<float> v1_fp32(dim + 1);
+    test_utils::populate_fp32_sq8_ip_query(v1_fp32.data(), dim, 1234);
+
+    size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float);
+    std::vector<uint8_t> v1_quantized(quantized_size);
+    test_utils::populate_float_vec_to_sq8_with_metadata(v1_quantized.data(), dim, true, 1234);
+
+    std::vector<float> v2_fp32(dim + 1);
+    test_utils::populate_fp32_sq8_ip_query(v2_fp32.data(), dim, 456);
+    std::vector<uint8_t> v2_quantized(quantized_size);
+    test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true, 456);
+    float baseline_1 = SQ8_Cosine(v1_fp32.data(), v2_quantized.data(), dim);
+    float baseline_2 = SQ8_Cosine(v2_fp32.data(), v1_quantized.data(), dim);
+    ASSERT_NEAR(baseline_1, baseline_2, 0.001f) << "Cosine should be symmetric";
+
+    unsigned char alignment = 0;
+
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float cos_12 = arch_opt_func(v1_fp32.data(), v2_quantized.data(), dim);
+        float cos_21 = arch_opt_func(v2_fp32.data(), v1_quantized.data(), dim);
+        ASSERT_NEAR(cos_12, cos_21, 0.001f) << "Optimized cosine should be symmetric";
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float cos_12 = arch_opt_func(v1_fp32.data(), v2_quantized.data(), dim);
+        float cos_21 = arch_opt_func(v2_fp32.data(), v1_quantized.data(), dim);
+        ASSERT_NEAR(cos_12, cos_21, 0.001f) << "Optimized cosine should be symmetric";
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_DOTPROD
+    if (optimization.asimddp) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float cos_12 = arch_opt_func(v1_fp32.data(), v2_quantized.data(), dim);
+        float cos_21 = arch_opt_func(v2_fp32.data(), v1_quantized.data(), dim);
+        ASSERT_NEAR(cos_12, cos_21, 0.001f) << "Optimized cosine should be symmetric";
+        optimization.asimddp = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float cos_12 = arch_opt_func(v1_fp32.data(), v2_quantized.data(), dim);
+        float cos_21 = arch_opt_func(v2_fp32.data(), v1_quantized.data(), dim);
+        ASSERT_NEAR(cos_12, cos_21, 0.001f) << "Optimized cosine should be symmetric";
+        optimization.asimd = 0;
+    }
+#endif
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float cos_12 = arch_opt_func(v1_fp32.data(), v2_quantized.data(), dim);
+        float cos_21 = arch_opt_func(v2_fp32.data(), v1_quantized.data(), dim);
+        ASSERT_NEAR(cos_12, cos_21, 0.001f) << "Optimized cosine should be symmetric";
+        optimization.avx512f = 0;
+    }
+#endif
+    auto cosine_func = Cosine_SQ8_GetDistFunc(dim, &alignment, nullptr);
+    float cos_12 = cosine_func(v1_fp32.data(), v2_quantized.data(), dim);
+    float cos_21 = cosine_func(v2_fp32.data(), v1_quantized.data(), dim);
+    ASSERT_NEAR(cos_12, cos_21, 0.001f) << "Cosine should be symmetric";
+}
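The zero-vector test that follows feeds an all-zero query to the cosine kernels. In that case the precomputed query sum and the quantized dot product are both zero, so the rearranged inner product min * y_sum + delta * Σ(q_i * y_i) is exactly zero and the returned distance is exactly 1.0f regardless of the stored vector, which is presumably why the scalar path below can be checked with ASSERT_EQ rather than a tolerance. A minimal standalone sketch of that argument (illustrative values only, not code from the test suite):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
    const size_t dim = 8;
    // Query blob layout used by the tests: [dim float values][precomputed sum].
    std::vector<float> zero_query(dim + 1, 0.0f); // all zeros, so zero_query[dim] (y_sum) is 0 too
    std::vector<uint8_t> stored(dim, 42);         // arbitrary quantized values
    const float min_val = -1.0f;                  // arbitrary quantization parameters
    const float delta = 2.0f / 255.0f;

    float quantized_dot = 0.0f;
    for (size_t i = 0; i < dim; i++) {
        quantized_dot += static_cast<float>(stored[i]) * zero_query[i]; // every term is 0
    }

    // Rearranged inner product: min * y_sum + delta * Σ(q_i * y_i) is exactly 0 here,
    // so the distance 1 - IP is exactly 1.0f, whatever the stored vector holds.
    const float ip = min_val * zero_query[dim] + delta * quantized_dot;
    assert(1.0f - ip == 1.0f);
    return 0;
}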
+
+// Test with zero vector
+TEST(SQ8_EdgeCases, CosineZeroVectorTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = 128;
+    std::vector<float> v_zero(dim + 1, 0.0f);
+
+    size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float);
+    std::vector<uint8_t> v_nonzero_quantized(quantized_size);
+    test_utils::populate_float_vec_to_sq8_with_metadata(v_nonzero_quantized.data(), dim, true);
+
+    float baseline = SQ8_Cosine(v_zero.data(), v_nonzero_quantized.data(), dim);
+
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_zero.data(), v_nonzero_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline";
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_zero.data(), v_nonzero_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline";
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_DOTPROD
+    if (optimization.asimddp) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_zero.data(), v_nonzero_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline";
+        optimization.asimddp = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_zero.data(), v_nonzero_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline";
+        optimization.asimd = 0;
+    }
+#endif
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_zero.data(), v_nonzero_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector IP should match baseline";
+        optimization.avx512f = 0;
+    }
+#endif
+    unsigned char alignment = 0;
+    auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, nullptr);
+    float result = arch_opt_func(v_zero.data(), v_nonzero_quantized.data(), dim);
+
+    ASSERT_EQ(result, baseline) << "Zero vector Cosine should match baseline";
+}
+
+// Test with constant quantized vector (all same values - edge case where delta = 0)
+TEST(SQ8_EdgeCases, CosineConstantVectorTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = 128;
+
+    // Create a random query vector (preprocessed for FP32->SQ8 cosine)
+    std::vector<float> v_query(dim + 1);
+    test_utils::populate_float_vec(v_query.data(), dim);
+    test_utils::preprocess_fp32_sq8_ip_query(v_query.data(), dim);
+
+    // Create a constant quantized vector (all same values)
+    // This tests the edge case where delta = 0 (or set to 1.0 to avoid division by zero)
+    size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float);
+    std::vector<uint8_t> v_const_quantized(quantized_size);
+    std::vector<float> v_const(dim, 0.5f);
+    spaces::GetNormalizeFunc<float>()(v_const.data(), dim);
+    test_utils::quantize_float_vec_to_sq8_with_metadata(v_const.data(), dim,
+                                                        v_const_quantized.data());
+
+    float baseline = SQ8_Cosine(v_query.data(), v_const_quantized.data(), dim);
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_query.data(), v_const_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized constant vector Cosine should match baseline";
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_query.data(), v_const_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized constant vector Cosine should match baseline";
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_DOTPROD
+    if (optimization.asimddp) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_query.data(), v_const_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized constant vector Cosine should match baseline";
+        optimization.asimddp = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_query.data(), v_const_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized constant vector Cosine should match baseline";
+        optimization.asimd = 0;
+    }
+#endif
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_query.data(), v_const_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized constant vector Cosine should match baseline";
+        optimization.avx512f = 0;
+    }
+#endif
+    unsigned char alignment = 0;
+    auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, nullptr);
+    float result = arch_opt_func(v_query.data(), v_const_quantized.data(), dim);
+
+    ASSERT_NEAR(result, baseline, 0.01f)
+        << "Constant quantized vector Cosine should match baseline";
+}
+
+// Test with extreme values (-1 and 1 only)
+TEST(SQ8_EdgeCases, CosineExtremeValuesTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = 128;
+    std::vector<float> v1(dim + 1), v2(dim);
+
+    // Alternating extreme values
+    for (size_t i = 0; i < dim; i++) {
+        v1[i] = (i % 2 == 0) ? 1.0f : -1.0f;
+        v2[i] = (i % 3 == 0) ? 1.0f : -1.0f;
+    }
+    test_utils::preprocess_fp32_sq8_ip_query(v1.data(), dim);
+    size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float);
+    std::vector<uint8_t> v2_quantized(quantized_size);
+    test_utils::quantize_float_vec_to_sq8_with_metadata(v2.data(), dim, v2_quantized.data());
+
+    float baseline = SQ8_Cosine(v1.data(), v2_quantized.data(), dim);
+
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v1.data(), v2_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized extreme values Cosine should match baseline";
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v1.data(), v2_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized extreme values Cosine should match baseline";
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_DOTPROD
+    if (optimization.asimddp) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v1.data(), v2_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized extreme values Cosine should match baseline";
+        optimization.asimddp = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v1.data(), v2_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized extreme values Cosine should match baseline";
+        optimization.asimd = 0;
+    }
+#endif
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v1.data(), v2_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized extreme values Cosine should match baseline";
+        optimization.avx512f = 0;
+    }
+#endif
+    unsigned char alignment = 0;
+    auto arch_opt_func = Cosine_SQ8_GetDistFunc(dim, &alignment, nullptr);
+    float result = arch_opt_func(v1.data(), v2_quantized.data(), dim);
+
+    ASSERT_NEAR(result, baseline, 0.01f) << "Extreme values Cosine should match baseline";
+}
+
 /* ======================== Tests SQ8_SQ8 ========================= */
 
 TEST_F(SpacesTest, SQ8_SQ8_ip_no_optimization_func_test) {
@@ -2594,6 +2851,7 @@ TEST_F(SpacesTest, SQ8_SQ8_L2_no_optimization_func_test) {
         << "Unexpected distance function chosen for dim " << dim;
     ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.001f)
         << "SQ8_SQ8_L2Sqr failed to match expected distance";
+    ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
 }
 
 class SQ8_SQ8_SpacesOptimizationTest : public testing::TestWithParam<size_t> {};
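The tests_utils.h changes below add the scalar reference used by the tests above: a naive dequantize-and-multiply inner product, plus helpers that lay out the query blob as [dim float values][sum] and the stored blob as [dim uint8 values][min][delta][sum][sum_squares]. As a quick orientation, here is a small standalone sketch (illustrative, hand-picked values; not part of the patch) that builds both blobs and checks that the naive formula Σ((min + delta * q_i) * y_i) and the rearranged form min * Σy_i + delta * Σ(q_i * y_i) agree up to float rounding:

#include <cassert>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

int main() {
    const size_t dim = 4;

    // Query blob: [dim float values][sum], as produced by preprocess_fp32_sq8_ip_query.
    std::vector<float> query = {0.5f, -0.5f, 0.25f, 0.25f};
    float y_sum = 0.0f;
    for (float y : query) {
        y_sum += y;
    }
    query.push_back(y_sum); // query[dim] holds the precomputed sum

    // Stored blob: [dim uint8 values][min][delta][sum][sum_squares].
    const uint8_t q[4] = {0, 127, 255, 64};
    const float params[4] = {/*min*/ -1.0f, /*delta*/ 2.0f / 255.0f, /*sum*/ 0.0f, /*sum_squares*/ 0.0f};
    std::vector<uint8_t> blob(dim * sizeof(uint8_t) + 4 * sizeof(float));
    std::memcpy(blob.data(), q, dim);
    std::memcpy(blob.data() + dim, params, sizeof(params));

    // Read the parameters back from the end of the blob, the way the distance helpers do.
    const float min_val = *reinterpret_cast<const float *>(blob.data() + dim);
    const float delta = *reinterpret_cast<const float *>(blob.data() + dim + sizeof(float));

    // Naive dequantization, as in SQ8_NotOptimized_InnerProduct below.
    float naive = 0.0f;
    for (size_t i = 0; i < dim; i++) {
        naive += (min_val + delta * blob[i]) * query[i];
    }

    // Rearranged form: min * y_sum + delta * Σ(q_i * y_i).
    float quantized_dot = 0.0f;
    for (size_t i = 0; i < dim; i++) {
        quantized_dot += static_cast<float>(blob[i]) * query[i];
    }
    const float rearranged = min_val * query[dim] + delta * quantized_dot;

    assert(std::fabs(naive - rearranged) < 1e-6f);
    return 0;
}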
diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h
index 4f1bf33e8..03d2ed0d7 100644
--- a/tests/utils/tests_utils.h
+++ b/tests/utils/tests_utils.h
@@ -70,6 +70,36 @@ static void populate_float16_vec(vecsim_types::float16 *v, const size_t dim, int
     }
 }
 
+/*
+ * SQ8 distance function without the algebraic optimizations
+ * uses the regular dequantization formula:
+ * IP = Σ((min + delta * q_i) * v_i)
+ */
+static float SQ8_NotOptimized_InnerProduct(const void *pVect1v, const void *pVect2v,
+                                           size_t dimension) {
+
+    const auto *pVect1 = static_cast<const float *>(pVect1v);
+    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
+
+    // Get quantization parameters from pVect2
+    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+    // Compute inner product with dequantization
+    float res = 0.0f;
+    for (size_t i = 0; i < dimension; i++) {
+        res += (pVect2[i] * delta + min_val) * pVect1[i];
+    }
+    return 1.0f - res;
+}
+
+/*
+ * SQ8 Cosine distance function without the algebraic optimizations
+ * For normalized vectors, cosine distance equals inner product distance.
+ */
+static float SQ8_NotOptimized_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return SQ8_NotOptimized_InnerProduct(pVect1v, pVect2v, dimension);
+}
+
 /*
  * SQ8_SQ8 distance function without the algebraic optimizations
  * uses the regular dequantization formula:
@@ -173,6 +203,24 @@ static void quantize_float_vec_to_sq8_with_metadata(const float *v, size_t dim,
     params[sq8::SUM_SQUARES] = square_sum;
 }
 
+// Preprocess fp32 query for SQ8 space.
+// Query layout: [float values (dim)] [sum (float)]
+// Assuming v is a memory allocation of size (dim + 1) * sizeof(float)
+static void preprocess_fp32_sq8_ip_query(float *v, size_t dim) {
+    spaces::GetNormalizeFunc<float>()(v, dim);
+    float sum = 0.0f;
+    for (size_t i = 0; i < dim; i++) {
+        sum += v[i];
+    }
+    v[dim] = sum;
+}
+
+// Assuming v is a memory allocation of size (dim + 1) * sizeof(float)
+static void populate_fp32_sq8_ip_query(float *v, size_t dim, int seed = 1234, float min = -1.0f,
+                                       float max = 1.0f) {
+    populate_float_vec(v, dim, seed, min, max);
+    preprocess_fp32_sq8_ip_query(v, dim);
+}
 /**
  * Populate a float vector and quantize to SQ8 with precomputed sum and sum_squares.
  * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [sum_squares