
Commit bdcbf80

Adapt fp32/SQ8 distance functions: IP and cosine [MOD-13392] (#882)
* Add SQ8-to-SQ8 distance functions and optimizations
  - Implemented inner product and cosine distance functions for SQ8-to-SQ8 vectors in SVE, NEON, and AVX512 architectures.
  - Added corresponding distance function selection logic in IP_space.cpp and function headers in IP_space.h.
  - Created benchmarks for SQ8-to-SQ8 distance functions to evaluate performance across different architectures.
  - Developed unit tests to validate the correctness of the new distance functions against expected results.
  - Ensured compatibility with existing optimization features for various CPU architectures.
* Add SQ8-to-SQ8 benchmark tests and update related scripts
* Format
* Organizing
* Add full SQ8 benchmarks
* Optimize the SQ8-to-SQ8 functions
* Optimize SQ8 distance functions for NEON by reducing operations and improving performance
* Format
* Add NEON DOTPROD-optimized distance functions for SQ8-to-SQ8 calculations
* PR
* Remove NEON DOTPROD-optimized distance functions for INT8, UINT8, and SQ8-to-SQ8 calculations
* Fix vector layout documentation by removing inv_norm from comments in NEON and AVX512 headers
* Remove 'constexpr' from ones vector declaration in NEON inner product function
* Add SQ8-to-SQ8 L2 squared distance functions with SIMD optimizations
  - Implemented NEON, SVE, and AVX512F optimized functions for calculating L2 squared distance between SQ8 (scalar quantized 8-bit) vectors.
  - Introduced helper functions for processing vector elements using NEON and SVE intrinsics.
  - Updated L2_space.cpp and L2_space.h to include the new distance function for SQ8-to-SQ8.
  - Enhanced AVX512F, NEON, and SVE function selectors to choose the appropriate implementation based on CPU features.
  - Added unit tests to validate the correctness of the new L2 squared distance functions.
  - Updated benchmark tests to include performance measurements for the new implementations.
* Change the name
* Add full range tests for SQ8 distance functions with SIMD optimizations
* Refactor distance functions to remove inv_norm parameter and update documentation accordingly
* Update SQ8 Cosine test to normalize both input vectors and adjust distance assertion tolerance
* Rename 'compressed' to 'quantized' in SQ8 functions for clarity and consistency
* Rename 'compressed' to 'quantized' in SQ8 distance tests for clarity
* Refactor quantization function to remove unused normalization calculations
* Add TODO to store vector's norm and sum in L2 squared distance calculation
* Implement SQ8-to-SQ8 distance functions with precomputed sum and norm using AVX512 VNNI; add benchmarks and tests for new functionality
* Add edge case tests for SQ8-to-SQ8 precomputed cosine distance functions
* Refactor SQ8 test cases to use CreateSQ8QuantizedVector for vector population
* Implement SQ8-to-SQ8 precomputed distance functions using ARM NEON, SVE, and AVX512; add corresponding selection functions and update tests for consistency.
* Implement SQ8-to-SQ8 precomputed inner product and cosine functions; update benchmarks and tests for new functionality
* Refactor SQ8 distance functions and remove precomputed variants
  - Updated distance function declarations in IP_space.h to clarify that SQ8-to-SQ8 functions use precomputed sum/norm.
  - Removed precomputed distance function implementations for AVX512F, NEON, and SVE architectures from their respective source files.
  - Adjusted benchmark tests to remove references to precomputed distance functions and ensure they utilize the updated quantization methods.
  - Modified utility functions to support the creation of SQ8 quantized vectors with precomputed sum and norm.
  - Updated unit tests to reflect changes in the quantization process and removed tests specifically for precomputed distance functions.
* Refactor SQ8 distance functions and tests for improved clarity and consistency
  - Updated include paths in AVX512F_BW_VL_VNNI.cpp to reflect new naming conventions.
  - Modified unit tests in test_spaces.cpp to streamline vector initialization and quantization processes.
  - Replaced repetitive code with utility functions for populating and quantizing vectors.
  - Enhanced assertions in tests to ensure optimized distance functions are correctly chosen and validated.
  - Removed unnecessary parameters from utility functions to simplify their interfaces.
  - Improved test coverage for edge cases, including zero and constant vectors, ensuring accuracy across various scenarios.
* Refactor SQ8 benchmarks by removing precomputed variants and updating vector population methods
* Format
* Remove serialization benchmark script for HNSW disk serialization
* Refactor SQ8 distance functions and tests to remove precomputed norm references
* Format
* Refactor SQ8 distance tests to use compressed vectors and improve normalization calculations
* Update vector layout documentation to reflect removal of sum of squares in SQ8 implementations
* Refactor L2 SQ8 distance computation to remove unused accumulators and streamline calculations
* Refactor SQ8 distance functions to remove norm computation
  - Updated comments and documentation to reflect that the SQ8-to-SQ8 distance functions now only utilize precomputed sums, removing references to norms.
  - Modified function signatures and implementations across various SIMD architectures (AVX512F, NEON, SVE) to align with the new approach.
  - Adjusted utility functions for populating SQ8 vectors to include metadata for sums and normalization.
  - Updated unit tests and benchmarks to ensure compatibility with the new SQ8 vector population methods and to validate the correctness of distance calculations.
* Update SQ8-to-SQ8 distance function comment to remove norm reference
* Refactor cosine similarity functions to remove unnecessary subtraction in AVX2, SSE4, and SVE implementations
* Refactor L2 SQ8 distance functions to eliminate unused accumulators and streamline calculations
* Refactor SQ8 L2 and IP implementations to use a common inner product function
  - Introduced SQ8_SQ8_InnerProduct_Impl for shared inner product calculations in SQ8 space.
  - Updated SQ8_SQ8_L2Sqr to utilize the new inner product implementation, improving performance and reducing code duplication.
  - Modified AVX512 and NEON SIMD implementations to leverage the common inner product function for L2 squared distance calculations.
  - Removed redundant code and tests related to full range vector comparisons, streamlining the test suite.
  - Ensured that vector layouts include sum of squares for optimized distance calculations.
* Refactor cosine similarity functions to use specific SIMD implementations for improved clarity and performance
* Refactor L2 distance functions for SQ8 vectors to utilize the common inner product implementation and update metadata extraction in tests
* Refactor benchmark setup to allocate additional space for sum and sum_squares in SQ8 vector tests
* Add CPU feature checks to disable optimizations for AArch64 in SQ8 distance function
* Add CPU feature checks to disable optimizations for AArch64 in SQ8 distance function tests
* Fix formatting issues in SQ8 inner product function and clean up conditional compilation in tests
* Refactor SQ8 distance functions and tests for improved readability and consistency
* Refactor SQ8 L2Sqr tests to use quantized vectors and improve alignment checks
* Enhance SQ8 inner product implementations with optimized dot product calculations
  - Refactored inner product calculations for SQ8 vectors using NEON and SVE optimizations.
  - Integrated UINT8_InnerProductImp for efficient dot product computation in NEON and SVE implementations.
  - Updated inner product functions to handle 64-element chunks for improved performance.
  - Adjusted distance function selection logic to ensure optimizations are applied only for dimensions >= 16.
  - Added tests for zero vectors and constant vectors to validate optimized implementations against baseline results.
  - Ensured consistency in assertions for symmetry tests across various optimization flags.
  - Improved code readability and maintainability by removing redundant code and comments.
* Fix header guard duplication and update test assertion for floating-point comparison
* Add missing pragma once directive in NEON header files
* Refactor SQ8 distance functions for improved performance and clarity
  - Updated inner product functions for NEON, SSE4, and SVE to streamline dequantization and reduce unnecessary calculations.
  - Consolidated common logic for inner product and cosine calculations across different SIMD implementations.
  - Enhanced the handling of vector normalization and quantization in unit tests, ensuring consistency in compressed vector sizes.
  - Adjusted benchmark tests to reflect changes in vector compression and distance function calls.
  - Corrected include paths for AVX512 implementations to maintain consistency across the codebase.
* Update SQ8 vector population functions to include metadata and adjust compressed size calculations
* Refactor SQ8 inner product functions for improved clarity and performance
* Refactor L2 distance functions to utilize common inner product implementations for improved clarity and performance
* Rename inner product implementation functions for AVX2 and AVX512 for clarity
* Refactor SQ8 cosine function to utilize the inner product function for improved clarity
* Remove redundant inner product edge case tests for SQ8 distance functions
* Add SVE2 support to SQ8-to-SQ8 inner product distance function
* Fix SQ8_Cosine to call the correct inner product function for improved accuracy
* Remove SVE2 and other optimizations from SQ8 cosine function test for ARM architecture
* Add L2 distance function without optimizations for testing purposes
* Refactor L2 distance function and update test assertions for precision
* Update L2 squared distance functions to support 64 residuals in NEON implementation
* Refactor L2 distance function conditions for NEON optimizations
* Adjust NEON_DOTPROD benchmark initialization to use a dimension of 16
* Update NEON benchmarks to support 64 dimensions for L2 and Cosine metrics
* Optimize SQ8 inner product implementation
  - Refactor the SQ8 inner product computation to eliminate unnecessary dequantization steps, improving performance.
  - Introduce a new helper function `InnerProductStepSQ8` that computes the inner product directly using quantized values.
  - Update the main inner product function `SQ8_InnerProductSIMD_SVE_IMP` to utilize the new helper function, streamlining the computation process.
  - Modify the test suite to validate the new implementation, ensuring correctness against the baseline non-optimized version.
  - Add edge case tests for self-distance, symmetry, zero vectors, constant vectors, and extreme values to ensure robustness of the SQ8 cosine distance function.
  - Introduce utility functions for preprocessing and populating SQ8 queries, enhancing test clarity and maintainability.
* Refactor SQ8 inner product functions to clarify FMA usage and improve performance
* Update SQ8 test cases to improve alignment checks and adjust quantized size calculations
* Add optimized SQ8 inner product implementation and update test cases
* Fix pointer usage in SQ8 inner product implementation to reference original vectors
* Add sq8 type definition and update inner product implementations for quantization parameters
* Refactor SQ8 inner product implementations to use structured quantization parameters and clean up code formatting
* Fix SQ8 EdgeCases test by adjusting vector size for constant vector test
* Fix formatting in SQ8_EdgeCases test by adjusting vector initialization
* Refactor SQ8 inner product implementations to use precomputed y_sum from query blob (see the layout sketch after this commit summary)
* Fix formatting in SQ8_EdgeCases test for better readability
* Refactor SQ8 cosine distance calculation to use optimized function
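
For context on the layout the adapted functions expect, below is a minimal, self-contained sketch of how a stored SQ8 vector and a query blob could be prepared. It is not the library's actual helper code (CreateSQ8QuantizedVector and the test utilities mentioned above may differ in details): the stored blob layout [dim uint8 quantized values][min_val (float)][delta (float)] and the precomputed y_sum appended after the query's dim floats are taken from the diffs below, while the quantization rule delta = (max - min) / 255, q_i = round((x_i - min) / delta) is an assumption implied by the dequantization min + delta * q_i.

// Hedged sketch of SQ8 blob preparation (illustrative only, not the library's helpers).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <vector>

std::vector<uint8_t> quantize_sq8_sketch(const std::vector<float> &x) {
    const float min_val = *std::min_element(x.begin(), x.end());
    const float max_val = *std::max_element(x.begin(), x.end());
    const float delta = (max_val - min_val) / 255.0f;
    const float inv_delta = (delta != 0.0f) ? 1.0f / delta : 0.0f;

    // Layout: [dim bytes of quantized values][min_val (float)][delta (float)]
    std::vector<uint8_t> blob(x.size() + 2 * sizeof(float));
    for (size_t i = 0; i < x.size(); i++)
        blob[i] = static_cast<uint8_t>(std::lround((x[i] - min_val) * inv_delta));
    std::memcpy(blob.data() + x.size(), &min_val, sizeof(float));
    std::memcpy(blob.data() + x.size() + sizeof(float), &delta, sizeof(float));
    return blob;
}

std::vector<float> prepare_sq8_query_sketch(const std::vector<float> &y) {
    // Layout: [dim floats][y_sum] -- the precomputed sum read via sq8::SUM_QUERY in the diffs.
    std::vector<float> blob(y.begin(), y.end());
    float y_sum = 0.0f;
    for (float v : y)
        y_sum += v;
    blob.push_back(y_sum);
    return blob;
}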
1 parent (f6df960) · commit bdcbf80

File tree

12 files changed: +887 −526 lines changed


src/VecSim/spaces/IP/IP.cpp

Lines changed: 45 additions & 27 deletions
@@ -16,39 +16,57 @@ using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
 using sq8 = vecsim_types::sq8;
 
-float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension,
-                                 float min_val, float delta) {
-    float res = 0;
-    for (size_t i = 0; i < dimension; i++) {
-        float dequantized_V2 = (pVect2v[i] * delta + min_val);
-        res += pVect1v[i] * dequantized_V2;
-    }
-    return res;
-}
-
+/*
+ * Optimized asymmetric SQ8 inner product using algebraic identity:
+ * IP(x, y) = Σ(x_i * y_i)
+ *          ≈ Σ((min + delta * q_i) * y_i)
+ *          = min * Σy_i + delta * Σ(q_i * y_i)
+ *          = min * y_sum + delta * quantized_dot_product
+ *
+ * Uses 4x loop unrolling with multiple accumulators for ILP.
+ * pVect1 is a vector of float32, pVect2 is a quantized uint8_t vector
+ */
 float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
+
     const auto *pVect1 = static_cast<const float *>(pVect1v);
     const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
-    // pVect2 is a vector of uint8_t, so we need to de-quantize it, normalize it and then multiply
-    // it. it is structured as [quantized values (int8_t * dim)][min_val (float)][delta
-    // (float)]] The last two values are used to dequantize the vector.
-    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
-    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    // Compute inner product with dequantization
-    const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta);
-    return 1.0f - res;
+
+    // Use 4 accumulators for instruction-level parallelism
+    float sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
+
+    // Main loop: process 4 elements per iteration
+    size_t i = 0;
+    size_t dim4 = dimension & ~size_t(3); // dim4 is a multiple of 4
+    for (; i < dim4; i += 4) {
+        sum0 += pVect1[i + 0] * static_cast<float>(pVect2[i + 0]);
+        sum1 += pVect1[i + 1] * static_cast<float>(pVect2[i + 1]);
+        sum2 += pVect1[i + 2] * static_cast<float>(pVect2[i + 2]);
+        sum3 += pVect1[i + 3] * static_cast<float>(pVect2[i + 3]);
+    }
+
+    // Handle remainder (0-3 elements)
+    for (; i < dimension; i++) {
+        sum0 += pVect1[i] * static_cast<float>(pVect2[i]);
+    }
+
+    // Combine accumulators
+    float quantized_dot = (sum0 + sum1) + (sum2 + sum3);
+
+    // Get quantization parameters from stored vector
+    const float *params = reinterpret_cast<const float *>(pVect2 + dimension);
+    const float min_val = params[sq8::MIN_VAL];
+    const float delta = params[sq8::DELTA];
+
+    // Get precomputed y_sum from query blob (stored after the dim floats)
+    const float y_sum = pVect1[dimension + sq8::SUM_QUERY];
+
+    // Apply formula: IP = min * y_sum + delta * Σ(q_i * y_i)
+    const float ip = min_val * y_sum + delta * quantized_dot;
+    return 1.0f - ip;
 }
 
 float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    const auto *pVect1 = static_cast<const float *>(pVect1v);
-    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
-
-    // Get quantization parameters
-    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
-    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    // Compute inner product with dequantization
-    const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta);
-    return 1.0f - res;
+    return SQ8_InnerProduct(pVect1v, pVect2v, dimension);
 }
 
 // SQ8-to-SQ8: Common inner product implementation that returns the raw inner product value
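
A small self-contained check (illustrative only, not part of the commit) of the algebraic identity the rewritten SQ8_InnerProduct above relies on: Σ((min + delta * q_i) * y_i) equals min * Σy_i + delta * Σ(q_i * y_i), up to float rounding. The sample values are arbitrary.

// Verifies dequantize-then-multiply (old path) against the factored form (new path).
#include <cassert>
#include <cmath>
#include <cstdint>

int main() {
    const uint8_t q[4] = {0, 17, 128, 255};
    const float y[4] = {0.5f, -1.25f, 2.0f, 0.75f};
    const float min_val = -0.8f, delta = 0.01f;

    float direct = 0.0f, y_sum = 0.0f, quantized_dot = 0.0f;
    for (int i = 0; i < 4; i++) {
        direct += (min_val + delta * q[i]) * y[i]; // dequantize each element, then multiply
        y_sum += y[i];                             // Σy_i (what the query blob precomputes)
        quantized_dot += q[i] * y[i];              // Σ(q_i * y_i) (what the new kernels accumulate)
    }
    const float factored = min_val * y_sum + delta * quantized_dot;
    assert(std::fabs(direct - factored) < 1e-5f);
    return 0;
}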

src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h

Lines changed: 52 additions & 48 deletions
@@ -6,91 +6,96 @@
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
  */
+#pragma once
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
+#include "VecSim/types/sq8.h"
+using sq8 = vecsim_types::sq8;
 
+/*
+ * Optimized asymmetric SQ8 inner product using algebraic identity:
+ *
+ * IP(x, y) = Σ(x_i * y_i)
+ *          ≈ Σ((min + delta * q_i) * y_i)
+ *          = min * Σy_i + delta * Σ(q_i * y_i)
+ *          = min * y_sum + delta * quantized_dot_product
+ *
+ * where y_sum = Σy_i is precomputed and stored in the query blob.
+ * This avoids dequantization in the hot loop - we only compute Σ(q_i * y_i).
+ *
+ * This version uses FMA instructions for better performance.
+ */
+
+// Helper: compute Σ(q_i * y_i) for 8 elements using FMA (no dequantization)
 static inline void InnerProductStepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2,
-                                           __m256 &sum256, const __m256 &min_val_vec,
-                                           const __m256 &delta_vec) {
-    // Load 8 float elements from pVect1
+                                           __m256 &sum256) {
+    // Load 8 float elements from query
     __m256 v1 = _mm256_loadu_ps(pVect1);
     pVect1 += 8;
 
-    // Load 8 uint8 elements from pVect2, convert to int32, then to float
-    __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+    // Load 8 uint8 elements and convert to float
+    __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2));
    pVect2 += 8;
 
-    // Zero-extend uint8 to int32
     __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
-
-    // Convert int32 to float
     __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
 
-    // Dequantize and compute dot product in one step using FMA
-    // (val * delta) + min_val -> v2_dequant
-    // sum256 += v1 * v2_dequant
-    // Using FMA: sum256 = v1 * v2_dequant + sum256
-
-    // First, compute v2_dequant = v2_f * delta_vec + min_val_vec
-    __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);
-
-    // Then, compute sum256 += v1 * v2_dequant using FMA
-    sum256 = _mm256_fmadd_ps(v1, v2_dequant, sum256);
+    // Accumulate q_i * y_i using FMA (no dequantization!)
+    sum256 = _mm256_fmadd_ps(v2_f, v1, sum256);
 }
 
 template <unsigned char residual> // 0..15
 float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
     const float *pVect1 = static_cast<const float *>(pVect1v);
-    // pVect2 is a quantized uint8_t vector
     const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
     const float *pEnd1 = pVect1 + dimension;
 
-    // Get dequantization parameters from the end of quantized vector
-    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
-    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    // Create broadcast vectors for SIMD operations
-    __m256 min_val_vec = _mm256_set1_ps(min_val);
-    __m256 delta_vec = _mm256_set1_ps(delta);
-
+    // Initialize sum accumulator for Σ(q_i * y_i)
     __m256 sum256 = _mm256_setzero_ps();
 
-    // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one
-    // 16-float block, so mask loading is guaranteed to be safe.
+    // Handle residual elements first (0-7 elements)
     if constexpr (residual % 8) {
         __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
         __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
         pVect1 += residual % 8;
 
-        // Load quantized values and dequantize
-        __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+        // Load uint8 elements and convert to float
+        __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2));
         pVect2 += residual % 8;
 
-        // Zero-extend uint8 to int32
         __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
-
-        // Convert int32 to float
         __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
 
-        // Dequantize using FMA: (val * delta) + min_val
-        __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);
-
-        // Compute dot product with masking
-        sum256 = _mm256_mul_ps(v1, v2_dequant);
+        // Compute q_i * y_i (no dequantization)
+        sum256 = _mm256_mul_ps(v1, v2_f);
     }
 
-    // If the reminder is >=8, have another step of 8 floats
+    // If the residual is >=8, have another step of 8 floats
     if constexpr (residual >= 8) {
-        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256);
     }
 
-    // We dealt with the residual part. We are left with some multiple of 16 floats.
-    // In each iteration we calculate 16 floats = 512 bits.
+    // Process remaining full chunks of 16 elements (2x8)
+    // Using do-while since dim > 16 guarantees at least one iteration
    do {
-        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
-        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256);
+        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256);
     } while (pVect1 < pEnd1);
 
-    return my_mm256_reduce_add_ps(sum256);
+    // Reduce to get Σ(q_i * y_i)
+    float quantized_dot = my_mm256_reduce_add_ps(sum256);
+
+    // Get quantization parameters from stored vector (after quantized data)
+    const uint8_t *pVect2Base = static_cast<const uint8_t *>(pVect2v);
+    const float *params2 = reinterpret_cast<const float *>(pVect2Base + dimension);
+    const float min_val = params2[sq8::MIN_VAL];
+    const float delta = params2[sq8::DELTA];
+
+    // Get precomputed y_sum from query blob (stored after the dim floats)
+    const float y_sum = static_cast<const float *>(pVect1v)[dimension + sq8::SUM_QUERY];
+
+    // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i)
+    return min_val * y_sum + delta * quantized_dot;
 }
 
 template <unsigned char residual> // 0..15
@@ -100,7 +105,6 @@ float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v,
 
 template <unsigned char residual> // 0..15
 float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    // Calculate inner product using common implementation with normalization
-    float ip = SQ8_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension);
-    return 1.0f - ip;
+    // Cosine distance = 1 - IP (vectors are pre-normalized)
+    return SQ8_InnerProductSIMD16_AVX2_FMA<residual>(pVect1v, pVect2v, dimension);
 }
src/VecSim/spaces/IP/IP_AVX2_SQ8.h

Lines changed: 52 additions & 42 deletions
@@ -6,85 +6,96 @@
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
  */
+#pragma once
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
+#include "VecSim/types/sq8.h"
 
-static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256,
-                                       const __m256 &min_val_vec, const __m256 &delta_vec) {
-    // Load 8 float elements from pVect1
+using sq8 = vecsim_types::sq8;
+
+/*
+ * Optimized asymmetric SQ8 inner product using algebraic identity:
+ *
+ * IP(x, y) = Σ(x_i * y_i)
+ *          ≈ Σ((min + delta * q_i) * y_i)
+ *          = min * Σy_i + delta * Σ(q_i * y_i)
+ *          = min * y_sum + delta * quantized_dot_product
+ *
+ * where y_sum = Σy_i is precomputed and stored in the query blob.
+ * This avoids dequantization in the hot loop - we only compute Σ(q_i * y_i).
+ */
+
+// Helper: compute Σ(q_i * y_i) for 8 elements (no dequantization)
+static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2,
+                                       __m256 &sum256) {
+    // Load 8 float elements from query
     __m256 v1 = _mm256_loadu_ps(pVect1);
     pVect1 += 8;
 
-    // Load 8 uint8 elements from pVect2, convert to int32, then to float
-    __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+    // Load 8 uint8 elements and convert to float
+    __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2));
     pVect2 += 8;
 
-    // Zero-extend uint8 to int32
     __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
-
-    // Convert int32 to float
     __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
 
-    // Dequantize: (val * delta) + min_val
-    __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
-
-    // Compute dot product and add to sum
-    sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2_dequant));
+    // Accumulate q_i * y_i (no dequantization!)
+    // Using mul + add since this is the non-FMA version
+    sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v2_f, v1));
 }
 
 template <unsigned char residual> // 0..15
 float SQ8_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
     const float *pVect1 = static_cast<const float *>(pVect1v);
-    // pVect2 is a quantized uint8_t vector
     const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
     const float *pEnd1 = pVect1 + dimension;
 
-    // Get dequantization parameters from the end of quantized vector
-    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
-    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    // Create broadcast vectors for SIMD operations
-    __m256 min_val_vec = _mm256_set1_ps(min_val);
-    __m256 delta_vec = _mm256_set1_ps(delta);
-
+    // Initialize sum accumulator for Σ(q_i * y_i)
     __m256 sum256 = _mm256_setzero_ps();
 
-    // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one
-    // 16-float block, so mask loading is guaranteed to be safe.
+    // Handle residual elements first (0-7 elements)
     if constexpr (residual % 8) {
         __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
         __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
         pVect1 += residual % 8;
 
-        // Load quantized values and dequantize
-        __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+        // Load uint8 elements and convert to float
+        __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2));
         pVect2 += residual % 8;
 
-        // Zero-extend uint8 to int32
         __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
-
-        // Convert int32 to float
         __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
 
-        // Dequantize: (val * delta) + min_val
-        __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
-
-        // Compute dot product with masking
-        sum256 = _mm256_mul_ps(v1, v2_dequant);
+        // Compute q_i * y_i (no dequantization)
+        sum256 = _mm256_mul_ps(v1, v2_f);
     }
 
-    // If the reminder is >=8, have another step of 8 floats
+    // If the residual is >=8, have another step of 8 floats
     if constexpr (residual >= 8) {
-        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8(pVect1, pVect2, sum256);
     }
 
-    // We dealt with the residual part. We are left with some multiple of 16 floats.
-    // In each iteration we calculate 16 floats = 512 bits.
+    // Process remaining full chunks of 16 elements (2x8)
+    // Using do-while since dim > 16 guarantees at least one iteration
     do {
-        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
-        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8(pVect1, pVect2, sum256);
+        InnerProductStepSQ8(pVect1, pVect2, sum256);
     } while (pVect1 < pEnd1);
 
-    return my_mm256_reduce_add_ps(sum256);
+    // Reduce to get Σ(q_i * y_i)
+    float quantized_dot = my_mm256_reduce_add_ps(sum256);
+
+    // Get quantization parameters from stored vector (after quantized data)
+    const uint8_t *pVect2Base = static_cast<const uint8_t *>(pVect2v);
+    const float *params2 = reinterpret_cast<const float *>(pVect2Base + dimension);
+    const float min_val = params2[sq8::MIN_VAL];
+    const float delta = params2[sq8::DELTA];
+
+    // Get precomputed y_sum from query blob (stored after the dim floats)
+    const float y_sum = static_cast<const float *>(pVect1v)[dimension + sq8::SUM_QUERY];
+
+    // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i)
+    return min_val * y_sum + delta * quantized_dot;
 }
 
 template <unsigned char residual> // 0..15
@@ -95,6 +106,5 @@ float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size
 template <unsigned char residual> // 0..15
 float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
     // Calculate inner product using common implementation with normalization
-    float ip = SQ8_InnerProductImp_AVX2<residual>(pVect1v, pVect2v, dimension);
-    return 1.0f - ip;
+    return SQ8_InnerProductSIMD16_AVX2<residual>(pVect1v, pVect2v, dimension);
 }
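
The only functional difference between this header and the FMA variant above is the accumulate step: a fused _mm256_fmadd_ps versus separate _mm256_mul_ps + _mm256_add_ps. A minimal sketch (illustrative only, compile with something like -mavx2 -mfma) showing the two forms side by side; results match up to rounding:

// Contrast of the accumulate step used by the two AVX2 headers above.
#include <immintrin.h>
#include <cstdio>

int main() {
    __m256 a = _mm256_set1_ps(1.5f);
    __m256 b = _mm256_set1_ps(2.0f);
    __m256 acc0 = _mm256_setzero_ps();
    __m256 acc1 = _mm256_setzero_ps();

    acc0 = _mm256_fmadd_ps(a, b, acc0);              // FMA path (IP_AVX2_FMA_SQ8.h)
    acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(a, b)); // mul + add path (IP_AVX2_SQ8.h)

    float r0[8], r1[8];
    _mm256_storeu_ps(r0, acc0);
    _mm256_storeu_ps(r1, acc1);
    std::printf("fma: %f, mul+add: %f\n", r0[0], r1[0]);
    return 0;
}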
