Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
746bf31
Add SQ8-to-SQ8 distance functions and optimizations
dor-forer Dec 28, 2025
8697a3e
Add SQ8-to-SQ8 benchmark tests and update related scripts
dor-forer Dec 28, 2025
e0ce268
Format
dor-forer Dec 28, 2025
ab6b077
Organizing
dor-forer Dec 28, 2025
931e339
Add full sq8 benchmarks
dor-forer Dec 28, 2025
a56474d
Optimize the sq8 sq8
dor-forer Dec 28, 2025
a25f45c
Optimize SQ8 distance functions for NEON by reducing operations and i…
dor-forer Dec 28, 2025
0ad941e
format
dor-forer Dec 28, 2025
68cd068
Add NEON DOTPROD-optimized distance functions for SQ8-to-SQ8 calculat…
dor-forer Dec 28, 2025
0b4b568
PR
dor-forer Dec 28, 2025
d0fd2e4
Remove NEON DOTPROD-optimized distance functions for INT8, UINT8, and…
dor-forer Dec 28, 2025
9de6163
Fix vector layout documentation by removing inv_norm from comments in…
dor-forer Dec 28, 2025
63a46a1
Remove 'constexpr' from ones vector declaration in NEON inner product…
dor-forer Dec 28, 2025
525f8da
Refactor distance functions to remove inv_norm parameter and update d…
dor-forer Dec 29, 2025
13a477b
Update SQ8 Cosine test to normalize both input vectors and adjust dis…
dor-forer Dec 29, 2025
c18000e
Rename 'compressed' to 'quantized' in SQ8 functions for clarity and c…
dor-forer Dec 29, 2025
bbf810e
Implement SQ8-to-SQ8 distance functions with precomputed sum and norm…
dor-forer Dec 29, 2025
dbbb7d9
Add edge case tests for SQ8-to-SQ8 precomputed cosine distance functions
dor-forer Dec 29, 2025
36ab068
Refactor SQ8 test cases to use CreateSQ8QuantizedVector for vector po…
dor-forer Dec 29, 2025
00617d7
Implement SQ8-to-SQ8 precomputed distance functions using ARM NEON, S…
dor-forer Dec 29, 2025
4331d91
Implement SQ8-to-SQ8 precomputed inner product and cosine functions; …
dor-forer Dec 29, 2025
2e7b30d
Refactor SQ8 distance functions and remove precomputed variants
dor-forer Dec 30, 2025
a111e36
Refactor SQ8 distance functions and tests for improved clarity and co…
dor-forer Dec 30, 2025
d510b8a
Refactor SQ8 benchmarks by removing precomputed variants and updating…
dor-forer Dec 30, 2025
ee26740
format
dor-forer Dec 30, 2025
afe1a4f
Remove serialization benchmark script for HNSW disk serialization
dor-forer Dec 30, 2025
a31f95c
Refactor SQ8 distance functions and tests to remove precomputed norm …
dor-forer Dec 31, 2025
f12ecf4
format
dor-forer Dec 31, 2025
0e36030
Merge branch 'main' of https://github.com/RedisAI/VectorSimilarity in…
dor-forer Dec 31, 2025
fdc16c6
Refactor SQ8 distance tests to use compressed vectors and improve nor…
dor-forer Dec 31, 2025
e5f519c
Update vector layout documentation to reflect removal of sum of squar…
dor-forer Dec 31, 2025
db1e671
Refactor SQ8 distance functions to remove norm computation
dor-forer Jan 1, 2026
d5b8587
Update SQ8-to-SQ8 distance function comment to remove norm reference
dor-forer Jan 1, 2026
91f48df
Refactor cosine similarity functions to remove unnecessary subtractio…
dor-forer Jan 1, 2026
b660111
Refactor cosine similarity functions to use specific SIMD implementat…
dor-forer Jan 1, 2026
9166cac
Refactor benchmark setup to allocate additional space for sum and sum…
dor-forer Jan 4, 2026
f28f4e7
Add CPU feature checks to disable optimizations for AArch64 in SQ8 di…
dor-forer Jan 4, 2026
e50dc45
Add CPU feature checks to disable optimizations for AArch64 in SQ8 di…
dor-forer Jan 4, 2026
6bbbc38
Fix formatting issues in SQ8 inner product function and clean up cond…
dor-forer Jan 4, 2026
66a5f88
Enhance SQ8 Inner Product Implementations with Optimized Dot Product …
dor-forer Jan 4, 2026
d7972e9
Fix header guard duplication and update test assertion for floating-p…
dor-forer Jan 4, 2026
a8075bf
Add missing pragma once directive in NEON header files
dor-forer Jan 4, 2026
cddc497
Refactor SQ8 distance functions for improved performance and clarity
dor-forer Jan 4, 2026
4f0fec7
Update SQ8 vector population functions to include metadata and adjust…
dor-forer Jan 4, 2026
8ab4192
Refactor SQ8 inner product functions for improved clarity and perform…
dor-forer Jan 4, 2026
8c59cb2
Rename inner product implementation functions for AVX2 and AVX512 for…
dor-forer Jan 4, 2026
a4ff5d0
Refactor SQ8 cosine function to utilize inner product function for im…
dor-forer Jan 4, 2026
c22158f
Remove redundant inner product edge case tests for SQ8 distance funct…
dor-forer Jan 4, 2026
4c19d9e
Add SVE2 support to SQ8-to-SQ8 Inner Product distance function
dor-forer Jan 4, 2026
5c22af8
Remove SVE2 and other optimizations from SQ8 cosine function test for…
dor-forer Jan 4, 2026
9e50d7c
Update NEON benchmarks to use a vector size of 64 for SQ8-to-SQ8 func…
dor-forer Jan 4, 2026
2e57cf2
Increase allocated space for cosine calculations in SQ8 benchmark setup
dor-forer Jan 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions src/VecSim/spaces/IP/IP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,53 @@ float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - res;
}

// SQ8-to-SQ8: Both vectors are uint8 quantized.
// Layout of each vector: [uint8 codes (dim)] [min_val (float)] [delta (float)] ...
// Dequantization: value = code * delta + min_val.
// Returns the inner-product distance 1 - <v1, v2>.
float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const uint8_t *codes1 = static_cast<const uint8_t *>(pVect1v);
    const uint8_t *codes2 = static_cast<const uint8_t *>(pVect2v);

    // Quantization parameters are stored right after the dim uint8 codes.
    const float *params1 = reinterpret_cast<const float *>(codes1 + dimension);
    const float *params2 = reinterpret_cast<const float *>(codes2 + dimension);
    const float min_val1 = params1[0];
    const float delta1 = params1[1];
    const float min_val2 = params2[0];
    const float delta2 = params2[1];

    // Dequantize both sides element-by-element and accumulate the dot product.
    float dot = 0.0f;
    for (size_t i = 0; i != dimension; ++i) {
        const float a = codes1[i] * delta1 + min_val1;
        const float b = codes2[i] * delta2 + min_val2;
        dot += a * b;
    }
    return 1.0f - dot;
}

// SQ8-to-SQ8: Both vectors are uint8 quantized (cosine version).
// Layout of each vector: [uint8 codes (dim)] [min_val (float)] [delta (float)] ...
// Both vectors are assumed to be normalized, so the raw inner product of the
// dequantized values is the cosine similarity and the distance is 1 - ip.
float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const uint8_t *quant1 = static_cast<const uint8_t *>(pVect1v);
    const uint8_t *quant2 = static_cast<const uint8_t *>(pVect2v);

    // Per-vector quantization metadata lives immediately after the codes.
    const float *meta1 = reinterpret_cast<const float *>(quant1 + dimension);
    const float *meta2 = reinterpret_cast<const float *>(quant2 + dimension);
    const float min_val1 = meta1[0];
    const float delta1 = meta1[1];
    const float min_val2 = meta2[0];
    const float delta2 = meta2[1];

    // Accumulate the dot product of the dequantized elements.
    float ip = 0.0f;
    for (size_t i = 0; i < dimension; ++i) {
        ip += (quant1[i] * delta1 + min_val1) * (quant2[i] * delta2 + min_val2);
    }
    // Assume both vectors are normalized.
    return 1.0f - ip;
}

float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension) {
auto *vec1 = (float *)pVect1;
auto *vec2 = (float *)pVect2;
Expand Down
6 changes: 6 additions & 0 deletions src/VecSim/spaces/IP/IP.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,12 @@ float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimensio
// pVect1v vector of type fp32 and pVect2v vector of type uint8
float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension);

// SQ8-to-SQ8: Both vectors are uint8 quantized
float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension);

// SQ8-to-SQ8: Both vectors are uint8 quantized and normalized
float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension);

float FP32_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);

double FP64_InnerProduct(const void *pVect1, const void *pVect2, size_t dimension);
Expand Down
6 changes: 3 additions & 3 deletions src/VecSim/spaces/IP/IP_AVX2_SQ8.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVe
}

template <unsigned char residual> // 0..15
float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) {
float SQ8_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
const float *pVect1 = static_cast<const float *>(pVect1v);
// pVect2 is a quantized uint8_t vector
const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
Expand Down Expand Up @@ -89,7 +89,7 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen

template <unsigned char residual> // 0..15
float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
return 1.0f - SQ8_InnerProductImp_AVX2<residual>(pVect1v, pVect2v, dimension);
}

template <unsigned char residual> // 0..15
Expand All @@ -99,7 +99,7 @@ float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dim
const float inv_norm = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));

// Calculate inner product using common implementation with normalization
float ip = SQ8_InnerProductImp<residual>(pVect1v, pVect2v, dimension);
float ip = SQ8_InnerProductImp_AVX2<residual>(pVect1v, pVect2v, dimension);

// For cosine, we need to account for the vector norms
// The inv_norm parameter is stored after min_val and delta in the quantized vector
Expand Down
9 changes: 4 additions & 5 deletions src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,7 @@ static inline void SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVe

// Common implementation for both inner product and cosine similarity
template <unsigned char residual> // 0..15
float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension,
float inv_norm = 1.0f) {
float SQ8_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) {
const float *pVec1 = static_cast<const float *>(pVec1v);
const uint8_t *pVec2 = static_cast<const uint8_t *>(pVec2v);
const float *pEnd1 = pVec1 + dimension;
Expand Down Expand Up @@ -92,7 +91,7 @@ template <unsigned char residual> // 0..15
float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v,
size_t dimension) {
// Calculate inner product using common implementation
float ip = SQ8_InnerProductImp<residual>(pVec1v, pVec2v, dimension);
float ip = SQ8_InnerProductImp_AVX512<residual>(pVec1v, pVec2v, dimension);

// The inner product similarity is 1 - ip
return 1.0f - ip;
Expand All @@ -106,8 +105,8 @@ float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v
const float inv_norm = *reinterpret_cast<const float *>(pVec2 + dimension + 2 * sizeof(float));

// Calculate inner product using common implementation with normalization
float ip = SQ8_InnerProductImp<residual>(pVec1v, pVec2v, dimension, inv_norm);
float ip = SQ8_InnerProductImp_AVX512<residual>(pVec1v, pVec2v, dimension);

// The cosine similarity is 1 - ip
return 1.0f - ip;
return 1.0f - ip * inv_norm;
}
136 changes: 136 additions & 0 deletions src/VecSim/spaces/IP/IP_AVX512F_SQ8_SQ8_BW_VL_VNNI.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include <immintrin.h>

/**
* SQ8-to-SQ8 distance functions.
* These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors,
* where BOTH vectors are uint8 quantized and dequantization is applied to both
* during computation.
*
* Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [inv_norm (float)]
* Dequantization formula: dequantized_value = quantized_value * delta + min_val
*/

// Helper: one 16-element inner-product step with dual dequantization.
// Widens 16 uint8 codes per side to float, dequantizes each side with its own
// (delta, min) pair via FMA, accumulates the products into `sum`, and advances
// both pointers by 16.
static inline void SQ8_SQ8_InnerProductStep(const uint8_t *&pVec1, const uint8_t *&pVec2,
                                            __m512 &sum, const __m512 &min_val_vec1,
                                            const __m512 &delta_vec1, const __m512 &min_val_vec2,
                                            const __m512 &delta_vec2) {
    // Load 16 uint8 codes from each input and widen them to 32-bit float lanes.
    const __m512 f1 = _mm512_cvtepi32_ps(
        _mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec1))));
    const __m512 f2 = _mm512_cvtepi32_ps(
        _mm512_cvtepu8_epi32(_mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec2))));

    // Dequantize both sides: value = code * delta + min_val.
    const __m512 dq1 = _mm512_fmadd_ps(f1, delta_vec1, min_val_vec1);
    const __m512 dq2 = _mm512_fmadd_ps(f2, delta_vec2, min_val_vec2);

    // Accumulate: sum += dq1 * dq2.
    sum = _mm512_fmadd_ps(dq1, dq2, sum);

    // Advance both cursors past the processed chunk.
    pVec1 += 16;
    pVec2 += 16;
}

// Common implementation for inner product between two SQ8 vectors.
// Layout of each vector: [uint8 codes (dim)] [min_val (float)] [delta (float)] [inv_norm (float)]
// Dequantization: value = code * delta + min_val.
// NOTE(review): the trailing do-while always executes at least once, so this kernel
// assumes dimension >= 16 (at least one full 16-lane chunk remains after the residual)
// — confirm the dispatcher only selects this function for such dimensions.
template <unsigned char residual> // 0..15
float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension) {
    const uint8_t *pVec1 = static_cast<const uint8_t *>(pVec1v);
    const uint8_t *pVec2 = static_cast<const uint8_t *>(pVec2v);
    const uint8_t *pEnd1 = pVec1 + dimension;

    // Get dequantization parameters from the end of pVec1.
    // NOTE(review): these float reads are unaligned whenever dimension % 4 != 0 —
    // presumably the vector allocator guarantees this is safe on x86; confirm.
    const float min_val1 = *reinterpret_cast<const float *>(pVec1 + dimension);
    const float delta1 = *reinterpret_cast<const float *>(pVec1 + dimension + sizeof(float));

    // Get dequantization parameters from the end of pVec2.
    const float min_val2 = *reinterpret_cast<const float *>(pVec2 + dimension);
    const float delta2 = *reinterpret_cast<const float *>(pVec2 + dimension + sizeof(float));

    // Broadcast each vector's (min, delta) across all 16 lanes for FMA dequantization.
    __m512 min_val_vec1 = _mm512_set1_ps(min_val1);
    __m512 delta_vec1 = _mm512_set1_ps(delta1);
    __m512 min_val_vec2 = _mm512_set1_ps(min_val2);
    __m512 delta_vec2 = _mm512_set1_ps(delta2);

    // Initialize sum accumulator.
    __m512 sum = _mm512_setzero_ps();

    // Deal with the residual (dimension % 16) elements first so the main loop
    // below can run on exact 16-element chunks.
    if constexpr (residual > 0) {
        // Lane mask selecting only the first `residual` lanes.
        __mmask16 mask = (1U << residual) - 1;

        // Load and convert v1 elements. The full 16-byte load intentionally reads past
        // the residual codes into the trailing metadata floats.
        // NOTE(review): with only 12 metadata bytes after the codes, residual < 4 means
        // this load touches up to 3 bytes beyond the vector blob — verify the
        // allocation is padded accordingly.
        __m128i v1_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec1));
        __m512i v1_512 = _mm512_cvtepu8_epi32(v1_128);
        __m512 v1_f = _mm512_cvtepi32_ps(v1_512);

        // Dequantize v1.
        __m512 v1_dequant = _mm512_fmadd_ps(v1_f, delta_vec1, min_val_vec1);

        // Load and convert v2 elements (same over-read caveat as v1).
        __m128i v2_128 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(pVec2));
        __m512i v2_512 = _mm512_cvtepu8_epi32(v2_128);
        __m512 v2_f = _mm512_cvtepi32_ps(v2_512);

        // Dequantize v2.
        __m512 v2_dequant = _mm512_fmadd_ps(v2_f, delta_vec2, min_val_vec2);

        // Keep only the valid lanes of the product; garbage lanes are zeroed.
        // (sum was zero, so a masked move — not an add — is correct here.)
        __m512 product = _mm512_mul_ps(v1_dequant, v2_dequant);
        sum = _mm512_maskz_mov_ps(mask, product);

        pVec1 += residual;
        pVec2 += residual;
    }

    // Process remaining full chunks of 16 elements.
    do {
        SQ8_SQ8_InnerProductStep(pVec1, pVec2, sum, min_val_vec1, delta_vec1, min_val_vec2,
                                 delta_vec2);
    } while (pVec1 < pEnd1);

    // Horizontal sum of all 16 lanes.
    return _mm512_reduce_add_ps(sum);
}

// SQ8-to-SQ8 Inner Product distance function.
// Assumes both vectors are normalized.
// Converts the raw inner product to distance form: 1 - inner_product.
template <unsigned char residual> // 0..15
float SQ8_SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v,
                                                    size_t dimension) {
    return 1.0f - SQ8_SQ8_InnerProductImp<residual>(pVec1v, pVec2v, dimension);
}

// SQ8-to-SQ8 Cosine distance function.
// Assumes both vectors are normalized, so the inner product of the dequantized
// values is the cosine similarity; the distance is therefore 1 - inner_product.
template <unsigned char residual> // 0..15
float SQ8_SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v,
                                              size_t dimension) {
    return 1.0f - SQ8_SQ8_InnerProductImp<residual>(pVec1v, pVec2v, dimension);
}
Loading
Loading