Merged · 97 commits
746bf31
Add SQ8-to-SQ8 distance functions and optimizations
dor-forer Dec 28, 2025
8697a3e
Add SQ8-to-SQ8 benchmark tests and update related scripts
dor-forer Dec 28, 2025
e0ce268
Format
dor-forer Dec 28, 2025
ab6b077
Organizing
dor-forer Dec 28, 2025
931e339
Add full sq8 benchmarks
dor-forer Dec 28, 2025
a56474d
Optimize the SQ8-to-SQ8 distance functions
dor-forer Dec 28, 2025
a25f45c
Optimize SQ8 distance functions for NEON by reducing operations and i…
dor-forer Dec 28, 2025
0ad941e
format
dor-forer Dec 28, 2025
68cd068
Add NEON DOTPROD-optimized distance functions for SQ8-to-SQ8 calculat…
dor-forer Dec 28, 2025
0b4b568
PR
dor-forer Dec 28, 2025
d0fd2e4
Remove NEON DOTPROD-optimized distance functions for INT8, UINT8, and…
dor-forer Dec 28, 2025
9de6163
Fix vector layout documentation by removing inv_norm from comments in…
dor-forer Dec 28, 2025
63a46a1
Remove 'constexpr' from ones vector declaration in NEON inner product…
dor-forer Dec 28, 2025
101aa69
Add SQ8-to-SQ8 L2 squared distance functions with SIMD optimizations
dor-forer Dec 28, 2025
5bef023
Change the name
dor-forer Dec 28, 2025
72053af
Add full range tests for SQ8 distance functions with SIMD optimizations
dor-forer Dec 29, 2025
525f8da
Refactor distance functions to remove inv_norm parameter and update d…
dor-forer Dec 29, 2025
13a477b
Update SQ8 Cosine test to normalize both input vectors and adjust dis…
dor-forer Dec 29, 2025
c18000e
Rename 'compressed' to 'quantized' in SQ8 functions for clarity and c…
dor-forer Dec 29, 2025
b58f8ef
Merge branch 'dorer-sq8-dist-functions-ip-cosine' of https://github.c…
dor-forer Dec 29, 2025
286990a
Rename 'compressed' to 'quantized' in SQ8 distance tests for clarity
dor-forer Dec 29, 2025
8cdc3fc
Refactor quantization function to remove unused normalization calcula…
dor-forer Dec 29, 2025
189290e
Add TODO to store vector's norm and sum in L2 squared distance calcul…
dor-forer Dec 29, 2025
bbf810e
Implement SQ8-to-SQ8 distance functions with precomputed sum and norm…
dor-forer Dec 29, 2025
dbbb7d9
Add edge case tests for SQ8-to-SQ8 precomputed cosine distance functions
dor-forer Dec 29, 2025
36ab068
Refactor SQ8 test cases to use CreateSQ8QuantizedVector for vector po…
dor-forer Dec 29, 2025
00617d7
Implement SQ8-to-SQ8 precomputed distance functions using ARM NEON, S…
dor-forer Dec 29, 2025
4331d91
Implement SQ8-to-SQ8 precomputed inner product and cosine functions; …
dor-forer Dec 29, 2025
2e7b30d
Refactor SQ8 distance functions and remove precomputed variants
dor-forer Dec 30, 2025
a111e36
Refactor SQ8 distance functions and tests for improved clarity and co…
dor-forer Dec 30, 2025
d510b8a
Refactor SQ8 benchmarks by removing precomputed variants and updating…
dor-forer Dec 30, 2025
ee26740
Format
dor-forer Dec 30, 2025
afe1a4f
Remove serialization benchmark script for HNSW disk serialization
dor-forer Dec 30, 2025
a31f95c
Refactor SQ8 distance functions and tests to remove precomputed norm …
dor-forer Dec 31, 2025
f12ecf4
format
dor-forer Dec 31, 2025
0e36030
Merge branch 'main' of https://github.com/RedisAI/VectorSimilarity in…
dor-forer Dec 31, 2025
fdc16c6
Refactor SQ8 distance tests to use compressed vectors and improve nor…
dor-forer Dec 31, 2025
e5f519c
Update vector layout documentation to reflect removal of sum of squar…
dor-forer Dec 31, 2025
53f8e0e
Merge branch 'dorer-sq8-dist-functions-ip-cosine' of https://github.c…
dor-forer Jan 1, 2026
b12c796
Refactor L2 SQ8 distance computation to remove unused accumulators an…
dor-forer Jan 1, 2026
db1e671
Refactor SQ8 distance functions to remove norm computation
dor-forer Jan 1, 2026
d5b8587
Update SQ8-to-SQ8 distance function comment to remove norm reference
dor-forer Jan 1, 2026
91f48df
Refactor cosine similarity functions to remove unnecessary subtractio…
dor-forer Jan 1, 2026
0050bb9
Merge branch 'dorer-sq8-dist-functions-ip-cosine' of https://github.c…
dor-forer Jan 1, 2026
a75ddd6
Refactor L2 SQ8 distance functions to eliminate unused accumulators a…
dor-forer Jan 1, 2026
a37918b
Refactor SQ8 L2 and IP implementations to use common inner product fu…
dor-forer Jan 1, 2026
b660111
Refactor cosine similarity functions to use specific SIMD implementat…
dor-forer Jan 1, 2026
40ef6a3
Merge branch 'dorer-sq8-dist-functions-ip-cosine' of https://github.c…
dor-forer Jan 1, 2026
5a544db
Refactor L2 distance functions for SQ8 vectors to utilize common inne…
dor-forer Jan 1, 2026
9166cac
Refactor benchmark setup to allocate additional space for sum and sum…
dor-forer Jan 4, 2026
f28f4e7
Add CPU feature checks to disable optimizations for AArch64 in SQ8 di…
dor-forer Jan 4, 2026
e50dc45
Add CPU feature checks to disable optimizations for AArch64 in SQ8 di…
dor-forer Jan 4, 2026
d24ea8e
Merge branch 'dorer-sq8-dist-functions-ip-cosine' of https://github.c…
dor-forer Jan 4, 2026
6bbbc38
Fix formatting issues in SQ8 inner product function and clean up cond…
dor-forer Jan 4, 2026
7983b70
Refactor SQ8 distance functions and tests for improved readability an…
dor-forer Jan 4, 2026
7f4af80
Merge branch 'dorer-sq8-dist-functions-ip-cosine' of https://github.c…
dor-forer Jan 4, 2026
c6353cb
Refactor SQ8 L2Sqr tests to use quantized vectors and improve alignme…
dor-forer Jan 4, 2026
66a5f88
Enhance SQ8 Inner Product Implementations with Optimized Dot Product …
dor-forer Jan 4, 2026
d7972e9
Fix header guard duplication and update test assertion for floating-p…
dor-forer Jan 4, 2026
a8075bf
Add missing pragma once directive in NEON header files
dor-forer Jan 4, 2026
cddc497
Refactor SQ8 distance functions for improved performance and clarity
dor-forer Jan 4, 2026
4f0fec7
Update SQ8 vector population functions to include metadata and adjust…
dor-forer Jan 4, 2026
8ab4192
Refactor SQ8 inner product functions for improved clarity and perform…
dor-forer Jan 4, 2026
63f4e87
Merge branch 'dorer-sq8-dist-functions-ip-cosine' of https://github.c…
dor-forer Jan 4, 2026
5a52b79
Refactor L2 distance functions to utilize common inner product implem…
dor-forer Jan 4, 2026
8c59cb2
Rename inner product implementation functions for AVX2 and AVX512 for…
dor-forer Jan 4, 2026
a0796db
Merge branch 'dorer-sq8-dist-functions-ip-cosine' of https://github.c…
dor-forer Jan 4, 2026
a4ff5d0
Refactor SQ8 cosine function to utilize inner product function for im…
dor-forer Jan 4, 2026
c22158f
Remove redundant inner product edge case tests for SQ8 distance funct…
dor-forer Jan 4, 2026
4c19d9e
Add SVE2 support to SQ8-to-SQ8 Inner Product distance function
dor-forer Jan 4, 2026
e2ad287
Merge branch 'dorer-sq8-dist-functions-ip-cosine' of https://github.c…
dor-forer Jan 4, 2026
668315b
Fix SQ8_Cosine to call the correct inner product function for improve…
dor-forer Jan 4, 2026
5c22af8
Remove SVE2 and other optimizations from SQ8 cosine function test for…
dor-forer Jan 4, 2026
ad515ba
Merge branch 'dorer-sq8-dist-functions-ip-cosine' of https://github.c…
dor-forer Jan 4, 2026
695bbc0
Merge branch 'main' of https://github.com/RedisAI/VectorSimilarity in…
dor-forer Jan 5, 2026
cae2dd6
Add L2 distance function without optimizations for testing purposes
dor-forer Jan 5, 2026
b2506b9
Refactor L2 distance function and update test assertions for precision
dor-forer Jan 5, 2026
59784db
Update L2 squared distance functions to support 64 residuals in NEON …
dor-forer Jan 5, 2026
8d24786
Refactor L2 distance function conditions for NEON optimizations
dor-forer Jan 5, 2026
0dde4d5
Adjust NEON_DOTPROD benchmark initialization to use a dimension of 16
dor-forer Jan 5, 2026
3b38d8e
Update NEON benchmarks to support 64 dimensions for L2 and Cosine met…
dor-forer Jan 5, 2026
2e04721
Optimize SQ8 Inner Product Implementation
dor-forer Jan 6, 2026
d06a728
Refactor SQ8 inner product functions to clarify FMA usage and improve…
dor-forer Jan 6, 2026
84d315c
Merge branch 'main' of https://github.com/RedisAI/VectorSimilarity in…
dor-forer Jan 6, 2026
131b127
Update SQ8 test cases to improve alignment checks and adjust quantize…
dor-forer Jan 7, 2026
aeec78b
Add optimized SQ8 inner product implementation and update test cases
dor-forer Jan 7, 2026
f709f96
Merge branch 'main' of https://github.com/RedisAI/VectorSimilarity in…
dor-forer Jan 7, 2026
0d221f2
Fix pointer usage in SQ8 inner product implementation to reference or…
dor-forer Jan 7, 2026
d193fc5
Merge branch 'main' of https://github.com/RedisAI/VectorSimilarity in…
dor-forer Jan 7, 2026
fa2122d
Add sq8 type definition and update inner product implementations for …
dor-forer Jan 7, 2026
5506f55
Refactor SQ8 inner product implementations to use structured quantiza…
dor-forer Jan 7, 2026
8cbc649
Fix SQ8 EdgeCases test by adjusting vector size for constant vector test
dor-forer Jan 8, 2026
7b34dc4
Fix formatting in SQ8_EdgeCases test by adjusting vector initialization
dor-forer Jan 8, 2026
053411e
Merge branch 'main' of https://github.com/RedisAI/VectorSimilarity in…
dor-forer Jan 8, 2026
8186454
Refactor SQ8 inner product implementations to use precomputed y_sum f…
dor-forer Jan 8, 2026
37600e1
Fix formatting in SQ8_EdgeCases test for better readability
dor-forer Jan 8, 2026
77d03be
Refactor SQ8 cosine distance calculation to use optimized function
dor-forer Jan 8, 2026
72 changes: 45 additions & 27 deletions src/VecSim/spaces/IP/IP.cpp
@@ -16,39 +16,57 @@ using bfloat16 = vecsim_types::bfloat16;
 using float16 = vecsim_types::float16;
 using sq8 = vecsim_types::sq8;
 
-float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension,
-                                 float min_val, float delta) {
-    float res = 0;
-    for (size_t i = 0; i < dimension; i++) {
-        float dequantized_V2 = (pVect2v[i] * delta + min_val);
-        res += pVect1v[i] * dequantized_V2;
-    }
-    return res;
-}
-
+/*
+ * Optimized asymmetric SQ8 inner product using the algebraic identity:
+ * IP(x, y) = Σ(x_i * y_i)
+ *          ≈ Σ((min + delta * q_i) * y_i)
+ *          = min * Σy_i + delta * Σ(q_i * y_i)
+ *          = min * y_sum + delta * quantized_dot_product
+ *
+ * Uses 4x loop unrolling with multiple accumulators for ILP.
+ * pVect1 is a vector of float32, pVect2 is a quantized uint8_t vector.
+ */
 float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
     const auto *pVect1 = static_cast<const float *>(pVect1v);
     const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
-    // pVect2 is a vector of uint8_t, so we need to de-quantize it, normalize it and then multiply
-    // it. It is structured as [quantized values (uint8_t * dim)][min_val (float)][delta
-    // (float)]. The last two values are used to dequantize the vector.
-    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
-    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    // Compute inner product with dequantization
-    const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta);
-    return 1.0f - res;
+
+    // Use 4 accumulators for instruction-level parallelism
+    float sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
+
+    // Main loop: process 4 elements per iteration
+    size_t i = 0;
+    size_t dim4 = dimension & ~size_t(3); // dim4 is a multiple of 4
+    for (; i < dim4; i += 4) {
+        sum0 += pVect1[i + 0] * static_cast<float>(pVect2[i + 0]);
+        sum1 += pVect1[i + 1] * static_cast<float>(pVect2[i + 1]);
+        sum2 += pVect1[i + 2] * static_cast<float>(pVect2[i + 2]);
+        sum3 += pVect1[i + 3] * static_cast<float>(pVect2[i + 3]);
+    }
+
+    // Handle remainder (0-3 elements)
+    for (; i < dimension; i++) {
+        sum0 += pVect1[i] * static_cast<float>(pVect2[i]);
+    }
+
+    // Combine accumulators
+    float quantized_dot = (sum0 + sum1) + (sum2 + sum3);
+
+    // Get quantization parameters from the stored vector
+    const float *params = reinterpret_cast<const float *>(pVect2 + dimension);
+    const float min_val = params[sq8::MIN_VAL];
+    const float delta = params[sq8::DELTA];
+
+    // Get the precomputed y_sum from the query blob (stored after the dim floats)
+    const float y_sum = pVect1[dimension + sq8::SUM_QUERY];
+
+    // Apply the formula: IP = min * y_sum + delta * Σ(q_i * y_i)
+    const float ip = min_val * y_sum + delta * quantized_dot;
+    return 1.0f - ip;
 }
 
 float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    const auto *pVect1 = static_cast<const float *>(pVect1v);
-    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
-
-    // Get quantization parameters
-    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
-    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    // Compute inner product with dequantization
-    const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta);
-    return 1.0f - res;
+    return SQ8_InnerProduct(pVect1v, pVect2v, dimension);
 }
 
+// SQ8-to-SQ8: Common inner product implementation that returns the raw inner product value
Expand Down
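
The identity the rewritten kernel relies on is easy to check in isolation. Below is a minimal, self-contained C++ sketch, illustrative only and not the library's API: the inline quantizer is a hypothetical stand-in for the repo's CreateSQ8QuantizedVector helper, and the sample values are arbitrary. It computes the inner product both by dequantizing each element (the deleted FLOAT_INTEGER_InnerProduct path) and by the min * y_sum + delta * Σ(q_i * y_i) rearrangement:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const size_t dim = 8;
    std::vector<float> stored = {0.1f, -0.3f, 0.7f, 0.2f, -0.9f, 0.5f, 0.0f, 0.4f};
    std::vector<float> query = {0.2f, 0.1f, -0.4f, 0.6f, 0.3f, -0.2f, 0.8f, 0.05f};

    // Scalar quantization: q_i = round((x_i - min) / delta), so x_i ≈ min + delta * q_i
    float min_val = stored[0], max_val = stored[0];
    for (float v : stored) {
        min_val = v < min_val ? v : min_val;
        max_val = v > max_val ? v : max_val;
    }
    const float delta = (max_val - min_val) / 255.0f;
    std::vector<uint8_t> q(dim);
    for (size_t i = 0; i < dim; i++)
        q[i] = static_cast<uint8_t>((stored[i] - min_val) / delta + 0.5f);

    // Reference: dequantize inside the loop (the old per-element path)
    float ip_ref = 0;
    for (size_t i = 0; i < dim; i++)
        ip_ref += query[i] * (min_val + delta * q[i]);

    // Optimized: min * y_sum + delta * Σ(q_i * y_i), as in the new SQ8_InnerProduct
    float y_sum = 0, qdot = 0;
    for (size_t i = 0; i < dim; i++) {
        y_sum += query[i];
        qdot += static_cast<float>(q[i]) * query[i];
    }
    const float ip_opt = min_val * y_sum + delta * qdot;

    // The two agree up to floating-point rounding
    printf("reference = %f, optimized = %f\n", ip_ref, ip_opt);
    return 0;
}

Hoisting the min and delta terms out of the loop is what lets the SIMD kernels below stay in pure uint8-to-float multiply-accumulate, with dequantization applied once at the end.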
100 changes: 52 additions & 48 deletions src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h
@@ -6,91 +6,96 @@
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
  */
 #pragma once
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
+#include "VecSim/types/sq8.h"
+using sq8 = vecsim_types::sq8;
 
+/*
+ * Optimized asymmetric SQ8 inner product using the algebraic identity:
+ *
+ * IP(x, y) = Σ(x_i * y_i)
+ *          ≈ Σ((min + delta * q_i) * y_i)
+ *          = min * Σy_i + delta * Σ(q_i * y_i)
+ *          = min * y_sum + delta * quantized_dot_product
+ *
+ * where y_sum = Σy_i is precomputed and stored in the query blob.
+ * This avoids dequantization in the hot loop - we only compute Σ(q_i * y_i).
+ *
+ * This version uses FMA instructions for better performance.
+ */
+
+// Helper: compute Σ(q_i * y_i) for 8 elements using FMA (no dequantization)
 static inline void InnerProductStepSQ8_FMA(const float *&pVect1, const uint8_t *&pVect2,
-                                           __m256 &sum256, const __m256 &min_val_vec,
-                                           const __m256 &delta_vec) {
-    // Load 8 float elements from pVect1
+                                           __m256 &sum256) {
+    // Load 8 float elements from query
     __m256 v1 = _mm256_loadu_ps(pVect1);
     pVect1 += 8;
 
-    // Load 8 uint8 elements from pVect2, convert to int32, then to float
-    __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+    // Load 8 uint8 elements and convert to float
+    __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2));
     pVect2 += 8;
 
     // Zero-extend uint8 to int32
     __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
 
     // Convert int32 to float
     __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
 
-    // Dequantize and compute dot product in one step using FMA
-    // First, compute v2_dequant = v2_f * delta_vec + min_val_vec
-    __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);
-
-    // Then, compute sum256 += v1 * v2_dequant using FMA
-    sum256 = _mm256_fmadd_ps(v1, v2_dequant, sum256);
+    // Accumulate q_i * y_i using FMA (no dequantization!)
+    sum256 = _mm256_fmadd_ps(v2_f, v1, sum256);
 }
 
 template <unsigned char residual> // 0..15
 float SQ8_InnerProductImp_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
     const float *pVect1 = static_cast<const float *>(pVect1v);
     // pVect2 is a quantized uint8_t vector
     const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
     const float *pEnd1 = pVect1 + dimension;
 
-    // Get dequantization parameters from the end of quantized vector
-    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
-    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    // Create broadcast vectors for SIMD operations
-    __m256 min_val_vec = _mm256_set1_ps(min_val);
-    __m256 delta_vec = _mm256_set1_ps(delta);
-
+    // Initialize sum accumulator for Σ(q_i * y_i)
     __m256 sum256 = _mm256_setzero_ps();
 
-    // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one
-    // 16-float block, so mask loading is guaranteed to be safe.
+    // Handle residual elements first (0-7 elements)
     if constexpr (residual % 8) {
         __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
         __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
         pVect1 += residual % 8;
 
-        // Load quantized values and dequantize
-        __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+        // Load uint8 elements and convert to float
+        __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2));
         pVect2 += residual % 8;
 
         // Zero-extend uint8 to int32
         __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
 
         // Convert int32 to float
         __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
 
-        // Dequantize using FMA: (val * delta) + min_val
-        __m256 v2_dequant = _mm256_fmadd_ps(v2_f, delta_vec, min_val_vec);
-
-        // Compute dot product with masking
-        sum256 = _mm256_mul_ps(v1, v2_dequant);
+        // Compute q_i * y_i (no dequantization)
+        sum256 = _mm256_mul_ps(v1, v2_f);
     }
 
-    // If the reminder is >=8, have another step of 8 floats
+    // If the residual is >=8, have another step of 8 floats
     if constexpr (residual >= 8) {
-        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256);
    }
 
-    // We dealt with the residual part. We are left with some multiple of 16 floats.
-    // In each iteration we calculate 16 floats = 512 bits.
+    // Process remaining full chunks of 16 elements (2x8)
+    // Using do-while since dim > 16 guarantees at least one iteration
    do {
-        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
-        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256);
+        InnerProductStepSQ8_FMA(pVect1, pVect2, sum256);
    } while (pVect1 < pEnd1);
 
-    return my_mm256_reduce_add_ps(sum256);
+    // Reduce to get Σ(q_i * y_i)
+    float quantized_dot = my_mm256_reduce_add_ps(sum256);
+
+    // Get quantization parameters from the stored vector (after the quantized data)
+    const uint8_t *pVect2Base = static_cast<const uint8_t *>(pVect2v);
+    const float *params2 = reinterpret_cast<const float *>(pVect2Base + dimension);
+    const float min_val = params2[sq8::MIN_VAL];
+    const float delta = params2[sq8::DELTA];
+
+    // Get the precomputed y_sum from the query blob (stored after the dim floats)
+    const float y_sum = static_cast<const float *>(pVect1v)[dimension + sq8::SUM_QUERY];
+
+    // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i)
+    return min_val * y_sum + delta * quantized_dot;
 }
 
 template <unsigned char residual> // 0..15
@@ -100,7 +105,6 @@ float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v,
 
 template <unsigned char residual> // 0..15
 float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    // Calculate inner product using common implementation with normalization
-    float ip = SQ8_InnerProductImp_FMA<residual>(pVect1v, pVect2v, dimension);
-    return 1.0f - ip;
+    // Cosine distance = 1 - IP (vectors are pre-normalized)
+    return SQ8_InnerProductSIMD16_AVX2_FMA<residual>(pVect1v, pVect2v, dimension);
 }
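
The residual template parameter bakes dim % 16 into the kernel at compile time, so the hot path carries no per-call branching on the dimension; a variant is picked once when the index is set up. The sketch below shows how such a kernel family is typically dispatched. It is a hypothetical chooser for illustration only, assuming the SQ8_InnerProductSIMD16_AVX2_FMA template above; the repo's actual selection machinery may differ.

#include <array>
#include <cstddef>
#include <utility>

using DistFn = float (*)(const void *, const void *, size_t);

// Instantiate all 16 residual variants into a lookup table at compile time.
template <unsigned char... Rs>
static constexpr std::array<DistFn, sizeof...(Rs)>
MakeSQ8Table(std::integer_sequence<unsigned char, Rs...>) {
    return {SQ8_InnerProductSIMD16_AVX2_FMA<Rs>...};
}

// Pick the variant matching dim % 16 once, then reuse the returned pointer
// for every distance computation (dim is assumed to be > 16).
DistFn ChooseSQ8IPFunctionFMA(size_t dim) {
    static constexpr auto table =
        MakeSQ8Table(std::make_integer_sequence<unsigned char, 16>{});
    return table[dim % 16];
}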
94 changes: 52 additions & 42 deletions src/VecSim/spaces/IP/IP_AVX2_SQ8.h
@@ -6,85 +6,96 @@
  * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
  * GNU Affero General Public License v3 (AGPLv3).
  */
 #pragma once
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/AVX_utils.h"
+#include "VecSim/types/sq8.h"
+
+using sq8 = vecsim_types::sq8;
 
-static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2, __m256 &sum256,
-                                       const __m256 &min_val_vec, const __m256 &delta_vec) {
-    // Load 8 float elements from pVect1
+/*
+ * Optimized asymmetric SQ8 inner product using the algebraic identity:
+ *
+ * IP(x, y) = Σ(x_i * y_i)
+ *          ≈ Σ((min + delta * q_i) * y_i)
+ *          = min * Σy_i + delta * Σ(q_i * y_i)
+ *          = min * y_sum + delta * quantized_dot_product
+ *
+ * where y_sum = Σy_i is precomputed and stored in the query blob.
+ * This avoids dequantization in the hot loop - we only compute Σ(q_i * y_i).
+ */
+
+// Helper: compute Σ(q_i * y_i) for 8 elements (no dequantization)
+static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVect2,
+                                       __m256 &sum256) {
+    // Load 8 float elements from query
     __m256 v1 = _mm256_loadu_ps(pVect1);
     pVect1 += 8;
 
-    // Load 8 uint8 elements from pVect2, convert to int32, then to float
-    __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+    // Load 8 uint8 elements and convert to float
+    __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2));
     pVect2 += 8;
 
     // Zero-extend uint8 to int32
     __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
 
     // Convert int32 to float
     __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
 
-    // Dequantize: (val * delta) + min_val
-    __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
-
-    // Compute dot product and add to sum
-    sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v1, v2_dequant));
+    // Accumulate q_i * y_i (no dequantization!)
+    // Using mul + add since this is the non-FMA version
+    sum256 = _mm256_add_ps(sum256, _mm256_mul_ps(v2_f, v1));
 }
 
 template <unsigned char residual> // 0..15
 float SQ8_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
     const float *pVect1 = static_cast<const float *>(pVect1v);
     // pVect2 is a quantized uint8_t vector
     const uint8_t *pVect2 = static_cast<const uint8_t *>(pVect2v);
     const float *pEnd1 = pVect1 + dimension;
 
-    // Get dequantization parameters from the end of quantized vector
-    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
-    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
-    // Create broadcast vectors for SIMD operations
-    __m256 min_val_vec = _mm256_set1_ps(min_val);
-    __m256 delta_vec = _mm256_set1_ps(delta);
-
+    // Initialize sum accumulator for Σ(q_i * y_i)
     __m256 sum256 = _mm256_setzero_ps();
 
-    // Deal with 1-7 floats with mask loading, if needed. `dim` is >16, so we have at least one
-    // 16-float block, so mask loading is guaranteed to be safe.
+    // Handle residual elements first (0-7 elements)
     if constexpr (residual % 8) {
         __mmask8 constexpr mask = (1 << (residual % 8)) - 1;
         __m256 v1 = my_mm256_maskz_loadu_ps<mask>(pVect1);
         pVect1 += residual % 8;
 
-        // Load quantized values and dequantize
-        __m128i v2_128 = _mm_loadl_epi64((__m128i *)pVect2);
+        // Load uint8 elements and convert to float
+        __m128i v2_128 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(pVect2));
         pVect2 += residual % 8;
 
         // Zero-extend uint8 to int32
         __m256i v2_256 = _mm256_cvtepu8_epi32(v2_128);
 
         // Convert int32 to float
         __m256 v2_f = _mm256_cvtepi32_ps(v2_256);
 
-        // Dequantize: (val * delta) + min_val
-        __m256 v2_dequant = _mm256_add_ps(_mm256_mul_ps(v2_f, delta_vec), min_val_vec);
-
-        // Compute dot product with masking
-        sum256 = _mm256_mul_ps(v1, v2_dequant);
+        // Compute q_i * y_i (no dequantization)
+        sum256 = _mm256_mul_ps(v1, v2_f);
     }
 
-    // If the reminder is >=8, have another step of 8 floats
+    // If the residual is >=8, have another step of 8 floats
     if constexpr (residual >= 8) {
-        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8(pVect1, pVect2, sum256);
    }
 
-    // We dealt with the residual part. We are left with some multiple of 16 floats.
-    // In each iteration we calculate 16 floats = 512 bits.
+    // Process remaining full chunks of 16 elements (2x8)
+    // Using do-while since dim > 16 guarantees at least one iteration
    do {
-        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
-        InnerProductStepSQ8(pVect1, pVect2, sum256, min_val_vec, delta_vec);
+        InnerProductStepSQ8(pVect1, pVect2, sum256);
+        InnerProductStepSQ8(pVect1, pVect2, sum256);
    } while (pVect1 < pEnd1);
 
-    return my_mm256_reduce_add_ps(sum256);
+    // Reduce to get Σ(q_i * y_i)
+    float quantized_dot = my_mm256_reduce_add_ps(sum256);
+
+    // Get quantization parameters from the stored vector (after the quantized data)
+    const uint8_t *pVect2Base = static_cast<const uint8_t *>(pVect2v);
+    const float *params2 = reinterpret_cast<const float *>(pVect2Base + dimension);
+    const float min_val = params2[sq8::MIN_VAL];
+    const float delta = params2[sq8::DELTA];
+
+    // Get the precomputed y_sum from the query blob (stored after the dim floats)
+    const float y_sum = static_cast<const float *>(pVect1v)[dimension + sq8::SUM_QUERY];
+
+    // Apply the algebraic formula: IP = min * y_sum + delta * Σ(q_i * y_i)
+    return min_val * y_sum + delta * quantized_dot;
 }
 
 template <unsigned char residual> // 0..15
@@ -95,6 +106,5 @@ float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size
 
 template <unsigned char residual> // 0..15
 float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) {
     // Calculate inner product using common implementation with normalization
-    float ip = SQ8_InnerProductImp_AVX2<residual>(pVect1v, pVect2v, dimension);
-    return 1.0f - ip;
+    return SQ8_InnerProductSIMD16_AVX2<residual>(pVect1v, pVect2v, dimension);
 }
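
Both AVX2 variants read y_sum from the query blob at pVect1[dimension + sq8::SUM_QUERY], which implies the query side is extended with its element sum before any distance call. Below is a minimal sketch of that preprocessing step; the helper name is hypothetical and for illustration only (the PR's actual query preprocessing and the exact sq8 offset constants live elsewhere in the codebase), and it assumes sq8::SUM_QUERY addresses the first float slot after the dim values.

#include <cstddef>
#include <cstring>
#include <vector>

// Lay the query out as [float values (dim)][y_sum (float)], which is the
// shape the kernels above expect.
std::vector<float> PrepareSQ8QueryBlob(const float *query, size_t dim) {
    std::vector<float> blob(dim + 1);
    std::memcpy(blob.data(), query, dim * sizeof(float));
    float y_sum = 0.0f;
    for (size_t i = 0; i < dim; i++)
        y_sum += query[i];
    blob[dim] = y_sum; // consumed as pVect1[dim + sq8::SUM_QUERY]
    return blob;
}

Since the cosine kernels assume pre-normalized vectors, any normalization happens in the same preprocessing stage, which is why they can return the inner product distance directly.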