RedisAI · meiravgri · Jan 4, 2026 · Jan 4, 2026 · Jan 4, 2026 · Jan 4, 2026
diff --git a/src/VecSim/spaces/computer/preprocessors.h b/src/VecSim/spaces/computer/preprocessors.h
@@ -164,22 +164,33 @@ class CosinePreprocessor : public PreprocessorInterface {
  * Given quantized value q_i, the original value is reconstructed as:
  *   x_i ≈ min + delta * q_i
  *
+ * Query processing:
+ * The query vector is not quantized. It remains as DataType, but we precompute
+ * and store metric-specific values to accelerate asymmetric distance computation:
+ * - For IP/Cosine: y_sum = Σy_i (sum of query values)
+ * - For L2: y_sum_squares = Σy_i² (sum of squared query values)
+ *
+ * Query blob layout:
+ * | query_values[dim] | y_sum (IP/Cosine) OR y_sum_squares (L2) |
+ *
+ * Query blob size: (dim + 1) * sizeof(DataType)
+ *
  * === Asymmetric distance (storage x quantized, query y remains float) ===
  *
  * For IP/Cosine:
  *   IP(x, y) = Σ(x_i * y_i)
  *            ≈ Σ((min + delta * q_i) * y_i)
  *            = min * Σy_i + delta * Σ(q_i * y_i)
- *            = min * sum_query + delta * quantized_dot_product
- *   where sum_query = Σy_i is computed at query time.
+ *            = min * y_sum + delta * quantized_dot_product
+ *   where y_sum = Σy_i is precomputed and stored in the query blob.
  *
  * For L2:
  *   ||x - y||² = Σx_i² - 2*Σ(x_i * y_i) + Σy_i²
- *              = sum_squares - 2 * IP(x, y) + sum_sq_query
+ *              = x_sum_squares - 2 * IP(x, y) + y_sum_squares
  *   where:
- *     - sum_squares = Σx_i² is precomputed and stored
+ *     - x_sum_squares = Σx_i² is precomputed and stored in the storage blob
  *     - IP(x, y) is computed using the formula above
- *     - sum_sq_query = Σy_i² is computed at query time
+ *     - y_sum_squares = Σy_i² is precomputed and stored in the query blob
  *
  * === Symmetric distance (both x and y are quantized) ===
  *
@@ -208,7 +219,7 @@ class QuantPreprocessor : public PreprocessorInterface {
 
     // For L2:   store sum + sum_of_squares (2 extra values)
     // For IP/Cosine: store only sum (1 extra value)
-    static constexpr size_t extra_values_count = (Metric == VecSimMetric_L2) ? 2 : 1;
+    static constexpr size_t extra_storage_values_count = (Metric == VecSimMetric_L2) ? 2 : 1;
     static_assert(Metric == VecSimMetric_L2 || Metric == VecSimMetric_IP ||
                       Metric == VecSimMetric_Cosine,
                   "QuantPreprocessor only supports L2, IP and Cosine metrics");
@@ -227,21 +238,56 @@ class QuantPreprocessor : public PreprocessorInterface {
         const DataType inv_delta = DataType{1} / delta;
 
         // Compute sum (and sum of squares for L2) while quantizing
-        DataType sum = DataType{0};
-        DataType sum_squares = DataType{0};
+        // 4 independent accumulators (sum)
+        DataType s0{}, s1{}, s2{}, s3{};
+
+        // 4 independent accumulators (sum of squares), only used for L2
+        DataType q0{}, q1{}, q2{}, q3{};
+
+        size_t i = 0;
+        // round dim down to the nearest multiple of 4
+        size_t dim_round_down = this->dim & ~size_t(3);
 
         // Quantize the values
-        for (size_t i = 0; i < this->dim; i++) {
+        for (; i < dim_round_down; i += 4) {
+            // Load once
+            const DataType x0 = input[i + 0];
+            const DataType x1 = input[i + 1];
+            const DataType x2 = input[i + 2];
+            const DataType x3 = input[i + 3];
             // We know (input - min) => 0
             // If min == max, all values are the same and should be quantized to 0.
             // reconstruction will yield the same original value for all vectors.
-            quantized[i] = static_cast<OUTPUT_TYPE>(std::round((input[i] - min_val) * inv_delta));
+            quantized[i + 0] = static_cast<OUTPUT_TYPE>(std::round((x0 - min_val) * inv_delta));
+            quantized[i + 1] = static_cast<OUTPUT_TYPE>(std::round((x1 - min_val) * inv_delta));
+            quantized[i + 2] = static_cast<OUTPUT_TYPE>(std::round((x2 - min_val) * inv_delta));
+            quantized[i + 3] = static_cast<OUTPUT_TYPE>(std::round((x3 - min_val) * inv_delta));
 
             // Accumulate sum for all metrics
-            sum += input[i];
+            s0 += x0;
+            s1 += x1;
+            s2 += x2;
+            s3 += x3;
+
             // Accumulate sum of squares only for L2 metric
             if constexpr (Metric == VecSimMetric_L2) {
-                sum_squares += input[i] * input[i];
+                q0 += x0 * x0;
+                q1 += x1 * x1;
+                q2 += x2 * x2;
+                q3 += x3 * x3;
+            }
+        }
+
+        // Merge the 4 partial accumulators, then finish the 0..3 tail elements (same pass)
+        DataType sum = (s0 + s1) + (s2 + s3);
+        DataType sum_squares = (q0 + q1) + (q2 + q3);
+
+        for (; i < this->dim; ++i) {
+            const DataType x = input[i];
+            quantized[i] = static_cast<OUTPUT_TYPE>(std::round((x - min_val) * inv_delta));
+            sum += x;
+            if constexpr (Metric == VecSimMetric_L2) {
+                sum_squares += x * x;
             }
         }
 
@@ -258,11 +304,56 @@ class QuantPreprocessor : public PreprocessorInterface {
         }
     }
 
+    // Returns the sum of the first `dim` elements of p. Four separate partial sums are
+    // kept, so the floating-point summation order differs from a plain left-to-right loop.
+    DataType sum_fast(const DataType *p) const {
+        // Largest multiple of 4 that does not exceed dim.
+        const size_t unrolled_end = this->dim & ~size_t(3);
+        DataType acc0{}, acc1{}, acc2{}, acc3{};
+        size_t idx = 0;
+        for (; idx < unrolled_end; idx += 4) {
+            acc0 += p[idx + 0];
+            acc1 += p[idx + 1];
+            acc2 += p[idx + 2];
+            acc3 += p[idx + 3];
+        }
+
+        // Merge the partial sums pairwise, then add the 0..3 leftover elements.
+        DataType total = (acc0 + acc1) + (acc2 + acc3);
+        for (; idx < this->dim; ++idx) {
+            total += p[idx];
+        }
+        return total;
+    }
+
+    // Returns the sum of squares of the first `dim` elements of p. Four separate partial
+    // sums are kept, so the summation order differs from a plain left-to-right loop.
+    DataType sum_squares_fast(const DataType *p) const {
+        // Largest multiple of 4 that does not exceed dim.
+        const size_t unrolled_end = this->dim & ~size_t(3);
+        DataType acc0{}, acc1{}, acc2{}, acc3{};
+        size_t idx = 0;
+        for (; idx < unrolled_end; idx += 4) {
+            acc0 += p[idx + 0] * p[idx + 0];
+            acc1 += p[idx + 1] * p[idx + 1];
+            acc2 += p[idx + 2] * p[idx + 2];
+            acc3 += p[idx + 3] * p[idx + 3];
+        }
+
+        // Merge the partial sums pairwise, then add the 0..3 leftover squares.
+        DataType total = (acc0 + acc1) + (acc2 + acc3);
+        for (; idx < this->dim; ++idx) {
+            total += p[idx] * p[idx];
+        }
+        return total;
+    }
+
 public:
     QuantPreprocessor(std::shared_ptr<VecSimAllocator> allocator, size_t dim)
         : PreprocessorInterface(allocator), dim(dim),
           storage_bytes_count(dim * sizeof(OUTPUT_TYPE) +
-                              (2 + extra_values_count) * sizeof(DataType)) {
+                              (2 + extra_storage_values_count) * sizeof(DataType)),
+          query_bytes_count((dim + 1) * sizeof(DataType)) {
         static_assert(std::is_floating_point_v<DataType>,
                       "QuantPreprocessor only supports floating-point types");
     }
@@ -274,20 +365,21 @@ class QuantPreprocessor : public PreprocessorInterface {
     }
 
     /**
-     * Quantizes the storage blob (DataType → OUTPUT_TYPE) while leaving the query blob unchanged.
+     * Preprocesses the original blob into separate storage and query blobs.
      *
      * Storage vectors are quantized to uint8_t values, with metadata (min, delta, sum, and
-     * sum_squares for L2) appended for distance reconstruction. Query vectors remain as DataType
-     * for asymmetric distance computation.
+     * sum_squares for L2) appended for distance reconstruction.
      *
-     * Note: query_blob and query_blob_size are not modified, nor allocated by this function.
+     * Query vectors remain as DataType for asymmetric distance computation, with a precomputed
+     * sum (for IP/Cosine) or sum of squares (for L2) appended for efficient distance calculation.
      *
      * Possible scenarios (currently only CASE 1 is implemented):
-     * - CASE 1: STORAGE BLOB NEEDS ALLOCATION (storage_blob == nullptr)
+     * - CASE 1: STORAGE BLOB AND QUERY BLOB NEED ALLOCATION (storage_blob == query_blob == nullptr)
      * - CASE 2: STORAGE BLOB EXISTS (storage_blob != nullptr)
      *   - CASE 2A: STORAGE BLOB EXISTS and its size is insufficient
      * (storage_blob_size < required_size) - reallocate storage
-     *   - CASE 2B: STORAGE AND QUERY SHARE MEMORY (storage_blob == query_blob) - reallocate storage
+     *   - CASE 2B: STORAGE AND QUERY SHARE MEMORY (storage_blob == query_blob != nullptr) -
+     * reallocate storage
      *   - CASE 2C: SEPARATE STORAGE AND QUERY BLOBS (storage_blob != query_blob) - quantize storage
      * in-place
      */
@@ -296,6 +388,10 @@ class QuantPreprocessor : public PreprocessorInterface {
                     unsigned char alignment) const override {
         // CASE 1: STORAGE BLOB NEEDS ALLOCATION - the only implemented case
         assert(!storage_blob && "CASE 1: storage_blob must be nullptr");
+        assert(!query_blob && "CASE 1: query_blob must be nullptr");
+
+        // storage_blob_size and query_blob_size must point to different memory slots.
+        assert(&storage_blob_size != &query_blob_size);
 
         // CASE 2A: STORAGE BLOB EXISTS and its size is insufficient - not implemented
         // storage_blob && storage_blob_size < required_size
@@ -307,15 +403,8 @@ class QuantPreprocessor : public PreprocessorInterface {
         // We can quantize the storage blob in-place (if we already checked storage_blob_size is
         // sufficient)
 
-        // Allocate aligned memory for the quantized storage blob
-        storage_blob = static_cast<OUTPUT_TYPE *>(
-            this->allocator->allocate_aligned(this->storage_bytes_count, alignment));
-
-        // Quantize directly from original data
-        const DataType *input = static_cast<const DataType *>(original_blob);
-        quantize(input, static_cast<OUTPUT_TYPE *>(storage_blob));
-
-        storage_blob_size = this->storage_bytes_count;
+        preprocessForStorage(original_blob, storage_blob, storage_blob_size);
+        preprocessQuery(original_blob, query_blob, query_blob_size, alignment);
     }
 
     void preprocessForStorage(const void *original_blob, void *&blob,
@@ -331,9 +420,33 @@ class QuantPreprocessor : public PreprocessorInterface {
         input_blob_size = storage_bytes_count;
     }
 
+    /**
+     * Preprocesses the query vector for asymmetric distance computation.
+     *
+     * The query blob contains the original float values followed by a precomputed value:
+     * - For IP/Cosine: y_sum = Σy_i (sum of query values)
+     * - For L2: y_sum_squares = Σy_i² (sum of squared query values)
+     *
+     * Query blob layout: | query_values[dim] | y_sum OR y_sum_squares |
+     * Query blob size: (dim + 1) * sizeof(DataType)
+     */
     void preprocessQuery(const void *original_blob, void *&blob, size_t &query_blob_size,
                          unsigned char alignment) const override {
-        // No-op: queries remain as original DataType
+        assert(!blob && "query_blob must be nullptr");
+
+        // Allocate aligned memory for the query blob
+        blob = this->allocator->allocate_aligned(this->query_bytes_count, alignment);
+        memcpy(blob, original_blob, this->dim * sizeof(DataType));
+        const DataType *input = static_cast<const DataType *>(original_blob);
+        // For IP/Cosine, we need to store the sum of the query vector.
+        if constexpr (Metric == VecSimMetric_IP || Metric == VecSimMetric_Cosine) {
+            static_cast<DataType *>(blob)[this->dim] = sum_fast(input);
+        } // For L2, compute the sum of squares.
+        else if constexpr (Metric == VecSimMetric_L2) {
+            static_cast<DataType *>(blob)[this->dim] = sum_squares_fast(input);
+        }
+
+        query_blob_size = this->query_bytes_count;
     }
 
     void preprocessStorageInPlace(void *original_blob, size_t input_blob_size) const override {
@@ -353,4 +466,5 @@ class QuantPreprocessor : public PreprocessorInterface {
 
     const size_t dim;
     const size_t storage_bytes_count;
+    const size_t query_bytes_count;
 };