diff --git a/src/VecSim/spaces/computer/preprocessors.h b/src/VecSim/spaces/computer/preprocessors.h
index 15e5a4716..9dd3d6e1a 100644
--- a/src/VecSim/spaces/computer/preprocessors.h
+++ b/src/VecSim/spaces/computer/preprocessors.h
@@ -164,22 +164,33 @@ class CosinePreprocessor : public PreprocessorInterface {
  * Given quantized value q_i, the original value is reconstructed as:
  * x_i ≈ min + delta * q_i
  *
+ * Query processing:
+ * The query vector is not quantized. It remains as DataType, but we precompute
+ * and store metric-specific values to accelerate asymmetric distance computation:
+ * - For IP/Cosine: y_sum = Σy_i (sum of query values)
+ * - For L2: y_sum_squares = Σy_i² (sum of squared query values)
+ *
+ * Query blob layout:
+ * | query_values[dim] | y_sum (IP/Cosine) OR y_sum_squares (L2) |
+ *
+ * Query blob size: (dim + 1) * sizeof(DataType)
+ *
  * === Asymmetric distance (storage x quantized, query y remains float) ===
  *
  * For IP/Cosine:
  *   IP(x, y) = Σ(x_i * y_i)
  *            ≈ Σ((min + delta * q_i) * y_i)
  *            = min * Σy_i + delta * Σ(q_i * y_i)
- *            = min * sum_query + delta * quantized_dot_product
- *   where sum_query = Σy_i is computed at query time.
+ *            = min * y_sum + delta * quantized_dot_product
+ *   where y_sum = Σy_i is precomputed and stored in the query blob.
  *
  * For L2:
  *   ||x - y||² = Σx_i² - 2*Σ(x_i * y_i) + Σy_i²
- *              = sum_squares - 2 * IP(x, y) + sum_sq_query
+ *              = x_sum_squares - 2 * IP(x, y) + y_sum_squares
  *   where:
- *   - sum_squares = Σx_i² is precomputed and stored
+ *   - x_sum_squares = Σx_i² is precomputed and stored in the storage blob
  *   - IP(x, y) is computed using the formula above
- *   - sum_sq_query = Σy_i² is computed at query time
+ *   - y_sum_squares = Σy_i² is precomputed and stored in the query blob
  *
  * === Symmetric distance (both x and y are quantized) ===
  *
@@ -208,7 +219,7 @@ class QuantPreprocessor : public PreprocessorInterface {
 
     // For L2: store sum + sum_of_squares (2 extra values)
     // For IP/Cosine: store only sum (1 extra value)
-    static constexpr size_t extra_values_count = (Metric == VecSimMetric_L2) ? 2 : 1;
+    static constexpr size_t extra_storage_values_count = (Metric == VecSimMetric_L2) ? 2 : 1;
     static_assert(Metric == VecSimMetric_L2 || Metric == VecSimMetric_IP ||
                       Metric == VecSimMetric_Cosine,
                   "QuantPreprocessor only supports L2, IP and Cosine metrics");
@@ -227,21 +238,56 @@ class QuantPreprocessor : public PreprocessorInterface {
         const DataType inv_delta = DataType{1} / delta;
 
         // Compute sum (and sum of squares for L2) while quantizing
-        DataType sum = DataType{0};
-        DataType sum_squares = DataType{0};
+        // 4 independent accumulators (sum)
+        DataType s0{}, s1{}, s2{}, s3{};
+
+        // 4 independent accumulators (sum of squares), only used for L2
+        DataType q0{}, q1{}, q2{}, q3{};
+
+        size_t i = 0;
+        // round dim down to the nearest multiple of 4
+        size_t dim_round_down = this->dim & ~size_t(3);
 
         // Quantize the values
-        for (size_t i = 0; i < this->dim; i++) {
+        for (; i < dim_round_down; i += 4) {
+            // Load once
+            const DataType x0 = input[i + 0];
+            const DataType x1 = input[i + 1];
+            const DataType x2 = input[i + 2];
+            const DataType x3 = input[i + 3];
             // We know (input - min) => 0
             // If min == max, all values are the same and should be quantized to 0.
             // reconstruction will yield the same original value for all vectors.
-            quantized[i] = static_cast<OUTPUT_TYPE>(std::round((input[i] - min_val) * inv_delta));
+            quantized[i + 0] = static_cast<OUTPUT_TYPE>(std::round((x0 - min_val) * inv_delta));
+            quantized[i + 1] = static_cast<OUTPUT_TYPE>(std::round((x1 - min_val) * inv_delta));
+            quantized[i + 2] = static_cast<OUTPUT_TYPE>(std::round((x2 - min_val) * inv_delta));
+            quantized[i + 3] = static_cast<OUTPUT_TYPE>(std::round((x3 - min_val) * inv_delta));
             // Accumulate sum for all metrics
-            sum += input[i];
+            s0 += x0;
+            s1 += x1;
+            s2 += x2;
+            s3 += x3;
+
             // Accumulate sum of squares only for L2
             if constexpr (Metric == VecSimMetric_L2) {
-                sum_squares += input[i] * input[i];
+                q0 += x0 * x0;
+                q1 += x1 * x1;
+                q2 += x2 * x2;
+                q3 += x3 * x3;
+            }
+        }
+
+        // Tail: 0..3 remaining elements (still the same pass, just finishing work)
+        DataType sum = (s0 + s1) + (s2 + s3);
+        DataType sum_squares = (q0 + q1) + (q2 + q3);
+
+        for (; i < this->dim; ++i) {
+            const DataType x = input[i];
+            quantized[i] = static_cast<OUTPUT_TYPE>(std::round((x - min_val) * inv_delta));
+            sum += x;
+            if constexpr (Metric == VecSimMetric_L2) {
+                sum_squares += x * x;
             }
         }
 
@@ -258,11 +304,56 @@ class QuantPreprocessor : public PreprocessorInterface {
         }
     }
 
+    DataType sum_fast(const DataType *p) const {
+        DataType s0{}, s1{}, s2{}, s3{};
+
+        size_t i = 0;
+        // round dim down to the nearest multiple of 4
+        size_t dim_round_down = this->dim & ~size_t(3);
+
+        for (; i < dim_round_down; i += 4) {
+            s0 += p[i + 0];
+            s1 += p[i + 1];
+            s2 += p[i + 2];
+            s3 += p[i + 3];
+        }
+
+        DataType sum = (s0 + s1) + (s2 + s3);
+
+        for (; i < dim; ++i) {
+            sum += p[i];
+        }
+        return sum;
+    }
+
+    DataType sum_squares_fast(const DataType *p) const {
+        DataType s0{}, s1{}, s2{}, s3{};
+
+        size_t i = 0;
+        // round dim down to the nearest multiple of 4
+        size_t dim_round_down = this->dim & ~size_t(3);
+
+        for (; i < dim_round_down; i += 4) {
+            s0 += p[i + 0] * p[i + 0];
+            s1 += p[i + 1] * p[i + 1];
+            s2 += p[i + 2] * p[i + 2];
+            s3 += p[i + 3] * p[i + 3];
+        }
+
+        DataType sum = (s0 + s1) + (s2 + s3);
+
+        for (; i < dim; ++i) {
+            sum += p[i] * p[i];
+        }
+        return sum;
+    }
+
 public:
     QuantPreprocessor(std::shared_ptr<VecSimAllocator> allocator, size_t dim)
         : PreprocessorInterface(allocator), dim(dim),
           storage_bytes_count(dim * sizeof(OUTPUT_TYPE) +
-                              (2 + extra_values_count) * sizeof(DataType)) {
+                              (2 + extra_storage_values_count) * sizeof(DataType)),
+          query_bytes_count((dim + 1) * sizeof(DataType)) {
         static_assert(std::is_floating_point_v<DataType>,
                       "QuantPreprocessor only supports floating-point types");
     }
@@ -274,20 +365,21 @@ class QuantPreprocessor : public PreprocessorInterface {
     }
 
     /**
-     * Quantizes the storage blob (DataType → OUTPUT_TYPE) while leaving the query blob unchanged.
+     * Preprocesses the original blob into separate storage and query blobs.
      *
     * Storage vectors are quantized to uint8_t values, with metadata (min, delta, sum, and
-     * sum_squares for L2) appended for distance reconstruction. Query vectors remain as DataType
-     * for asymmetric distance computation.
+     * sum_squares for L2) appended for distance reconstruction.
      *
-     * Note: query_blob and query_blob_size are not modified, nor allocated by this function.
+     * Query vectors remain as DataType for asymmetric distance computation, with a precomputed
+     * sum (for IP/Cosine) or sum of squares (for L2) appended for efficient distance calculation.
     *
     * Possible scenarios (currently only CASE 1 is implemented):
-    * - CASE 1: STORAGE BLOB NEEDS ALLOCATION (storage_blob == nullptr)
+    * - CASE 1: STORAGE BLOB AND QUERY BLOB NEED ALLOCATION (storage_blob == query_blob == nullptr)
     * - CASE 2: STORAGE BLOB EXISTS (storage_blob != nullptr)
     *   - CASE 2A: STORAGE BLOB EXISTS and its size is insufficient
     *     (storage_blob_size < required_size) - reallocate storage
-    *   - CASE 2B: STORAGE AND QUERY SHARE MEMORY (storage_blob == query_blob) - reallocate storage
+    *   - CASE 2B: STORAGE AND QUERY SHARE MEMORY (storage_blob == query_blob != nullptr) -
+    *     reallocate storage
     *   - CASE 2C: SEPARATE STORAGE AND QUERY BLOBS (storage_blob != query_blob) - quantize storage
     *     in-place
     */
@@ -296,6 +388,10 @@ class QuantPreprocessor : public PreprocessorInterface {
                     unsigned char alignment) const override {
         // CASE 1: STORAGE BLOB NEEDS ALLOCATION - the only implemented case
         assert(!storage_blob && "CASE 1: storage_blob must be nullptr");
+        assert(!query_blob && "CASE 1: query_blob must be nullptr");
+
+        // storage_blob_size and query_blob_size must point to different memory slots.
+        assert(&storage_blob_size != &query_blob_size);
 
         // CASE 2A: STORAGE BLOB EXISTS and its size is insufficient - not implemented
         // storage_blob && storage_blob_size < required_size
@@ -307,15 +403,8 @@ class QuantPreprocessor : public PreprocessorInterface {
         // We can quantize the storage blob in-place (if we already checked storage_blob_size is
         // sufficient)
 
-        // Allocate aligned memory for the quantized storage blob
-        storage_blob = static_cast(
-            this->allocator->allocate_aligned(this->storage_bytes_count, alignment));
-
-        // Quantize directly from original data
-        const DataType *input = static_cast<const DataType *>(original_blob);
-        quantize(input, static_cast<OUTPUT_TYPE *>(storage_blob));
-
-        storage_blob_size = this->storage_bytes_count;
+        preprocessForStorage(original_blob, storage_blob, storage_blob_size);
+        preprocessQuery(original_blob, query_blob, query_blob_size, alignment);
     }
 
     void preprocessForStorage(const void *original_blob, void *&blob,
@@ -331,9 +420,33 @@ class QuantPreprocessor : public PreprocessorInterface {
         input_blob_size = storage_bytes_count;
     }
 
+    /**
+     * Preprocesses the query vector for asymmetric distance computation.
+     *
+     * The query blob contains the original float values followed by a precomputed value:
+     * - For IP/Cosine: y_sum = Σy_i (sum of query values)
+     * - For L2: y_sum_squares = Σy_i² (sum of squared query values)
+     *
+     * Query blob layout: | query_values[dim] | y_sum OR y_sum_squares |
+     * Query blob size: (dim + 1) * sizeof(DataType)
+     */
     void preprocessQuery(const void *original_blob, void *&blob, size_t &query_blob_size,
                          unsigned char alignment) const override {
-        // No-op: queries remain as original DataType
+        assert(!blob && "query_blob must be nullptr");
+
+        // Allocate aligned memory for the query blob
+        blob = this->allocator->allocate_aligned(this->query_bytes_count, alignment);
+        memcpy(blob, original_blob, this->dim * sizeof(DataType));
+        const DataType *input = static_cast<const DataType *>(original_blob);
+        // For IP/Cosine, we need to store the sum of the query vector.
+        if constexpr (Metric == VecSimMetric_IP || Metric == VecSimMetric_Cosine) {
+            static_cast<DataType *>(blob)[this->dim] = sum_fast(input);
+        }
+        // For L2, compute the sum of squares.
+        else if constexpr (Metric == VecSimMetric_L2) {
+            static_cast<DataType *>(blob)[this->dim] = sum_squares_fast(input);
+        }
+
+        query_blob_size = this->query_bytes_count;
     }
 
     void preprocessStorageInPlace(void *original_blob, size_t input_blob_size) const override {
@@ -353,4 +466,5 @@ class QuantPreprocessor : public PreprocessorInterface {
 
     const size_t dim;
     const size_t storage_bytes_count;
+    const size_t query_bytes_count;
 };
diff --git a/tests/unit/test_components.cpp b/tests/unit/test_components.cpp
index 5f134386b..b7ebfa701 100644
--- a/tests/unit/test_components.cpp
+++ b/tests/unit/test_components.cpp
@@ -985,11 +985,10 @@ TEST(PreprocessorsTest, Int8NormalizeThenIncreaseSize) {
                                                final_blob_bytes_count));
     }
 }
-// TODO: test edge case where all entries equal.
 
 // Tests the quantization preprocessor with a single preprocessor in the chain.
-// The QuantPreprocessor allocates the storage blob and processes it, while the query blob
-// is unprocessed and allocated by the preprocessors container.
+// The QuantPreprocessor allocates and processes both the storage blob (quantized) and the query
+// blob (original values + precomputed sum_squares for L2).
 TEST(PreprocessorsTest, QuantizationTest) {
     std::shared_ptr<VecSimAllocator> allocator = VecSimAllocator::newVecsimAllocator();
     constexpr size_t n_preprocessors = 1;
@@ -998,10 +997,20 @@
     constexpr size_t original_blob_size = dim * sizeof(float);
     float original_blob[dim] = {1, 2, 3, 4, 5, 6};
 
-    // For L2 metric: quantized values + min + delta + sum + sum_squares = dim + 4 floats
+    // === Storage blob expected values ===
+    // For L2 metric: quantized values + min + delta + sum + sum_squares = dim bytes + 4 floats
     constexpr size_t quantized_blob_bytes_count = dim * sizeof(uint8_t) + 4 * sizeof(float);
-    uint8_t expected_processed_blob[quantized_blob_bytes_count] = {0};
-    ComputeSQ8Quantization(original_blob, dim, expected_processed_blob);
+    uint8_t expected_storage_blob[quantized_blob_bytes_count] = {0};
+    ComputeSQ8Quantization(original_blob, dim, expected_storage_blob);
+
+    // === Query blob expected values ===
+    // Query layout: | query_values[dim] | y_sum_squares (for L2) |
+    constexpr size_t query_blob_bytes_count = (dim + 1) * sizeof(float);
+    // Compute expected sum of squares for L2: 1² + 2² + 3² + 4² + 5² + 6² = 91
+    float expected_query_sum_squares = 0;
+    for (size_t i = 0; i < dim; ++i) {
+        expected_query_sum_squares += original_blob[i] * original_blob[i];
+    }
 
     auto quant_preprocessor = new (allocator) QuantPreprocessor(allocator, dim);
@@ -1009,19 +1018,27 @@
         MultiPreprocessorsContainer(allocator, alignment);
     multiPPContainer.addPreprocessor(quant_preprocessor);
 
-    // Test preprocess
+    // Test preprocess (both storage and query)
    {
         ProcessedBlobs processed_blobs =
             multiPPContainer.preprocess(original_blob, original_blob_size);
         const void *storage_blob = processed_blobs.getStorageBlob();
         const void *query_blob = processed_blobs.getQueryBlob();
-        // blobs should NOT point to the same memory slot
+
+        // Verify storage and query blobs are separate
         ASSERT_NE(storage_blob, nullptr);
+        ASSERT_NE(query_blob, nullptr);
         ASSERT_NE(storage_blob, query_blob);
 
+        // Verify storage blob content
         EXPECT_NO_FATAL_FAILURE(CompareVectors(static_cast<const uint8_t *>(storage_blob),
-                                               expected_processed_blob,
+                                               expected_storage_blob,
                                                quantized_blob_bytes_count));
+
+        // Verify query blob content
+        const float *query_floats = static_cast<const float *>(query_blob);
+        EXPECT_NO_FATAL_FAILURE(CompareVectors(query_floats, original_blob, dim));
+        ASSERT_FLOAT_EQ(query_floats[dim], expected_query_sum_squares);
     }
 
     // Test preprocessForStorage
@@ -1032,17 +1049,19 @@
     {
         auto storage_blob = multiPPContainer.preprocessForStorage(original_blob, original_blob_size);
         ASSERT_NE(storage_blob.get(), nullptr);
         EXPECT_NO_FATAL_FAILURE(
             CompareVectors(static_cast<const uint8_t *>(storage_blob.get()),
-                           expected_processed_blob, quantized_blob_bytes_count));
+                           expected_storage_blob, quantized_blob_bytes_count));
     }
 
-    // Test preprocessQuery (content should not be changed, only reallocated to the required
-    // alignment)
+    // Test preprocessQuery (query values + precomputed sum_squares for L2)
     {
         auto query_blob = multiPPContainer.preprocessQuery(original_blob, original_blob_size);
         ASSERT_NE(query_blob.get(), nullptr);
-        // Verify content is unchanged (original floats, not quantized)
-        EXPECT_NO_FATAL_FAILURE(CompareVectors(static_cast<const float *>(query_blob.get()),
-                                               original_blob, dim));
+
+        // Verify query blob content: original floats followed by sum_squares
+        const float *query_floats = static_cast<const float *>(query_blob.get());
+        EXPECT_NO_FATAL_FAILURE(CompareVectors(query_floats, original_blob, dim));
+        ASSERT_FLOAT_EQ(query_floats[dim], expected_query_sum_squares);
+
         // Check address is aligned
         unsigned char address_alignment = (uintptr_t)(query_blob.get()) % alignment;
         ASSERT_EQ(address_alignment, 0) << "expected alignment " << alignment;
     }
@@ -1058,7 +1077,7 @@
         multiPPContainer.preprocessStorageInPlace(buffer, original_blob_size);
 
         EXPECT_NO_FATAL_FAILURE(CompareVectors(reinterpret_cast<const uint8_t *>(buffer),
-                                               expected_processed_blob,
+                                               expected_storage_blob,
                                                quantized_blob_bytes_count));
 #if !defined(NDEBUG)
         EXPECT_EXIT(
@@ -1068,28 +1087,6 @@
     }
 }
 
-// Test preprocessQuery with alignment 0 - no copy, same memory address
-TEST(PreprocessorsTest, QuantizationTestPreprocessQueryNoAlignment) {
-    std::shared_ptr<VecSimAllocator> allocator = VecSimAllocator::newVecsimAllocator();
-    constexpr size_t n_preprocessors = 1;
-    constexpr size_t no_alignment = 0;
-    constexpr size_t dim = 5;
-    constexpr size_t original_blob_size = dim * sizeof(float);
-    float original_blob[dim] = {1, 2, 3, 4, 5};
-
-    auto quant_preprocessor =
-        new (allocator) QuantPreprocessor(allocator, dim);
-    auto multiPPContainer =
-        MultiPreprocessorsContainer(allocator, no_alignment);
-    multiPPContainer.addPreprocessor(quant_preprocessor);
-
-    auto query_blob = multiPPContainer.preprocessQuery(original_blob, original_blob_size);
-    ASSERT_EQ(query_blob.get(), original_blob); // same memory address
-    // Verify content is unchanged
-    EXPECT_NO_FATAL_FAILURE(
-        CompareVectors(static_cast<const float *>(query_blob.get()), original_blob, dim));
-}
-
 // Test edge case where all entries are equal
 TEST(PreprocessorsTest, QuantizationTestAllEntriesEqual) {
     std::shared_ptr<VecSimAllocator> allocator = VecSimAllocator::newVecsimAllocator();
@@ -1134,6 +1131,7 @@
     }
 
     allocator->free_allocation(storage_blob);
+    allocator->free_allocation(query_blob);
     delete quant_preprocessor;
 }
 
@@ -1149,22 +1147,47 @@ class QuantPreprocessorMetricTest : public testing::TestWithParam
     void SetUp() override { allocator = VecSimAllocator::newVecsimAllocator(); }
 
-    // Helper to get expected storage size based on metric
+    // === Storage blob helpers ===
+
+    // Storage layout: | quantized_values[dim] | min | delta | sum | (sum_squares for L2) |
+    // L2: dim bytes + 4 floats (min, delta, sum, sum_squares)
+    // IP/Cosine: dim bytes + 3 floats (min, delta, sum)
     static size_t getExpectedStorageSize(VecSimMetric metric) {
-        // L2: dim + 4 floats (min, delta, sum, sum_squares)
-        // IP/Cosine: dim + 3 floats (min, delta, sum)
         size_t extra_floats = (metric == VecSimMetric_L2) ? 4 : 3;
         return dim * sizeof(uint8_t) + extra_floats * sizeof(float);
     }
 
+    // === Query blob helpers ===
+
+    // Query layout: | query_values[dim] | y_sum (IP/Cosine) OR y_sum_squares (L2) |
+    // All metrics: (dim + 1) floats
+    static constexpr size_t getExpectedQuerySize() { return (dim + 1) * sizeof(float); }
+
+    // Compute expected precomputed value for query blob based on metric
+    template <VecSimMetric Metric>
+    float getExpectedQueryPrecomputedValue() {
+        float sum = 0;
+        for (size_t i = 0; i < dim; ++i) {
+            if constexpr (Metric == VecSimMetric_L2) {
+                // sum of squares: 1² + 2² + 3² + 4² + 5² = 55
+                sum += original_blob[i] * original_blob[i];
+            } else {
+                // sum: 1 + 2 + 3 + 4 + 5 = 15
+                sum += original_blob[i];
+            }
+        }
+        return sum;
+    }
+
     // Helper to run quantization test for a specific metric
     template <VecSimMetric Metric>
     void runQuantizationTest() {
         size_t expected_storage_size = getExpectedStorageSize(Metric);
+        size_t expected_query_size = getExpectedQuerySize();
 
         auto quant_preprocessor = new (allocator) QuantPreprocessor(allocator, dim);
 
-        // Test preprocess
+        // Test preprocess (both storage and query)
         {
             void *storage_blob = nullptr;
             void *query_blob = nullptr;
@@ -1174,25 +1197,36 @@ class QuantPreprocessorMetricTest : public testing::TestWithParam
             quant_preprocessor->preprocess(original_blob, storage_blob, query_blob,
                                            storage_blob_size, query_blob_size, alignment);
 
+            // Verify storage blob
             ASSERT_NE(storage_blob, nullptr);
             ASSERT_EQ(storage_blob_size, expected_storage_size);
-            ASSERT_EQ(query_blob_size, original_blob_size);
-            // Compute expected quantization using utility function
-            // Allocate buffer large enough for L2 (4 floats), even for non-L2 metrics
+            // Verify query blob
+            ASSERT_NE(query_blob, nullptr);
+            ASSERT_EQ(query_blob_size, expected_query_size);
+
+            // === Verify storage blob content ===
             constexpr size_t max_storage_size = dim * sizeof(uint8_t) + 4 * sizeof(float);
-            uint8_t expected_blob[max_storage_size];
-            ComputeSQ8Quantization(original_blob, dim, expected_blob);
+            uint8_t expected_storage_blob[max_storage_size];
+            ComputeSQ8Quantization(original_blob, dim, expected_storage_blob);
 
-            // Compare quantized values and metadata
             // For non-L2 metrics, compare only dim + 3 floats (excluding sum_squares)
             size_t compare_size = (Metric == VecSimMetric_L2) ?
                 expected_storage_size : dim * sizeof(uint8_t) + 3 * sizeof(float);
             EXPECT_NO_FATAL_FAILURE(CompareVectors(
-                static_cast<const uint8_t *>(storage_blob), expected_blob, compare_size));
+                static_cast<const uint8_t *>(storage_blob), expected_storage_blob, compare_size));
+
+            // === Verify query blob content ===
+            const float *query_floats = static_cast<const float *>(query_blob);
+            EXPECT_NO_FATAL_FAILURE(CompareVectors(query_floats, original_blob, dim));
+
+            // Verify precomputed value (sum for IP/Cosine, sum_squares for L2)
+            float expected_precomputed = getExpectedQueryPrecomputedValue<Metric>();
+            ASSERT_FLOAT_EQ(query_floats[dim], expected_precomputed);
 
             allocator->free_allocation(storage_blob);
+            allocator->free_allocation(query_blob);
         }
 
         // Test preprocessForStorage
@@ -1206,15 +1240,25 @@ class QuantPreprocessorMetricTest : public testing::TestWithParam
             allocator->free_allocation(blob);
         }
 
-        // Test preprocessQuery - should be no-op
+        // Test preprocessQuery
        {
             void *blob = nullptr;
             size_t blob_size = original_blob_size;
             quant_preprocessor->preprocessQuery(original_blob, blob, blob_size, alignment);
-            ASSERT_EQ(blob_size, original_blob_size);
-            ASSERT_EQ(blob, nullptr);
+            ASSERT_NE(blob, nullptr);
+            ASSERT_EQ(blob_size, expected_query_size);
+
+            // Verify query blob content: original values followed by precomputed value
+            const float *query_floats = static_cast<const float *>(blob);
+            EXPECT_NO_FATAL_FAILURE(CompareVectors(query_floats, original_blob, dim));
+
+            // Verify precomputed value (sum for IP/Cosine, sum_squares for L2)
+            float expected_precomputed = getExpectedQueryPrecomputedValue<Metric>();
+            ASSERT_FLOAT_EQ(query_floats[dim], expected_precomputed);
+
+            allocator->free_allocation(blob);
        }
 
         delete quant_preprocessor;
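
Reviewer note (not part of the patch): the asymmetric-distance math documented in the header comment can be checked end to end with a small standalone program. The sketch below is illustrative only — the helper name asymmetric_l2_sq and the main() driver are assumptions, not library API. It builds the two layouts described above (storage: | q[dim] | min | delta | x_sum | x_sum_squares |, L2 query: | y[dim] | y_sum_squares |) and evaluates ||x - y||² ≈ x_sum_squares - 2 * (min * y_sum + delta * Σ(q_i * y_i)) + y_sum_squares.

// Standalone illustration of the asymmetric L2 formula over the two blob layouts.
// Hypothetical helper names; only the layouts and formula mirror the patch.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

static float asymmetric_l2_sq(const uint8_t *storage, const float *query, size_t dim) {
    // Metadata (min, delta, x_sum, x_sum_squares) trails the quantized values.
    float meta[4];
    std::memcpy(meta, storage + dim, sizeof(meta));
    const float min = meta[0], delta = meta[1], x_sum_squares = meta[3];

    float q_dot_y = 0.0f, y_sum = 0.0f;
    for (size_t i = 0; i < dim; ++i) {
        q_dot_y += static_cast<float>(storage[i]) * query[i];
        y_sum += query[i];
    }
    // IP(x, y) ≈ min * y_sum + delta * Σ(q_i * y_i)
    const float ip = min * y_sum + delta * q_dot_y;
    // y_sum_squares is the precomputed value stored right after the query values.
    const float y_sum_squares = query[dim];
    return x_sum_squares - 2.0f * ip + y_sum_squares;
}

int main() {
    constexpr size_t dim = 6;
    const float x[dim] = {1, 2, 3, 4, 5, 6}; // stored vector (quantized)
    const float y[dim] = {6, 5, 4, 3, 2, 1}; // query vector (kept as float)

    // Build a storage blob the way quantize() does: q_i = round((x_i - min) / delta).
    float min = x[0], max = x[0], x_sum = 0, x_sum_squares = 0;
    for (float v : x) {
        min = std::min(min, v);
        max = std::max(max, v);
        x_sum += v;
        x_sum_squares += v * v;
    }
    const float delta = (max - min) / 255.0f;
    std::vector<uint8_t> storage(dim + 4 * sizeof(float));
    for (size_t i = 0; i < dim; ++i)
        storage[i] = static_cast<uint8_t>(std::round((x[i] - min) / delta));
    const float meta[4] = {min, delta, x_sum, x_sum_squares};
    std::memcpy(storage.data() + dim, meta, sizeof(meta));

    // Build an L2 query blob the way preprocessQuery() does: values + y_sum_squares.
    std::vector<float> query(dim + 1);
    std::memcpy(query.data(), y, sizeof(y));
    float y_sum_squares = 0;
    for (float v : y)
        y_sum_squares += v * v;
    query[dim] = y_sum_squares;

    float exact = 0;
    for (size_t i = 0; i < dim; ++i)
        exact += (x[i] - y[i]) * (x[i] - y[i]);
    std::printf("approx L2^2 = %f, exact L2^2 = %f\n",
                asymmetric_l2_sq(storage.data(), query.data(), dim), exact);
}

For these sample vectors the approximate and exact values agree to within quantization error, which is why only the per-vector scalars (x_sum_squares on the storage side, y_sum or y_sum_squares on the query side) need to be stored for the distance computation.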