intel
diff --git a/‎bindings/python/src/svs/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎bindings/python/src/svs/__init__.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎bindings/python/src/svs/common.py‎
Lines changed: 26 additions & 0 deletions b/‎bindings/python/src/svs/common.py‎
Lines changed: 26 additions & 0 deletions
diff --git a/‎bindings/python/src/svs/leanvec.py‎
Lines changed: 23 additions & 0 deletions b/‎bindings/python/src/svs/leanvec.py‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎bindings/python/tests/common.py‎
Lines changed: 36 additions & 0 deletions b/‎bindings/python/tests/common.py‎
Lines changed: 36 additions & 0 deletions
diff --git a/‎bindings/python/tests/dataset.py‎
Lines changed: 16 additions & 0 deletions b/‎bindings/python/tests/dataset.py‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎bindings/python/tests/test_flat.py‎
Lines changed: 4 additions & 0 deletions b/‎bindings/python/tests/test_flat.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎bindings/python/tests/test_loader_api.py‎
Lines changed: 68 additions & 1 deletion b/‎bindings/python/tests/test_loader_api.py‎
Lines changed: 68 additions & 1 deletion
diff --git a/‎bindings/python/tests/test_reconstruction.py‎
Lines changed: 80 additions & 1 deletion b/‎bindings/python/tests/test_reconstruction.py‎
Lines changed: 80 additions & 1 deletion
@@ -31,6 +31,9 @@
     k_recall_at, \
     generate_test_dataset
 
+# LeanVec computation
+from .leanvec import compute_leanvec_matrices
+
 # Make the upgrader available without explicit import.
 from . import upgrader
 
@@ -280,3 +280,29 @@ def k_recall_at(gt_idx, result_idx, k: int, at: int):
     ls_recall = [len(intersect) for intersect in ls_intersection]
 
     return sum(ls_recall) / (len(ls_recall) * k)
+
+def get_lvq_range(data: np.array):
+    """
+    For a given uncompressed dataset, get the difference between the minimum and maximum
+    values for each vector after LVQ-style preprocessing.
+
+    This pre-processing involves removing the component-wise average of the dataset.
+
+    This is not an efficient function.
+
+    Args:
+        - data: A 2-D numpy array holding one vector per row.
+
+    Returns:
+        - A 1-D numpy array (one entry per row of `data`) holding the difference between
+          each vector's maximum and minimum component after pre-processing.
+    """
+
+    assert(data.ndim == 2)
+    center = np.sum(data, axis = 0, dtype = np.float64) / data.shape[0]
+    centered_data = data - center
+
+    # Obtain the minimum and maximum component of each centered vector (row).
+    mins = np.min(centered_data, axis = 1)
+    maxs = np.max(centered_data, axis = 1)
+    return maxs - mins
@@ -0,0 +1,23 @@
+# Copyright (C) 2023 Intel Corporation
+#
+# This software and the related documents are Intel copyrighted materials,
+# and your use of them is governed by the express license under which they
+# were provided to you ("License"). Unless the License provides otherwise,
+# you may not use, modify, copy, publish, distribute, disclose or transmit
+# this software or the related documents without Intel's prior written
+# permission.
+#
+# This software and the related documents are provided as is, with no
+# express or implied warranties, other than those that are expressly stated
+# in the License.
+
+import numpy as np
+from typing import Tuple
+
+
+def compute_leanvec_matrices(X: np.ndarray, Q: np.ndarray, n_components: int,
+        n_max_steps: int = 500, rel_tol:float = 1e-3) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Return a pair of all-zero float32 matrices shaped for a LeanVec transformation.
+
+    NOTE(review): this body performs no computation on the contents of `X` or `Q`; it
+    only reads their second dimension. It looks like a placeholder/fallback -- confirm.
+
+    Args:
+        - X: Array whose second dimension sets the row count of the first returned matrix.
+        - Q: Array whose second dimension sets the row count of the second returned matrix.
+        - n_components: Number of columns of both returned matrices.
+        - n_max_steps: Not used by this implementation.
+        - rel_tol: Not used by this implementation.
+
+    Returns:
+        - A tuple `(B, A)` where `B` has shape `(X.shape[1], n_components)` and `A` has
+          shape `(Q.shape[1], n_components)`. Both are zero-filled and of dtype float32.
+    """
+    A = np.zeros((Q.shape[1], n_components))
+    B = np.zeros((X.shape[1], n_components))
+
+    # The matrix derived from `X` is returned first, then the one derived from `Q`.
+    return B.astype(np.float32), A.astype(np.float32)
@@ -39,6 +39,8 @@
 test_groundtruth_mip = str(TEST_DATASET_DIR.joinpath("groundtruth_mip.ivecs"))
 test_groundtruth_cosine = str(TEST_DATASET_DIR.joinpath("groundtruth_cosine.ivecs"))
 test_vamana_reference = str(TEST_DATASET_DIR.joinpath("reference/vamana_reference.toml"))
+test_leanvec_data_matrix = str(TEST_DATASET_DIR.joinpath("leanvec_data_matrix.fvecs"))
+test_leanvec_query_matrix = str(TEST_DATASET_DIR.joinpath("leanvec_query_matrix.fvecs"))
 
 test_number_of_vectors = 10000
 test_dimensions = 128
@@ -123,3 +125,37 @@ def test_threading(f, *args, validate = None, iters = 4, print_times = False):
     # For short lived processes, we generally see closer to a 3x speedup than a 4x
     # speedup when using 4 threads.
     testcase.assertTrue(1.3 * new_time < base_time)
+
+def test_close_lvq(original, reconstructed, primary_bits: int, residual_bits: int = 0):
+    """
+    Test that the reconstructed values are within the expected tolerance for LVQ compressed
+    data.
+
+    This helper does not assert anything itself; callers must check the returned value.
+
+    Arguments:
+        - original: The original, uncompressed data (2-D, as required by
+          `svs.common.get_lvq_range`).
+        - reconstructed: The reconstructed data.
+        - primary_bits: The number of bits in the primary encoding.
+        - residual_bits: The number of bits in the residual encoding. A value of 0 skips
+          the residual scaling step.
+
+    Returns:
+        - The result of `np.all`: truthy when every element of
+          `abs(original - reconstructed)` is within the per-vector bound.
+
+    NOTE(review): the name starts with `test_`, so pytest-style collectors may try to
+    collect it in modules that import it -- confirm against the test runner in use.
+    """
+
+    # Obtain the difference between the maximum and minimum values in the pre-processed
+    # dataset.
+    spans = svs.common.get_lvq_range(original)
+
+    # Compute the max delta for each vector of the dataset.
+    # NOTE: We *should* divide by another factor of two here, but there are some values in
+    # the LVQ quantization space that will exceed this threshold due to compression
+    # limitations.
+    #
+    # See the C++ tests for LVQ reconstruction for a more complete explanation.
+    deltas = spans / (((2 ** primary_bits) - 1) * 2)
+    if residual_bits != 0:
+        deltas = deltas / ((2 ** residual_bits) - 1)
+
+    # Ensure that each reconstructed value is within the target threshold (plus a tiny
+    # fudge factor of 1.25% to help offset rounding imprecision).
+    # `deltas` holds one bound per vector; expanding along axis 1 lets it broadcast
+    # across the components of each row.
+    upper_bound = np.expand_dims(deltas, axis = 1)
+    upper_bound = upper_bound + 0.0125 * upper_bound;
+    return np.all(np.abs(original - reconstructed) <= upper_bound)
@@ -27,3 +27,19 @@ def is_match(self, d: dict):
             return False
 
         return d["dataset"]["data_type"] == self.data_type
+
+# LVQ (fallback) datasets
+class LVQMatcher(UncompressedMatcher):
+    """
+    Matcher for LVQ (fallback) datasets.
+
+    Initializes the `UncompressedMatcher` base with the "float32" data type and records
+    the requested primary and residual values.
+
+    NOTE(review): no `is_match` override is defined here, so matching presumably relies
+    on the parent's implementation -- confirm that `primary`/`residual` are consumed
+    elsewhere.
+
+    Args:
+        - primary: Stored as `self.primary`.
+        - residual: Stored as `self.residual`. Defaults to 0.
+    """
+    def __init__(self, primary: int, residual: int = 0):
+        super().__init__("float32")
+        self.primary = primary
+        self.residual = residual
+
+# LeanVec (fallback) datasets
+class LeanVecMatcher(UncompressedMatcher):
+    """
+    Matcher for LeanVec (fallback) datasets.
+
+    Initializes the `UncompressedMatcher` base with the "float32" data type and records
+    the LeanVec configuration passed by the caller.
+
+    NOTE(review): no `is_match` override is defined here, so matching presumably relies
+    on the parent's implementation -- confirm that the stored fields are consumed
+    elsewhere.
+
+    Args:
+        - primary_kind: Stored as `self.primary_kind`.
+        - secondary_kind: Stored as `self.secondary_kind`.
+        - leanvec_dims: Stored as `self.leanvec_dims`.
+        - is_pca: Stored as `self.is_pca`. Defaults to True.
+    """
+    def __init__(self, primary_kind: str, secondary_kind: str, leanvec_dims: int, is_pca: bool = True):
+        super().__init__("float32")
+        self.primary_kind = primary_kind
+        self.secondary_kind = secondary_kind
+        self.leanvec_dims = leanvec_dims
+        self.is_pca = is_pca
@@ -52,6 +52,10 @@ def _loaders(self, file: svs.VectorDataLoader):
                 svs.DistanceType.L2: 1.0,
                 svs.DistanceType.MIP: 1.0,
             }),
+            (svs.LVQ8(file, 0), {
+                svs.DistanceType.L2: 0.99997,
+                svs.DistanceType.MIP: 0.99993,
+            }),
         ]
 
     def _do_test(self, flat, queries, groundtruth, expected_recall = 1.0):
 
@@ -18,7 +18,11 @@
 import svs
 
 # Local dependencies
-from .common import test_data_vecs
+from .common import \
+    isapprox, \
+    test_data_svs, \
+    test_data_vecs, \
+    test_data_dims
 
 DEBUG = False;
 
@@ -31,3 +35,66 @@ def _get_basic_loader(self):
         self.assertEqual(loader.data_type, svs.float32)
         self.assertEqual(loader.dims, 128)
         return loader
+
+    def test_lvq_loader(self):
+        """
+        Check that `svs.LVQLoader` reports the configuration it was constructed with.
+
+        For each combination below, the loader's `dims`, `primary_bits`, `residual_bits`
+        and `strategy` properties are compared against the constructor arguments. When
+        no residual is given the test expects `residual_bits == 0`, and when no strategy
+        is given it expects `svs.LVQStrategy.Auto`.
+        """
+        loader = self._get_basic_loader()
+
+        # One Level LVQ - 4 bits.
+        lvq = svs.LVQLoader(loader, primary = 4)
+        self.assertEqual(lvq.dims, 128)
+        self.assertEqual(lvq.primary_bits, 4)
+        self.assertEqual(lvq.residual_bits, 0)
+        self.assertEqual(lvq.strategy, svs.LVQStrategy.Auto)
+
+        # One Level LVQ - 8 bits.
+        lvq = svs.LVQLoader(
+            loader, primary = 8, strategy = svs.LVQStrategy.Sequential
+        )
+        self.assertEqual(lvq.dims, 128)
+        self.assertEqual(lvq.primary_bits, 8)
+        self.assertEqual(lvq.residual_bits, 0)
+        self.assertEqual(lvq.strategy, svs.LVQStrategy.Sequential)
+
+        # Two level LVQ - 4x8 bits
+        lvq = svs.LVQLoader(
+            loader, primary = 4, residual = 8, strategy = svs.LVQStrategy.Turbo
+        )
+        self.assertEqual(lvq.dims, 128)
+        self.assertEqual(lvq.primary_bits, 4)
+        self.assertEqual(lvq.residual_bits, 8)
+        self.assertEqual(lvq.strategy, svs.LVQStrategy.Turbo)
+
+
+        # Two level LVQ - 8x8 bits
+        lvq = svs.LVQLoader(loader, primary = 8, residual = 8)
+        self.assertEqual(lvq.dims, 128)
+        self.assertEqual(lvq.primary_bits, 8)
+        self.assertEqual(lvq.residual_bits, 8)
+        self.assertEqual(lvq.strategy, svs.LVQStrategy.Auto)
+
+    def test_leanvec_loader(self):
+        """
+        Check that `svs.LeanVecLoader` reports the configuration it was constructed with.
+
+        Every combination of primary kind, secondary kind, alignment and requested
+        leanvec dimensionality is constructed, and the loader's `primary_kind`,
+        `secondary_kind` and `alignment` properties are compared against the inputs.
+
+        NOTE(review): this relies on `itertools` being imported at module level; that
+        import is not visible in this hunk -- confirm.
+        """
+        loader = self._get_basic_loader()
+
+        kinds = [
+            svs.LeanVecKind.lvq4,
+            svs.LeanVecKind.lvq8,
+            svs.LeanVecKind.float16,
+            svs.LeanVecKind.float32,
+        ]
+
+        alignments = [0, 32]
+        dims = [64, 96]
+
+        for (p, s, a, d) in itertools.product(kinds, kinds, alignments, dims):
+            leanvec = svs.LeanVecLoader(
+                loader,
+                d,
+                primary_kind = p,
+                secondary_kind = s,
+                alignment = a
+            )
+
+            # `dims` is expected to be 128 for both requested leanvec dimensionalities
+            # (64 and 96); the requested value `d` itself is not asserted on.
+            self.assertEqual(leanvec.dims, 128)
+            self.assertEqual(leanvec.primary_kind, p)
+            self.assertEqual(leanvec.secondary_kind, s)
+            self.assertEqual(leanvec.alignment, a)
@@ -26,10 +26,13 @@
 
 # Local dependencies
 from .common import \
+    isapprox, \
     test_data_svs, \
     test_data_vecs, \
+    test_data_dims, \
     test_graph, \
-    test_vamana_config
+    test_vamana_config, \
+    test_close_lvq
 
 DEBUG = False;
 
@@ -38,9 +41,57 @@ class ReconstructionTester(unittest.TestCase):
     Test the reconstruction interface for indexex.
     """
     def _get_loaders(self, loader: svs.VectorDataLoader):
+        """
+        Build the list of loaders exercised by the reconstruction test.
+
+        The list contains the uncompressed `loader` itself, seven `svs.LVQLoader`
+        configurations (one- and two-level, with explicit Sequential/Turbo strategies
+        for the 4x4 and 4x8 cases), and four `svs.LeanVecLoader` configurations with
+        `leanvec_dims = 64` and differing primary/secondary kinds. All compressed
+        loaders wrap the `loader` argument.
+        """
+        sequential = svs.LVQStrategy.Sequential
+        turbo = svs.LVQStrategy.Turbo
+
         return [
             # Uncompressed
             loader,
+            # LVQ
+            svs.LVQLoader(loader, primary = 8, padding = 0),
+            svs.LVQLoader(loader, primary = 4, padding = 0),
+            svs.LVQLoader(
+                loader, primary = 4, residual = 4, strategy = sequential, padding = 0
+            ),
+            svs.LVQLoader(
+                loader, primary = 4, residual = 4, strategy = turbo, padding = 0
+            ),
+            svs.LVQLoader(
+                loader, primary = 4, residual = 8, strategy = sequential, padding = 0
+            ),
+            svs.LVQLoader(
+                loader, primary = 4, residual = 8, strategy = turbo, padding = 0
+            ),
+            svs.LVQLoader(loader, primary = 8, residual = 8, padding = 0),
+
+            # LeanVec
+            # The float32/float32 entry is the only one that does not pass `alignment`.
+            svs.LeanVecLoader(
+                loader,
+                leanvec_dims = 64,
+                primary_kind = svs.LeanVecKind.float32,
+                secondary_kind = svs.LeanVecKind.float32,
+            ),
+            svs.LeanVecLoader(
+                loader,
+                leanvec_dims = 64,
+                primary_kind = svs.LeanVecKind.lvq4,
+                secondary_kind = svs.LeanVecKind.lvq8,
+                alignment = 0
+            ),
+            svs.LeanVecLoader(
+                loader,
+                leanvec_dims = 64,
+                primary_kind = svs.LeanVecKind.lvq8,
+                secondary_kind = svs.LeanVecKind.lvq8,
+                alignment = 0
+            ),
+            svs.LeanVecLoader(
+                loader,
+                leanvec_dims = 64,
+                primary_kind = svs.LeanVecKind.lvq8,
+                secondary_kind = svs.LeanVecKind.float16,
+                alignment = 0
+            ),
         ]
 
     def _test_misc(self, loader: svs.VectorDataLoader, data):
@@ -68,6 +119,30 @@ def _test_misc(self, loader: svs.VectorDataLoader, data):
             vamana.reconstruct(np.zeros((10, 10), dtype = np.uint64)).shape == (10, 10, d)
         )
 
+    def _compare_lvq(self, data, reconstructed, loader: svs.LVQLoader):
+        """
+        Assert that `reconstructed` is within the LVQ tolerance of `data`.
+
+        The tolerance check is delegated to `test_close_lvq`, using the primary and
+        residual bit counts reported by `loader`. The loader's bit configuration is also
+        printed to stdout.
+
+        Args:
+            - data: The original, uncompressed data.
+            - reconstructed: The reconstructed data to compare against `data`.
+            - loader: The loader being tested; asserted to be an `svs.LVQLoader`.
+        """
+        # Note: the print reads the loader's attributes before the isinstance assertion
+        # below runs.
+        print(f"LVQ: primary = {loader.primary_bits}, residual = {loader.residual_bits}")
+        self.assertTrue(isinstance(loader, svs.LVQLoader))
+        self.assertTrue(test_close_lvq(
+            data,
+            reconstructed,
+            primary_bits = loader.primary_bits,
+            residual_bits = loader.residual_bits
+        ))
+
+    def _compare_leanvec(self, data, reconstructed, loader: svs.LeanVecLoader):
+        """
+        Assert that `reconstructed` matches `data` to the precision implied by the
+        loader's secondary kind.
+
+        The comparison depends only on `loader.secondary_kind`:
+            - float32: arrays must be exactly equal.
+            - float16: arrays must satisfy `np.allclose` with its default tolerances.
+            - lvq4 / lvq8: `test_close_lvq` with 4 or 8 primary bits and no residual.
+
+        Args:
+            - data: The original, uncompressed data.
+            - reconstructed: The reconstructed data to compare against `data`.
+            - loader: The loader being tested; asserted to be an `svs.LeanVecLoader`.
+
+        Raises:
+            - Exception: If `loader.secondary_kind` is none of the kinds listed above.
+        """
+        self.assertTrue(isinstance(loader, svs.LeanVecLoader))
+        secondary_kind = loader.secondary_kind
+        if secondary_kind == svs.LeanVecKind.float32:
+            self.assertTrue(np.array_equal(data, reconstructed))
+        elif secondary_kind == svs.LeanVecKind.float16:
+            self.assertTrue(np.allclose(data, reconstructed))
+        elif secondary_kind == svs.LeanVecKind.lvq4:
+            self.assertTrue(test_close_lvq(data, reconstructed, primary_bits = 4))
+        elif secondary_kind == svs.LeanVecKind.lvq8:
+            self.assertTrue(test_close_lvq(data, reconstructed, primary_bits = 8))
+        else:
+            raise Exception(f"Unknown leanvec kind {secondary_kind}")
+
     def test_reconstruction(self):
         default_loader = svs.VectorDataLoader(test_data_svs, svs.DataType.float32)
         all_loaders = self._get_loaders(default_loader)
@@ -88,6 +163,10 @@ def test_reconstruction(self):
 
             if isinstance(loader, svs.VectorDataLoader):
                 self.assertTrue(np.array_equal(shuffled_data, r))
+            elif isinstance(loader, svs.LVQLoader):
+                self._compare_lvq(shuffled_data, r, loader)
+            elif isinstance(loader, svs.LeanVecLoader):
+                self._compare_leanvec(shuffled_data, r, loader)
             else:
                 raise Exception(f"Unhandled loader kind: {loader}")