Skip to content

Commit edbec56

Browse files
committed
Migrate python scripts to public and support fallback in tests
1 parent 0d6777f commit edbec56

File tree

17 files changed

+605
-57
lines changed

17 files changed

+605
-57
lines changed

bindings/python/src/svs/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@
3131
k_recall_at, \
3232
generate_test_dataset
3333

34+
# LeanVec computation
35+
from .leanvec import compute_leanvec_matrices
36+
3437
# Make the upgrader available without explicit import.
3538
from . import upgrader
3639

bindings/python/src/svs/common.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -280,3 +280,29 @@ def k_recall_at(gt_idx, result_idx, k: int, at: int):
280280
ls_recall = [len(intersect) for intersect in ls_intersection]
281281

282282
return sum(ls_recall) / (len(ls_recall) * k)
283+
284+
def get_lvq_range(data: np.ndarray) -> np.ndarray:
    """
    For a given uncompressed dataset, get the difference between the maximum and minimum
    values for each vector after LVQ-style preprocessing.

    This pre-processing involves removing the component-wise average of the dataset.

    This is not an efficient function.

    Args:
        - data: A 2-D numpy array with one vector per row.

    Returns:
        - A 1-D numpy array (one entry per row of ``data``) returning the difference
          between each vector's maximum and minimum component after pre-processing.

    Raises:
        - ValueError: If ``data`` is not 2-dimensional.
    """

    # Validate explicitly rather than with ``assert`` so that the check is not stripped
    # when Python runs with optimizations enabled.
    if data.ndim != 2:
        raise ValueError(f"Expected a 2-D array, got an array with {data.ndim} dimension(s)")

    # Component-wise mean over all vectors, accumulated in float64 regardless of the
    # element type of ``data``.
    center = np.sum(data, axis = 0, dtype = np.float64) / data.shape[0]
    centered_data = data - center

    # Obtain the minimum and maximum component of each centered vector (row).
    mins = np.min(centered_data, axis = 1)
    maxs = np.max(centered_data, axis = 1)
    return maxs - mins

bindings/python/src/svs/leanvec.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Copyright (C) 2023 Intel Corporation
2+
#
3+
# This software and the related documents are Intel copyrighted materials,
4+
# and your use of them is governed by the express license under which they
5+
# were provided to you ("License"). Unless the License provides otherwise,
6+
# you may not use, modify, copy, publish, distribute, disclose or transmit
7+
# this software or the related documents without Intel's prior written
8+
# permission.
9+
#
10+
# This software and the related documents are provided as is, with no
11+
# express or implied warranties, other than those that are expressly stated
12+
# in the License.
13+
14+
import numpy as np
15+
from typing import Tuple
16+
17+
18+
def compute_leanvec_matrices(X: np.ndarray, Q: np.ndarray, n_components: int,
        n_max_steps: int = 500, rel_tol:float = 1e-3) -> Tuple[np.ndarray, np.ndarray]:
    """
    Return a pair of zero-filled float32 projection matrices.

    This implementation only reads the second dimension of ``X`` and ``Q``; the values
    they contain, ``n_max_steps`` and ``rel_tol`` are not used.

    Args:
        - X: A 2-D numpy array; its second dimension sets the row count of the first
          returned matrix.
        - Q: A 2-D numpy array; its second dimension sets the row count of the second
          returned matrix.
        - n_components: Number of columns in both returned matrices.
        - n_max_steps: Unused.
        - rel_tol: Unused.

    Returns:
        - A tuple ``(data_matrix, query_matrix)`` with shapes
          ``(X.shape[1], n_components)`` and ``(Q.shape[1], n_components)``.
    """
    query_shape = (Q.shape[1], n_components)
    data_shape = (X.shape[1], n_components)

    query_matrix = np.zeros(query_shape, dtype = np.float32)
    data_matrix = np.zeros(data_shape, dtype = np.float32)

    # NOTE: The data-side matrix is returned first, the query-side matrix second.
    return data_matrix, query_matrix

bindings/python/tests/common.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@
3939
test_groundtruth_mip = str(TEST_DATASET_DIR.joinpath("groundtruth_mip.ivecs"))
4040
test_groundtruth_cosine = str(TEST_DATASET_DIR.joinpath("groundtruth_cosine.ivecs"))
4141
test_vamana_reference = str(TEST_DATASET_DIR.joinpath("reference/vamana_reference.toml"))
42+
test_leanvec_data_matrix = str(TEST_DATASET_DIR.joinpath("leanvec_data_matrix.fvecs"))
43+
test_leanvec_query_matrix = str(TEST_DATASET_DIR.joinpath("leanvec_query_matrix.fvecs"))
4244

4345
test_number_of_vectors = 10000
4446
test_dimensions = 128
@@ -123,3 +125,37 @@ def test_threading(f, *args, validate = None, iters = 4, print_times = False):
123125
# For short lived processes, we generally see closer to a 3x speedup than a 4x
124126
# speedup when using 4 threads.
125127
testcase.assertTrue(1.3 * new_time < base_time)
128+
129+
def test_close_lvq(original, reconstructed, primary_bits: int, residual_bits: int = 0):
    """
    Check that reconstructed values are within the expected tolerance of the original
    values for LVQ compressed data.

    Arguments:
        - original: The original, uncompressed data.
        - reconstructed: The reconstructed data.
        - primary_bits: The number of bits in the primary encoding.

    Keyword Arguments:
        - residual_bits: The number of bits in the residual encoding. A value of 0 skips
          the residual scaling.

    Returns:
        - The result of ``np.all`` over the element-wise comparison of the absolute
          reconstruction error against the per-vector tolerance.
    """

    # Per-vector difference between the maximum and minimum values of the pre-processed
    # dataset.
    ranges = svs.common.get_lvq_range(original)

    # Compute the max allowed error for each vector of the dataset.
    # NOTE: We *should* divide by another factor of two here, but there are some values in
    # the LVQ quantization space that will exceed this threshold due to compression
    # limitations.
    #
    # See the C++ tests for LVQ reconstruction for a more complete explanation.
    primary_levels = (2 ** primary_bits) - 1
    tolerance = ranges / (primary_levels * 2)
    if residual_bits != 0:
        residual_levels = (2 ** residual_bits) - 1
        tolerance = tolerance / residual_levels

    # Turn the per-vector tolerance into a column so it broadcasts across each row, then
    # widen it by a tiny fudge factor to help offset rounding imprecision.
    column = np.expand_dims(tolerance, axis = 1)
    column = column + 0.0125 * column

    error = np.abs(original - reconstructed)
    return np.all(error <= column)

bindings/python/tests/dataset.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,19 @@ def is_match(self, d: dict):
2727
return False
2828

2929
return d["dataset"]["data_type"] == self.data_type
30+
31+
# LVQ (fallback) datasets
32+
class LVQMatcher(UncompressedMatcher):
    """
    Matcher describing an LVQ dataset.

    The parent matcher is initialized with the "float32" data type; the LVQ bit widths
    are recorded on the instance.
    """

    def __init__(self, primary: int, residual: int = 0):
        # primary: number of primary bits; residual: number of residual bits.
        super().__init__("float32")
        self.primary, self.residual = primary, residual
37+
38+
# LeanVec (fallback) datasets
39+
class LeanVecMatcher(UncompressedMatcher):
    """
    Matcher describing a LeanVec dataset.

    The parent matcher is initialized with the "float32" data type; the LeanVec
    configuration is recorded on the instance.
    """

    def __init__(
        self,
        primary_kind: str,
        secondary_kind: str,
        leanvec_dims: int,
        is_pca: bool = True
    ):
        super().__init__("float32")
        # Kinds of the primary and secondary datasets.
        self.primary_kind, self.secondary_kind = primary_kind, secondary_kind
        # Reduced dimensionality and the PCA flag.
        self.leanvec_dims, self.is_pca = leanvec_dims, is_pca

bindings/python/tests/test_flat.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,10 @@ def _loaders(self, file: svs.VectorDataLoader):
5252
svs.DistanceType.L2: 1.0,
5353
svs.DistanceType.MIP: 1.0,
5454
}),
55+
(svs.LVQ8(file, 0), {
56+
svs.DistanceType.L2: 0.99997,
57+
svs.DistanceType.MIP: 0.99993,
58+
}),
5559
]
5660

5761
def _do_test(self, flat, queries, groundtruth, expected_recall = 1.0):

bindings/python/tests/test_loader_api.py

Lines changed: 68 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,11 @@
1818
import svs
1919

2020
# Local dependencies
21-
from .common import test_data_vecs
21+
from .common import \
22+
isapprox, \
23+
test_data_svs, \
24+
test_data_vecs, \
25+
test_data_dims
2226

2327
DEBUG = False;
2428

@@ -31,3 +35,66 @@ def _get_basic_loader(self):
3135
self.assertEqual(loader.data_type, svs.float32)
3236
self.assertEqual(loader.dims, 128)
3337
return loader
38+
39+
def test_lvq_loader(self):
    # Construct LVQ loaders with several bit-width / strategy combinations and check
    # that each loader reports back the configuration it was given.
    loader = self._get_basic_loader()

    # Each entry: (constructor keywords, expected primary bits, expected residual bits,
    # expected strategy). The strategy is expected to be `Auto` when not supplied.
    cases = [
        # One Level LVQ - 4 bits.
        ({"primary": 4}, 4, 0, svs.LVQStrategy.Auto),
        # One Level LVQ - 8 bits.
        (
            {"primary": 8, "strategy": svs.LVQStrategy.Sequential},
            8, 0, svs.LVQStrategy.Sequential,
        ),
        # Two level LVQ - 4x8 bits
        (
            {"primary": 4, "residual": 8, "strategy": svs.LVQStrategy.Turbo},
            4, 8, svs.LVQStrategy.Turbo,
        ),
        # Two level LVQ - 8x8 bits
        ({"primary": 8, "residual": 8}, 8, 8, svs.LVQStrategy.Auto),
    ]

    for keywords, expected_primary, expected_residual, expected_strategy in cases:
        lvq = svs.LVQLoader(loader, **keywords)
        self.assertEqual(lvq.dims, 128)
        self.assertEqual(lvq.primary_bits, expected_primary)
        self.assertEqual(lvq.residual_bits, expected_residual)
        self.assertEqual(lvq.strategy, expected_strategy)
74+
75+
def test_leanvec_loader(self):
    # Construct a LeanVec loader for every combination of primary kind, secondary kind,
    # alignment and reduced dimensionality, and check the reported configuration.
    loader = self._get_basic_loader()

    kinds = [
        svs.LeanVecKind.lvq4,
        svs.LeanVecKind.lvq8,
        svs.LeanVecKind.float16,
        svs.LeanVecKind.float32,
    ]
    alignments = [0, 32]
    reduced_dims = [64, 96]

    combinations = itertools.product(kinds, kinds, alignments, reduced_dims)
    for primary, secondary, alignment, leanvec_dims in combinations:
        leanvec = svs.LeanVecLoader(
            loader,
            leanvec_dims,
            primary_kind = primary,
            secondary_kind = secondary,
            alignment = alignment
        )

        # `dims` is expected to stay at 128 for every reduced dimensionality tried.
        self.assertEqual(leanvec.dims, 128)
        self.assertEqual(leanvec.primary_kind, primary)
        self.assertEqual(leanvec.secondary_kind, secondary)
        self.assertEqual(leanvec.alignment, alignment)

bindings/python/tests/test_reconstruction.py

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,13 @@
2626

2727
# Local dependencies
2828
from .common import \
29+
isapprox, \
2930
test_data_svs, \
3031
test_data_vecs, \
32+
test_data_dims, \
3133
test_graph, \
32-
test_vamana_config
34+
test_vamana_config, \
35+
test_close_lvq
3336

3437
DEBUG = False;
3538

@@ -38,9 +41,57 @@ class ReconstructionTester(unittest.TestCase):
3841
Test the reconstruction interface for indexes.
3942
"""
4043
def _get_loaders(self, loader: svs.VectorDataLoader):
    """
    Build the list of loaders exercised by the reconstruction test.

    Arguments:
        - loader: The uncompressed loader that every compressed loader wraps.

    Returns:
        - A list containing ``loader`` itself, followed by the LVQ loaders and then the
          LeanVec loaders.
    """
    sequential = svs.LVQStrategy.Sequential
    turbo = svs.LVQStrategy.Turbo

    float16 = svs.LeanVecKind.float16
    float32 = svs.LeanVecKind.float32
    lvq4 = svs.LeanVecKind.lvq4
    lvq8 = svs.LeanVecKind.lvq8

    def lvq(**keywords):
        # Every LVQ loader in this test is built with zero padding.
        return svs.LVQLoader(loader, padding = 0, **keywords)

    def leanvec(primary, secondary, **keywords):
        # Every LeanVec loader in this test reduces to 64 dimensions.
        return svs.LeanVecLoader(
            loader,
            leanvec_dims = 64,
            primary_kind = primary,
            secondary_kind = secondary,
            **keywords
        )

    return [
        # Uncompressed
        loader,
        # LVQ
        lvq(primary = 8),
        lvq(primary = 4),
        lvq(primary = 4, residual = 4, strategy = sequential),
        lvq(primary = 4, residual = 4, strategy = turbo),
        lvq(primary = 4, residual = 8, strategy = sequential),
        lvq(primary = 4, residual = 8, strategy = turbo),
        lvq(primary = 8, residual = 8),

        # LeanVec
        leanvec(float32, float32),
        leanvec(lvq4, lvq8, alignment = 0),
        leanvec(lvq8, lvq8, alignment = 0),
        leanvec(lvq8, float16, alignment = 0),
    ]
4596

4697
def _test_misc(self, loader: svs.VectorDataLoader, data):
@@ -68,6 +119,30 @@ def _test_misc(self, loader: svs.VectorDataLoader, data):
68119
vamana.reconstruct(np.zeros((10, 10), dtype = np.uint64)).shape == (10, 10, d)
69120
)
70121

122+
def _compare_lvq(self, data, reconstructed, loader: svs.LVQLoader):
    """
    Assert that data reconstructed from an LVQ dataset is within the LVQ tolerance of
    the original data.

    Arguments:
        - data: The original, uncompressed data.
        - reconstructed: The reconstructed data.
        - loader: The LVQ loader used; supplies the primary and residual bit widths.
    """
    # Check the loader type *before* reading LVQ-specific attributes, so that a wrong
    # loader is reported as a test failure rather than an AttributeError.
    self.assertIsInstance(loader, svs.LVQLoader)

    primary_bits = loader.primary_bits
    residual_bits = loader.residual_bits
    print(f"LVQ: primary = {primary_bits}, residual = {residual_bits}")

    self.assertTrue(
        test_close_lvq(
            data,
            reconstructed,
            primary_bits = primary_bits,
            residual_bits = residual_bits
        ),
        msg = (
            "LVQ reconstruction out of tolerance: "
            f"primary = {primary_bits}, residual = {residual_bits}"
        ),
    )
131+
132+
def _compare_leanvec(self, data, reconstructed, loader: svs.LeanVecLoader):
    """
    Assert that data reconstructed from a LeanVec dataset matches the original data to
    the precision implied by the loader's secondary kind.

    Arguments:
        - data: The original, uncompressed data.
        - reconstructed: The reconstructed data.
        - loader: The LeanVec loader used; its ``secondary_kind`` selects the check.

    Raises:
        - Exception: If the secondary kind is not one of the handled kinds.
    """
    self.assertIsInstance(loader, svs.LeanVecLoader)
    secondary_kind = loader.secondary_kind
    failure = f"LeanVec reconstruction mismatch for secondary kind {secondary_kind}"

    if secondary_kind == svs.LeanVecKind.float32:
        # Reconstruction is required to be exact.
        self.assertTrue(np.array_equal(data, reconstructed), msg = failure)
    elif secondary_kind == svs.LeanVecKind.float16:
        # NOTE(review): `np.allclose` is used with its default tolerances here, which
        # are tighter than float16 rounding error in general - presumably the test
        # data is exactly representable in float16; confirm.
        self.assertTrue(np.allclose(data, reconstructed), msg = failure)
    elif secondary_kind == svs.LeanVecKind.lvq4:
        self.assertTrue(
            test_close_lvq(data, reconstructed, primary_bits = 4), msg = failure
        )
    elif secondary_kind == svs.LeanVecKind.lvq8:
        self.assertTrue(
            test_close_lvq(data, reconstructed, primary_bits = 8), msg = failure
        )
    else:
        raise Exception(f"Unknown leanvec kind {secondary_kind}")
145+
71146
def test_reconstruction(self):
72147
default_loader = svs.VectorDataLoader(test_data_svs, svs.DataType.float32)
73148
all_loaders = self._get_loaders(default_loader)
@@ -88,6 +163,10 @@ def test_reconstruction(self):
88163

89164
if isinstance(loader, svs.VectorDataLoader):
90165
self.assertTrue(np.array_equal(shuffled_data, r))
166+
elif isinstance(loader, svs.LVQLoader):
167+
self._compare_lvq(shuffled_data, r, loader)
168+
elif isinstance(loader, svs.LeanVecLoader):
169+
self._compare_leanvec(shuffled_data, r, loader)
91170
else:
92171
raise Exception(f"Unhandled loader kind: {loader}")
93172

0 commit comments

Comments
 (0)