Skip to content

Commit ff7d04e

Browse files
authored
Distance metric small fixes: uninitialized value in IVF_PQ and pass setting during consolidate_updates() (#508)
1 parent c390709 commit ff7d04e

File tree

8 files changed

+66
-32
lines changed

8 files changed

+66
-32
lines changed

apis/python/src/tiledb/vector_search/index.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -541,6 +541,7 @@ def consolidate_updates(self, retrain_index: bool = False, **kwargs):
541541
external_ids_type="TILEDB_ARRAY",
542542
updates_uri=self.updates_array_uri,
543543
index_timestamp=max_timestamp,
544+
distance_metric=self.distance_metric,
544545
storage_version=self.storage_version,
545546
copy_centroids_uri=self.centroids_uri
546547
if should_pass_copy_centroids_uri

apis/python/src/tiledb/vector_search/ingestion.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2314,7 +2314,11 @@ def scale_resources(min_resource, max_resource, max_input_size, input_size):
23142314
# Which reads the vectors and normalizes them, then swaps source_uri for normalized_uri
23152315
# This is because the cosine distance metric requires normalized vectors
23162316
normalization_nodes = []
2317-
if distance_metric == vspy.DistanceMetric.COSINE and not normalized:
2317+
if (
2318+
distance_metric == vspy.DistanceMetric.COSINE
2319+
and not normalized
2320+
and size > 0
2321+
):
23182322
group = tiledb.Group(index_group_uri, "w")
23192323
normalized_uri = create_array(
23202324
group=group,

apis/python/test/common.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -450,3 +450,24 @@ def query_and_check(index, queries, k, expected, expected_distances=None, **kwar
450450
atol=1e-5,
451451
err_msg=f"Distance mismatch for ID {id}",
452452
)
453+
454+
455+
def sum_of_squares_distance(a, b):
456+
return np.sum((a - b) ** 2)
457+
458+
459+
def cosine_distance(a, b):
460+
return 1 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
461+
462+
463+
def l2_distance(a, b):
464+
return np.sqrt(np.sum((a - b) ** 2))
465+
466+
467+
def normalize_vectors(vectors):
468+
norms = np.linalg.norm(vectors, axis=1, keepdims=True)
469+
return vectors / norms
470+
471+
472+
def normalize_vector(vector):
473+
return vector / np.linalg.norm(vector)

apis/python/test/test_distance_metrics.py

Lines changed: 0 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -14,16 +14,6 @@
1414
from tiledb.vector_search.utils import MAX_UINT64
1515
from tiledb.vector_search.utils import load_fvecs
1616

17-
18-
def normalize_vectors(vectors):
19-
norms = np.linalg.norm(vectors, axis=1, keepdims=True)
20-
return vectors / norms
21-
22-
23-
def normalize_vector(vector):
24-
return vector / np.linalg.norm(vector)
25-
26-
2717
MINIMUM_ACCURACY = 0.85
2818
MINIMUM_ACCURACY_IVF_PQ = 0.75
2919

@@ -384,14 +374,6 @@ def test_vamana_create_cosine(tmp_path):
384374
)
385375

386376

387-
def cosine_distance(a, b):
388-
return 1 - np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
389-
390-
391-
def l2_distance(a, b):
392-
return np.sqrt(np.sum((a - b) ** 2))
393-
394-
395377
def test_ivf_flat_create_cosine_numpy(tmp_path):
396378
index_uri = os.path.join(tmp_path, "sift10k_flat_COSINE")
397379

@@ -472,10 +454,6 @@ def test_vamana_create_inner_product(tmp_path):
472454
)
473455

474456

475-
def sum_of_squares_distance(a, b):
476-
return np.sum((a - b) ** 2)
477-
478-
479457
def test_ivfpq_create_sum_of_squares(tmp_path):
480458
index_uri = os.path.join(tmp_path, "sift10k_flat_sum_of_squares")
481459
ingest(

src/include/api/ivf_pq_index.h

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -322,6 +322,10 @@ class IndexIVFPQ {
322322
return reassign_ratio_;
323323
}
324324

325+
constexpr DistanceMetric distance_metric() const {
326+
return distance_metric_;
327+
}
328+
325329
constexpr tiledb_datatype_t feature_type() const {
326330
return feature_datatype_;
327331
}
@@ -608,7 +612,7 @@ class IndexIVFPQ {
608612
tiledb_datatype_t id_datatype_{TILEDB_ANY};
609613
tiledb_datatype_t partitioning_index_datatype_{TILEDB_ANY};
610614
std::unique_ptr<index_base> index_;
611-
DistanceMetric distance_metric_;
615+
DistanceMetric distance_metric_{DistanceMetric::SUM_OF_SQUARES};
612616
};
613617

614618
// clang-format off

src/include/api/vamana_index.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -283,6 +283,10 @@ class IndexVamana {
283283
return r_max_degree_;
284284
}
285285

286+
constexpr DistanceMetric distance_metric() const {
287+
return distance_metric_;
288+
}
289+
286290
constexpr tiledb_datatype_t feature_type() const {
287291
return feature_datatype_;
288292
}

src/include/test/unit_api_ivf_pq_index.cc

Lines changed: 17 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,7 @@ TEST_CASE("init constructor", "[api_ivf_pq_index]") {
4646
a.partitioning_index_type_string() ==
4747
datatype_to_string(TILEDB_UINT32));
4848
CHECK(dimensions(a) == 0);
49+
CHECK(a.distance_metric() == DistanceMetric::SUM_OF_SQUARES);
4950
}
5051

5152
SECTION("float uint32 uint32") {
@@ -202,6 +203,7 @@ TEST_CASE("create empty index and then train and query", "[api_ivf_pq_index]") {
202203
CHECK(index.feature_type_string() == feature_type);
203204
CHECK(index.id_type_string() == id_type);
204205
CHECK(index.partitioning_index_type_string() == partitioning_index_type);
206+
CHECK(index.distance_metric() == DistanceMetric::SUM_OF_SQUARES);
205207
}
206208

207209
{
@@ -242,6 +244,7 @@ TEST_CASE(
242244
auto partitioning_index_type = "uint32";
243245
uint64_t dimensions = 3;
244246
uint32_t num_subspaces = 1;
247+
auto distance_metric = DistanceMetric::L2;
245248

246249
std::string index_uri =
247250
(std::filesystem::temp_directory_path() / "api_ivf_pq_index").string();
@@ -251,13 +254,14 @@ TEST_CASE(
251254
}
252255

253256
{
254-
auto index = IndexIVFPQ(std::make_optional<IndexOptions>({
255-
{"feature_type", feature_type},
256-
{"id_type", id_type},
257-
{"partitioning_index_type", partitioning_index_type},
258-
{"dimensions", std::to_string(dimensions)},
259-
{"num_subspaces", std::to_string(num_subspaces)},
260-
}));
257+
auto index = IndexIVFPQ(std::make_optional<IndexOptions>(
258+
{{"feature_type", feature_type},
259+
{"id_type", id_type},
260+
{"partitioning_index_type", partitioning_index_type},
261+
{"dimensions", std::to_string(dimensions)},
262+
{"num_subspaces", std::to_string(num_subspaces)},
263+
{"distance_metric",
264+
std::to_string(static_cast<size_t>(distance_metric))}}));
261265

262266
size_t num_vectors = 0;
263267
auto empty_training_vector_array =
@@ -271,6 +275,7 @@ TEST_CASE(
271275
CHECK(index.partitioning_index_type_string() == partitioning_index_type);
272276
CHECK(index.dimensions() == dimensions);
273277
CHECK(index.num_subspaces() == num_subspaces);
278+
CHECK(index.distance_metric() == distance_metric);
274279
}
275280

276281
{
@@ -281,6 +286,7 @@ TEST_CASE(
281286
CHECK(index.partitioning_index_type_string() == partitioning_index_type);
282287
CHECK(index.dimensions() == dimensions);
283288
CHECK(index.num_subspaces() == num_subspaces);
289+
CHECK(index.distance_metric() == distance_metric);
284290
auto training = ColMajorMatrixWithIds<feature_type_type, id_type_type>{
285291
{{8, 6, 7}, {5, 3, 0}, {9, 5, 0}, {2, 7, 3}}, {10, 11, 12, 13}};
286292

@@ -292,6 +298,9 @@ TEST_CASE(
292298
CHECK(index.feature_type_string() == feature_type);
293299
CHECK(index.id_type_string() == id_type);
294300
CHECK(index.partitioning_index_type_string() == partitioning_index_type);
301+
CHECK(index.dimensions() == dimensions);
302+
CHECK(index.num_subspaces() == num_subspaces);
303+
CHECK(index.distance_metric() == distance_metric);
295304

296305
auto queries = ColMajorMatrix<feature_type_type>{
297306
{{8, 6, 7}, {5, 3, 0}, {9, 5, 0}, {2, 7, 3}}};
@@ -370,6 +379,7 @@ TEST_CASE(
370379
CHECK(index.feature_type_string() == feature_type);
371380
CHECK(index.id_type_string() == id_type);
372381
CHECK(index.partitioning_index_type_string() == partitioning_index_type);
382+
CHECK(index.distance_metric() == DistanceMetric::SUM_OF_SQUARES);
373383
}
374384

375385
{

src/include/test/unit_api_vamana_index.cc

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,7 @@ TEST_CASE("create empty index and then train and query", "[api_vamana_index]") {
157157

158158
CHECK(index.feature_type_string() == feature_type);
159159
CHECK(index.id_type_string() == id_type);
160+
CHECK(index.distance_metric() == DistanceMetric::SUM_OF_SQUARES);
160161
}
161162

162163
{
@@ -202,6 +203,7 @@ TEST_CASE(
202203
auto feature_type = "uint8";
203204
auto id_type = "uint32";
204205
uint64_t dimensions = 3;
206+
auto distance_metric = DistanceMetric::L2;
205207

206208
std::string index_uri =
207209
(std::filesystem::temp_directory_path() / "api_vamana_index").string();
@@ -212,7 +214,10 @@ TEST_CASE(
212214

213215
{
214216
auto index = IndexVamana(std::make_optional<IndexOptions>(
215-
{{"feature_type", feature_type}, {"id_type", id_type}}));
217+
{{"feature_type", feature_type},
218+
{"id_type", id_type},
219+
{"distance_metric",
220+
std::to_string(static_cast<size_t>(distance_metric))}}));
216221

217222
size_t num_vectors = 0;
218223
auto empty_training_vector_array =
@@ -223,13 +228,15 @@ TEST_CASE(
223228

224229
CHECK(index.feature_type_string() == feature_type);
225230
CHECK(index.id_type_string() == id_type);
231+
CHECK(index.distance_metric() == distance_metric);
226232
}
227233

228234
{
229235
auto index = IndexVamana(ctx, index_uri);
230236

231237
CHECK(index.feature_type_string() == feature_type);
232238
CHECK(index.id_type_string() == id_type);
239+
CHECK(index.distance_metric() == distance_metric);
233240

234241
auto training = ColMajorMatrixWithIds<feature_type_type, id_type_type>{
235242
{{8, 6, 7}, {5, 3, 0}, {9, 5, 0}, {2, 7, 3}}, {10, 11, 12, 13}};
@@ -241,6 +248,7 @@ TEST_CASE(
241248

242249
CHECK(index.feature_type_string() == feature_type);
243250
CHECK(index.id_type_string() == id_type);
251+
CHECK(index.distance_metric() == distance_metric);
244252

245253
auto queries = ColMajorMatrix<feature_type_type>{
246254
{{8, 6, 7}, {5, 3, 0}, {9, 5, 0}, {2, 7, 3}}};
@@ -287,6 +295,7 @@ TEST_CASE(
287295

288296
CHECK(index.feature_type_string() == feature_type);
289297
CHECK(index.id_type_string() == id_type);
298+
CHECK(index.distance_metric() == DistanceMetric::SUM_OF_SQUARES);
290299
}
291300

292301
{
@@ -302,6 +311,7 @@ TEST_CASE(
302311

303312
CHECK(index.feature_type_string() == feature_type);
304313
CHECK(index.id_type_string() == id_type);
314+
CHECK(index.distance_metric() == DistanceMetric::SUM_OF_SQUARES);
305315

306316
auto query_set = FeatureVectorArray(ctx, siftsmall_query_uri);
307317
auto groundtruth_set = FeatureVectorArray(ctx, siftsmall_groundtruth_uri);
@@ -321,6 +331,7 @@ TEST_CASE("infer feature type", "[api_vamana_index]") {
321331
a.train(training_set);
322332
CHECK(a.feature_type() == TILEDB_FLOAT32);
323333
CHECK(a.id_type() == TILEDB_UINT32);
334+
CHECK(a.distance_metric() == DistanceMetric::SUM_OF_SQUARES);
324335
}
325336

326337
TEST_CASE("infer dimension", "[api_vamana_index]") {
@@ -332,6 +343,7 @@ TEST_CASE("infer dimension", "[api_vamana_index]") {
332343
a.train(training_set);
333344
CHECK(a.feature_type() == TILEDB_FLOAT32);
334345
CHECK(a.id_type() == TILEDB_UINT32);
346+
CHECK(a.distance_metric() == DistanceMetric::SUM_OF_SQUARES);
335347
CHECK(dimensions(a) == 128);
336348
}
337349

0 commit comments

Comments
 (0)