Skip to content

Commit 555b2e9

Browse files
authored
Various small code cleanups from IVF_PQ OOC work (#552)
1 parent 4f58b38 commit 555b2e9

File tree

9 files changed

+84
-106
lines changed

9 files changed

+84
-106
lines changed

apis/python/src/tiledb/vector_search/index.py

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -542,10 +542,9 @@ def consolidate_updates(self, retrain_index: bool = False, **kwargs):
542542
tiledb.consolidate(self.updates_array_uri, config=conf)
543543

544544
# We don't copy the centroids if self.partitions=0 because this means our index was previously empty.
545-
should_pass_copy_centroids_uri = (
546-
self.index_type == "IVF_FLAT" and not retrain_index and self.partitions > 0
547-
)
548-
if should_pass_copy_centroids_uri:
545+
copy_centroids_uri = None
546+
if self.index_type == "IVF_FLAT" and not retrain_index and self.partitions > 0:
547+
copy_centroids_uri = self.centroids_uri
549548
# Make sure the user didn't pass an incorrect number of partitions.
550549
if "partitions" in kwargs and self.partitions != kwargs["partitions"]:
551550
raise ValueError(
@@ -565,9 +564,7 @@ def consolidate_updates(self, retrain_index: bool = False, **kwargs):
565564
index_timestamp=max_timestamp,
566565
distance_metric=self.distance_metric,
567566
storage_version=self.storage_version,
568-
copy_centroids_uri=self.centroids_uri
569-
if should_pass_copy_centroids_uri
570-
else None,
567+
copy_centroids_uri=copy_centroids_uri,
571568
config=self.config,
572569
**kwargs,
573570
)

apis/python/src/tiledb/vector_search/ingestion.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1738,9 +1738,7 @@ def ingest_vectors_udf(
17381738
if part_end > end:
17391739
part_end = end
17401740

1741-
str(part) + "-" + str(part_end)
17421741
part_id = int(part / batch)
1743-
part_id * (partitions + 1)
17441742

17451743
logger.debug("Input vectors start_pos: %d, end_pos: %d", part, part_end)
17461744
updated_ids = read_updated_ids(

apis/python/test/test_index.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -495,28 +495,28 @@ def test_index_with_incorrect_num_of_query_columns_complex(tmp_path):
495495
# number of columns in the indexed data.
496496
size = 1000
497497
indexes = ["FLAT", "IVF_FLAT", "VAMANA", "IVF_PQ"]
498-
num_columns_in_vector = [1, 2, 3, 4, 5, 10]
498+
dimensions_in_ingestion = [1, 2, 3, 4, 5, 10]
499499
for index_type in indexes:
500-
for num_columns in num_columns_in_vector:
501-
index_uri = os.path.join(tmp_path, f"array_{index_type}_{num_columns}")
502-
dataset_dir = os.path.join(tmp_path, f"dataset_{index_type}_{num_columns}")
500+
for dimensions in dimensions_in_ingestion:
501+
index_uri = os.path.join(tmp_path, f"array_{index_type}_{dimensions}")
502+
dataset_dir = os.path.join(tmp_path, f"dataset_{index_type}_{dimensions}")
503503
create_random_dataset_f32_only_data(
504-
nb=size, d=num_columns, centers=1, path=dataset_dir
504+
nb=size, d=dimensions, centers=1, path=dataset_dir
505505
)
506506
index = ingest(
507507
index_type=index_type,
508508
index_uri=index_uri,
509509
source_uri=os.path.join(dataset_dir, "data.f32bin"),
510-
num_subspaces=num_columns,
510+
num_subspaces=dimensions,
511511
partitions=1,
512512
)
513513

514-
# We have created a dataset with num_columns in each vector. Let's try creating queries
514+
# We have created a dataset with `dimensions` dimensions in each vector. Let's try creating queries
515515
# with different numbers of columns and confirming incorrect ones will throw.
516-
for num_columns_for_query in range(1, num_columns + 2):
517-
query_shape = (1, num_columns_for_query)
516+
for dimensions_in_query in range(1, dimensions + 2):
517+
query_shape = (1, dimensions_in_query)
518518
query = np.random.rand(*query_shape).astype(np.float32)
519-
if query.shape[1] == num_columns:
519+
if query.shape[1] == dimensions:
520520
index.query(query, k=1, nprobe=1)
521521
else:
522522
with pytest.raises(TypeError):

apis/python/test/test_type_erased_module.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,19 @@ def test_numpy_to_feature_vector_array_with_ids():
234234
assert b.ids_type_string() == "uint64"
235235

236236

237+
def test_numpy_to_feature_vector_single_item():
238+
a = np.array([1], dtype=np.float32)
239+
assert a.ndim == 1
240+
assert a.shape == (1,)
241+
b = vspy.FeatureVector(a)
242+
assert b.dimensions() == 1
243+
assert b.feature_type_string() == "float32"
244+
c = np.array(b)
245+
assert c.ndim == 1
246+
assert c.shape == (1,)
247+
assert (a == c).all()
248+
249+
237250
def test_TemporalPolicy():
238251
temporal_policy = vspy.TemporalPolicy()
239252
assert temporal_policy.timestamp_start() == 0

src/include/index/ivf_pq_group.h

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -163,34 +163,38 @@ class ivf_pq_group : public base_index_group<index_type> {
163163
/*****************************************************************************
164164
* Inverted index information: centroids, index, pq_parts, ids
165165
****************************************************************************/
166-
[[nodiscard]] auto cluster_centroids_uri() const {
167-
return this->array_key_to_uri("cluster_centroids_array_name");
168-
}
169166
[[nodiscard]] auto flat_ivf_centroids_uri() const {
170167
return this->array_key_to_uri("flat_ivf_centroids_array_name");
171168
}
172-
[[nodiscard]] auto pq_ivf_indices_uri() const {
173-
return this->array_key_to_uri("pq_ivf_indices_array_name");
174-
}
175-
[[nodiscard]] auto pq_ivf_ids_uri() const {
176-
return this->array_key_to_uri("pq_ivf_ids_array_name");
177-
}
178-
[[nodiscard]] auto pq_ivf_vectors_uri() const {
179-
return this->array_key_to_uri("pq_ivf_vectors_array_name");
169+
[[nodiscard]] auto flat_ivf_centroids_array_name() const {
170+
return this->array_key_to_array_name("flat_ivf_centroids_array_name");
180171
}
181172

173+
[[nodiscard]] auto cluster_centroids_uri() const {
174+
return this->array_key_to_uri("cluster_centroids_array_name");
175+
}
182176
[[nodiscard]] auto cluster_centroids_array_name() const {
183177
return this->array_key_to_array_name("cluster_centroids_array_name");
184178
}
185-
[[nodiscard]] auto flat_ivf_centroids_array_name() const {
186-
return this->array_key_to_array_name("flat_ivf_centroids_array_name");
179+
180+
[[nodiscard]] auto pq_ivf_indices_uri() const {
181+
return this->array_key_to_uri("pq_ivf_indices_array_name");
187182
}
188183
[[nodiscard]] auto pq_ivf_indices_array_name() const {
189184
return this->array_key_to_array_name("pq_ivf_indices_array_name");
190185
}
186+
187+
[[nodiscard]] auto pq_ivf_ids_uri() const {
188+
return this->array_key_to_uri("pq_ivf_ids_array_name");
189+
}
191190
[[nodiscard]] auto pq_ivf_ids_array_name() const {
192191
return this->array_key_to_array_name("pq_ivf_ids_array_name");
193192
}
193+
194+
[[nodiscard]] auto pq_ivf_vectors_uri() const {
195+
return this->array_key_to_uri("pq_ivf_vectors_array_name");
196+
}
197+
194198
[[nodiscard]] auto pq_ivf_vectors_array_name() const {
195199
return this->array_key_to_array_name("pq_ivf_vectors_array_name");
196200
}
@@ -205,6 +209,9 @@ class ivf_pq_group : public base_index_group<index_type> {
205209
metadata_.num_subspaces_ = num_subspaces;
206210
}
207211

212+
uint32_t get_sub_dimensions() const {
213+
return metadata_.sub_dimensions_;
214+
}
208215
void set_sub_dimensions(uint32_t sub_dimensions) {
209216
metadata_.sub_dimensions_ = sub_dimensions;
210217
}

src/include/index/ivf_pq_index.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1665,6 +1665,8 @@ class ivf_pq_index {
16651665
<< " num_vectors(rhs): " << ::num_vectors(rhs) << std::endl;
16661666
std::cout << "dimensions(lhs): " << ::dimensions(lhs)
16671667
<< " dimensions(rhs): " << ::dimensions(rhs) << std::endl;
1668+
debug_matrix(lhs, "[ivf_pq_index@compare_feature_vector_arrays] lhs");
1669+
debug_matrix(rhs, "[ivf_pq_index@compare_feature_vector_arrays] rhs");
16681670
return false;
16691671
}
16701672
for (size_t i = 0; i < ::num_vectors(lhs); ++i) {
@@ -1737,6 +1739,9 @@ class ivf_pq_index {
17371739
return true;
17381740
}
17391741
if (!partitioned_pq_vectors_ || !rhs.partitioned_pq_vectors_) {
1742+
std::cout << "[ivf_pq_index@compare_ivf_index] partitioned_pq_vectors_ "
1743+
"|| rhs.partitioned_pq_vectors_ is nullptr"
1744+
<< std::endl;
17401745
return false;
17411746
}
17421747
return compare_feature_vectors(
@@ -1750,6 +1755,9 @@ class ivf_pq_index {
17501755
return true;
17511756
}
17521757
if (!partitioned_pq_vectors_ || !rhs.partitioned_pq_vectors_) {
1758+
std::cout << "[ivf_pq_index@compare_ivf_ids] partitioned_pq_vectors_ || "
1759+
"rhs.partitioned_pq_vectors_ is nullptr"
1760+
<< std::endl;
17531761
return false;
17541762
}
17551763
return compare_feature_vectors(

src/include/test/unit_api_ivf_pq_index.cc

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -509,11 +509,11 @@ TEST_CASE("infer dimension", "[api_ivf_pq_index]") {
509509

510510
TEST_CASE("write and read", "[api_ivf_pq_index]") {
511511
auto ctx = tiledb::Context{};
512-
std::string api_ivf_pq_index_uri =
512+
std::string index_uri =
513513
(std::filesystem::temp_directory_path() / "api_ivf_pq_index").string();
514514
tiledb::VFS vfs(ctx);
515-
if (vfs.is_dir(api_ivf_pq_index_uri)) {
516-
vfs.remove_dir(api_ivf_pq_index_uri);
515+
if (vfs.is_dir(index_uri)) {
516+
vfs.remove_dir(index_uri);
517517
}
518518

519519
auto a = IndexIVFPQ(std::make_optional<IndexOptions>(
@@ -524,9 +524,9 @@ TEST_CASE("write and read", "[api_ivf_pq_index]") {
524524
auto training_set = FeatureVectorArray(ctx, siftsmall_inputs_uri);
525525
a.train(training_set);
526526
a.add(training_set);
527-
a.write_index(ctx, api_ivf_pq_index_uri);
527+
a.write_index(ctx, index_uri);
528528

529-
auto b = IndexIVFPQ(ctx, api_ivf_pq_index_uri);
529+
auto b = IndexIVFPQ(ctx, index_uri);
530530

531531
CHECK(dimensions(a) == dimensions(b));
532532
CHECK(a.feature_type() == b.feature_type());
@@ -561,10 +561,10 @@ TEST_CASE("read index and query", "[api_ivf_pq_index]") {
561561

562562
size_t k_nn = 10;
563563

564-
std::string api_ivf_pq_index_uri =
564+
std::string index_uri =
565565
(std::filesystem::temp_directory_path() / "api_ivf_pq_index").string();
566-
if (vfs.is_dir(api_ivf_pq_index_uri)) {
567-
vfs.remove_dir(api_ivf_pq_index_uri);
566+
if (vfs.is_dir(index_uri)) {
567+
vfs.remove_dir(index_uri);
568568
}
569569

570570
auto a = IndexIVFPQ(std::make_optional<IndexOptions>(
@@ -576,19 +576,19 @@ TEST_CASE("read index and query", "[api_ivf_pq_index]") {
576576
auto training_set = FeatureVectorArray(ctx, siftsmall_inputs_uri);
577577
a.train(training_set);
578578
a.add(training_set);
579-
a.write_index(ctx, api_ivf_pq_index_uri);
579+
a.write_index(ctx, index_uri);
580580

581581
auto query_set = FeatureVectorArray(ctx, siftsmall_query_uri);
582582
auto groundtruth_set = FeatureVectorArray(ctx, siftsmall_groundtruth_uri);
583583

584584
std::unique_ptr<IndexIVFPQ> b;
585585
SECTION("infinite") {
586-
b = std::make_unique<IndexIVFPQ>(ctx, api_ivf_pq_index_uri);
586+
b = std::make_unique<IndexIVFPQ>(ctx, index_uri);
587587
}
588588
SECTION("finite") {
589589
size_t upper_bound = GENERATE(500, 1000);
590590
b = std::make_unique<IndexIVFPQ>(
591-
ctx, api_ivf_pq_index_uri, IndexLoadStrategy::PQ_OOC, upper_bound);
591+
ctx, index_uri, IndexLoadStrategy::PQ_OOC, upper_bound);
592592
CHECK(b->upper_bound() == upper_bound);
593593
}
594594

src/include/test/unit_flat_pq_index.cc

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -534,13 +534,13 @@ TEMPLATE_TEST_CASE(
534534
scale *= a_vx_pqx2;
535535
}
536536

537-
CHECK(a_vx_pqx2 / scale < 0.00005);
538-
CHECK(a_dpqx_evx2 / scale < 0.00005);
539-
CHECK(s_evx_pqx2 / scale < 0.00005);
540-
CHECK(ss_vx_dpqx2 / scale < 0.00005);
541-
CHECK(s_evx_edpqx2 / scale < 0.00005);
542-
CHECK(a_evx_edpqx2 / scale < 0.00005);
543-
CHECK(ss_devx_dpqx2 / scale < 0.00005);
537+
CHECK(a_vx_pqx2 / scale < 0.00006);
538+
CHECK(a_dpqx_evx2 / scale < 0.00006);
539+
CHECK(s_evx_pqx2 / scale < 0.00006);
540+
CHECK(ss_vx_dpqx2 / scale < 0.00006);
541+
CHECK(s_evx_edpqx2 / scale < 0.00006);
542+
CHECK(a_evx_edpqx2 / scale < 0.00006);
543+
CHECK(ss_devx_dpqx2 / scale < 0.00006);
544544
CHECK(ss_devx_vx2 / scale < 0.000075);
545545
}
546546
}

0 commit comments

Comments
 (0)