Skip to content

Commit fb12012

Browse files
authored
Fix type-erased indexes writing fragments at timestamp=0, thus fixing IVF PQ time travel (#425)
1 parent 5c8fcf9 commit fb12012

File tree

4 files changed

+84
-7
lines changed

4 files changed

+84
-7
lines changed

apis/python/test/test_ingestion.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -957,10 +957,6 @@ def test_ingestion_with_updates_and_timetravel(tmp_path):
957957
assert ingestion_timestamps == [1]
958958
assert base_sizes == [size]
959959

960-
if index_type == "IVF_PQ":
961-
# TODO(SC-48897): Fix time travelling for IVF_PQ and re-enable.
962-
continue
963-
964960
if index_type == "IVF_FLAT":
965961
assert index.partitions == partitions
966962

@@ -1226,9 +1222,6 @@ def test_ingestion_with_additions_and_timetravel(tmp_path):
12261222
)
12271223
if index_type == "IVF_FLAT":
12281224
assert index.partitions == partitions
1229-
if index_type == "IVF_PQ":
1230-
# TODO(SC-48897): Fix time travelling for IVF_PQ and re-enable.
1231-
continue
12321225
_, result = index.query(queries, k=k, nprobe=partitions)
12331226
assert accuracy(result, gt_i) == 1.0
12341227

src/include/index/ivf_pq_index.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1040,6 +1040,15 @@ class ivf_pq_index {
10401040
write_group.append_num_partitions(num_partitions_);
10411041
}
10421042

1043+
// When creating from Python we initially call write_index() at timestamp 0.
1044+
// The goal here is just to create the arrays and save metadata. Return here
1045+
// so that we don't write the arrays: if we write with timestamp=0 then
1046+
// TileDB Core will interpret this as the current timestamp instead, leading
1047+
// to array fragments created at the current time.
1048+
if (temporal_policy_.timestamp_end() == 0) {
1049+
return true;
1050+
}
1051+
10431052
// flat_ivf_centroids_, cluster_centroids_, distance_tables_
10441053
// pq_ivf_centroids_, partitioned_pq_vectors_, unpartitioned_pq_vectors_
10451054

src/include/index/vamana_index.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -757,6 +757,15 @@ class vamana_index {
757757
write_group.append_num_edges(graph_.num_edges());
758758
}
759759

760+
// When creating from Python we initially call write_index() at timestamp 0.
761+
// The goal here is just to create the arrays and save metadata. Return here
762+
// so that we don't write the arrays: if we write with timestamp=0 then
763+
// TileDB Core will interpret this as the current timestamp instead, leading
764+
// to array fragments created at the current time.
765+
if (temporal_policy_.timestamp_end() == 0) {
766+
return true;
767+
}
768+
760769
write_matrix(
761770
ctx,
762771
feature_vectors_,

src/include/test/unit_ivf_pq_index.cc

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -695,3 +695,69 @@ TEST_CASE("query simple", "[ivf_pq_index]") {
695695
}
696696
}
697697
}
698+
699+
TEST_CASE("ivf_pq_index query index written twice", "[ivf_pq_index]") {
700+
tiledb::Context ctx;
701+
tiledb::VFS vfs(ctx);
702+
std::string index_uri =
703+
(std::filesystem::temp_directory_path() / "tmp_ivf_pq_index").string();
704+
std::cout << "index_uri: " << index_uri << std::endl;
705+
if (vfs.is_dir(index_uri)) {
706+
vfs.remove_dir(index_uri);
707+
}
708+
709+
using feature_type_type = uint8_t;
710+
using id_type_type = uint32_t;
711+
using partitioning_index_type_type = uint32_t;
712+
auto feature_type = "uint8";
713+
auto id_type = "uint32";
714+
auto partitioning_index_type = "uint32";
715+
size_t dimensions = 3;
716+
size_t n_list = 1;
717+
size_t num_subspaces = 1;
718+
float convergence_tolerance = 0.00003f;
719+
size_t max_iterations = 3;
720+
721+
// Write the empty index at timestamp 0.
722+
{
723+
auto index = ivf_pq_index<
724+
feature_type_type,
725+
id_type_type,
726+
partitioning_index_type_type>(n_list, dimensions / 2);
727+
auto data =
728+
ColMajorMatrixWithIds<feature_type_type, id_type_type>(dimensions, 0);
729+
index.train(data, data.raveled_ids());
730+
index.add(data, data.raveled_ids());
731+
index.write_index(ctx, index_uri, TemporalPolicy(TimeTravel, 0));
732+
}
733+
734+
// Train the index, add the vectors, and write it at timestamp 99.
735+
{
736+
auto index = ivf_pq_index<
737+
feature_type_type,
738+
id_type_type,
739+
partitioning_index_type_type>(ctx, index_uri);
740+
auto data = ColMajorMatrixWithIds<feature_type_type, id_type_type>{
741+
{{1, 1, 1}, {2, 2, 2}, {3, 3, 3}, {4, 4, 4}}, {1, 2, 3, 4}};
742+
index.train(data, data.raveled_ids());
743+
index.add(data, data.raveled_ids());
744+
index.write_index(ctx, index_uri, TemporalPolicy(TimeTravel, 99));
745+
}
746+
747+
// Load the index and query.
748+
{
749+
auto index = ivf_pq_index<
750+
feature_type_type,
751+
id_type_type,
752+
partitioning_index_type_type>(ctx, index_uri);
753+
auto queries = ColMajorMatrix<feature_type_type>{
754+
{1, 1, 1}, {2, 2, 2}, {3, 3, 3}, {4, 4, 4}};
755+
auto&& [scores, ids] = index.query_infinite_ram(queries, 1, n_list);
756+
CHECK(std::equal(
757+
scores.data(),
758+
scores.data() + 4,
759+
std::vector<float>{0, 0, 0, 0}.begin()));
760+
CHECK(std::equal(
761+
ids.data(), ids.data() + 4, std::vector<uint32_t>{1, 2, 3, 4}.begin()));
762+
}
763+
}

0 commit comments

Comments
 (0)