Skip to content

Commit 525160d

Browse files
Author: Nikos Papailiou (committed)
Commit message: Avoid checking deleted_ids when there are no updates
1 parent: 9d3ee02 · commit: 525160d

File tree

2 files changed: +26 additions, -5 deletions (lines changed)

apis/python/test/test_ingestion.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -283,7 +283,6 @@ def test_ivf_flat_ingestion_external_ids_numpy(tmp_path):
283283
def test_ivf_flat_ingestion_with_updates(tmp_path):
284284
dataset_dir = os.path.join(tmp_path, "dataset")
285285
index_uri = os.path.join(tmp_path, "array")
286-
index_uri_2 = os.path.join(tmp_path, "array_2")
287286
k = 10
288287
size = 100000
289288
partitions = 100

src/include/detail/ivf/index.h

Lines changed: 26 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -77,11 +77,18 @@ int ivf_index(
7777
std::unordered_set<ids_type> deleted_ids_set(deleted_ids.begin(), deleted_ids.end());
7878
std::vector<size_t> degrees(centroids.num_cols());
7979
std::vector<ids_type> indices(centroids.num_cols() + 1);
80-
for (size_t i = 0; i < db.num_cols(); ++i) {
81-
if (auto iter = deleted_ids_set.find(external_ids[i]); iter == deleted_ids_set.end()) {
80+
if (deleted_ids.empty()) {
81+
for (size_t i = 0; i < db.num_cols(); ++i) {
8282
auto j = parts[i];
8383
++degrees[j];
8484
}
85+
} else {
86+
for (size_t i = 0; i < db.num_cols(); ++i) {
87+
if (auto iter = deleted_ids_set.find(external_ids[i]); iter == deleted_ids_set.end()) {
88+
auto j = parts[i];
89+
++degrees[j];
90+
}
91+
}
8592
}
8693
indices[0] = 0;
8794
std::inclusive_scan(begin(degrees), end(degrees), begin(indices) + 1);
@@ -112,8 +119,8 @@ int ivf_index(
112119
// which will group them nicely -- but a distributed parallel sort may
113120
// be difficult to implement. Even this algorithm is not trivial to
114121
// parallelize, because of the random access to the indices array.
115-
for (size_t i = 0; i < db.num_cols(); ++i) {
116-
if (auto iter = deleted_ids_set.find(external_ids[i]); iter == deleted_ids_set.end()) {
122+
if (deleted_ids.empty()) {
123+
for (size_t i = 0; i < db.num_cols(); ++i) {
117124
size_t bin = parts[i];
118125
size_t ibin = indices[bin];
119126

@@ -125,6 +132,21 @@ int ivf_index(
125132
}
126133
++indices[bin];
127134
}
135+
} else {
136+
for (size_t i = 0; i < db.num_cols(); ++i) {
137+
if (auto iter = deleted_ids_set.find(external_ids[i]); iter == deleted_ids_set.end()) {
138+
size_t bin = parts[i];
139+
size_t ibin = indices[bin];
140+
141+
shuffled_ids[ibin] = external_ids[i];
142+
143+
assert(ibin < shuffled_db.num_cols());
144+
for (size_t j = 0; j < db.num_rows(); ++j) {
145+
shuffled_db(j, ibin) = db(j, i);
146+
}
147+
++indices[bin];
148+
}
149+
}
128150
}
129151

130152
std::shift_right(begin(indices), end(indices), 1);

0 commit comments

Comments
 (0)