Skip to content

Commit f4726c0

Browse files
committed
feat(remove): refactor Remove with mark remove mode
- add and refactor new interfaces - implement if mark remove for HGraph, IVF Signed-off-by: LHT129 <[email protected]>
1 parent 9f7dca1 commit f4726c0

36 files changed

+910
-234
lines changed

include/vsag/index.h

Lines changed: 47 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,23 @@ using OffsetType = uint64_t;
5656
using SizeType = uint64_t;
5757
using WriteFuncType = std::function<void(OffsetType, SizeType, const void*)>;
5858

59+
enum class AddMode {
60+
/** try to reuse the memory of the deleted vector, no recovery check */
61+
DEFAULT = 0,
62+
63+
/** always allocate new memory for the vector, but also check whether it can be recovered from the same id */
64+
KEEP_TOMBSTONE = 1,
65+
};
66+
67+
enum class RemoveMode {
68+
/** mark the vector as deleted, but do not remove it from the index; no shrink or repair,
69+
* this mode is fast */
70+
MARK_REMOVE = 0,
71+
72+
/** remove the vector from the index and repair the index, but do not shrink the index,
73+
* this mode is heavy */
74+
REMOVE_AND_REPAIR = 1,
75+
};
5976
class Index {
6077
public:
6178
// [basic methods]
@@ -127,19 +144,40 @@ class Index {
127144
* @return IDs that failed to insert into the index
128145
*/
129146
virtual tl::expected<std::vector<int64_t>, Error>
130-
Add(const DatasetPtr& base) {
147+
Add(const DatasetPtr& base, AddMode mode = AddMode::DEFAULT) {
131148
throw std::runtime_error("Index not support adding vectors");
132149
}
133150

134151
/**
135-
* @brief Remove the vector corresponding to the given ID from the index
136-
*
137-
* @param id of the vector that need to be removed from the index
138-
* @return result indicates whether the remove operation is successful.
139-
*/
140-
virtual tl::expected<bool, Error>
141-
Remove(int64_t id) {
142-
throw std::runtime_error("Index not support delete vector");
152+
* @brief Remove the vectors corresponding to the given IDs from the index
153+
*
154+
* @param ids of the vectors that need to be removed from the index
155+
* @return number of vectors that were successfully removed from the index
156+
*/
157+
virtual tl::expected<uint32_t, Error>
158+
Remove(const std::vector<int64_t>& ids, RemoveMode mode = RemoveMode::MARK_REMOVE) {
159+
throw std::runtime_error("Index not support Remove");
160+
}
161+
162+
/**
163+
* @brief Remove the vector corresponding to the given ID from the index
164+
*
165+
* @param id of the vector that needs to be removed from the index
166+
* @return number of vectors that were successfully removed from the index
167+
*/
168+
virtual tl::expected<uint32_t, Error>
169+
Remove(int64_t id, RemoveMode mode = RemoveMode::MARK_REMOVE) {
170+
return this->Remove(std::vector<int64_t>({id}), mode);
171+
}
172+
173+
/**
174+
* @brief
175+
* 1. Shrink the index to release memory occupied by soft deleted vectors.
176+
* 2. Repair the index which is corrupted by soft delete.
177+
*/
178+
virtual void
179+
ShrinkAndRepair() {
180+
throw std::runtime_error("Index not support ShrinkAndRepair");
143181
}
144182

145183
/**

include/vsag/vsag_ext.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,7 +133,7 @@ class IndexHandler {
133133
tl::expected<std::vector<int64_t>, Error>
134134
Add(DatasetHandler* base);
135135

136-
tl::expected<bool, Error>
136+
tl::expected<uint32_t, Error>
137137
Remove(int64_t id);
138138

139139
tl::expected<DatasetHandler*, Error>

mockimpl/vsag/simpleflat.cpp

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ SimpleFlat::Build(const DatasetPtr& base) {
6969
}
7070

7171
tl::expected<std::vector<int64_t>, Error>
72-
SimpleFlat::Add(const DatasetPtr& base) {
72+
SimpleFlat::Add(const DatasetPtr& base, AddMode mode) {
7373
std::vector<int64_t> failed_ids;
7474
if (not this->data_.empty()) {
7575
if (this->dim_ != base->GetDim()) {
@@ -429,22 +429,25 @@ SimpleFlat::GetStats() const {
429429
return j.dump();
430430
}
431431

432-
tl::expected<bool, Error>
433-
SimpleFlat::Remove(int64_t id) {
434-
auto iter = std::find(ids_.begin(), ids_.end(), id);
435-
if (iter != ids_.end()) {
436-
int index = iter - ids_.begin();
437-
num_elements_--;
438-
ids_[index] = ids_[num_elements_];
439-
std::memcpy(
440-
data_.data() + index * dim_, data_.data() + num_elements_ * dim_, dim_ * sizeof(float));
441-
ids_.resize(num_elements_);
442-
data_.resize(num_elements_ * dim_);
443-
} else {
444-
return false;
432+
tl::expected<uint32_t, Error>
433+
SimpleFlat::Remove(const std::vector<int64_t>& ids, RemoveMode mode) {
434+
uint32_t removed = 0;
435+
for (auto id : ids) {
436+
auto iter = std::find(ids_.begin(), ids_.end(), id);
437+
if (iter != ids_.end()) {
438+
int index = iter - ids_.begin();
439+
num_elements_--;
440+
ids_[index] = ids_[num_elements_];
441+
std::memcpy(data_.data() + index * dim_,
442+
data_.data() + num_elements_ * dim_,
443+
dim_ * sizeof(float));
444+
ids_.resize(num_elements_);
445+
data_.resize(num_elements_ * dim_);
446+
removed++;
447+
}
445448
}
446449

447-
return true;
450+
return removed;
448451
}
449452

450453
} // namespace vsag

mockimpl/vsag/simpleflat.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,10 @@ class SimpleFlat : public Index {
2929
Build(const DatasetPtr& base) override;
3030

3131
virtual tl::expected<std::vector<int64_t>, Error>
32-
Add(const DatasetPtr& base) override;
32+
Add(const DatasetPtr& base, AddMode mode) override;
3333

34-
tl::expected<bool, Error>
35-
Remove(int64_t id) override;
34+
tl::expected<uint32_t, Error>
35+
Remove(const std::vector<int64_t>& ids, RemoveMode mode) override;
3636

3737
tl::expected<DatasetPtr, Error>
3838
KnnSearch(const DatasetPtr& query,

src/algorithm/brute_force.cpp

Lines changed: 40 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ BruteForce::Train(const DatasetPtr& data) {
6464
}
6565

6666
std::vector<int64_t>
67-
BruteForce::Add(const DatasetPtr& data) {
67+
BruteForce::Add(const DatasetPtr& data, AddMode mode) {
6868
std::vector<int64_t> failed_ids;
6969
auto base_dim = data->GetDim();
7070
CHECK_ARGUMENT(base_dim == dim_,
@@ -146,34 +146,44 @@ BruteForce::Add(const DatasetPtr& data) {
146146
return failed_ids;
147147
}
148148

149-
bool
150-
BruteForce::Remove(int64_t label) {
149+
uint32_t
150+
BruteForce::Remove(const std::vector<int64_t>& ids, RemoveMode mode) {
151151
CHECK_ARGUMENT(not use_attribute_filter_,
152152
"remove is not supported when use_attribute_filter is true");
153153

154+
uint32_t delete_count = 0;
155+
if (mode == RemoveMode::MARK_REMOVE) {
156+
std::scoped_lock label_lock(this->label_lookup_mutex_);
157+
delete_count = this->label_table_->MarkRemove(ids);
158+
delete_count_ += delete_count;
159+
return delete_count;
160+
}
161+
154162
std::scoped_lock lock(this->add_mutex_, this->label_lookup_mutex_);
155-
const auto last_inner_id = static_cast<InnerIdType>(this->total_count_ - 1);
156-
const auto inner_id = this->label_table_->GetIdByLabel(label);
163+
for (auto label : ids) {
164+
const auto last_inner_id = static_cast<InnerIdType>(this->total_count_ - 1);
165+
const auto inner_id = this->label_table_->GetIdByLabel(label);
157166

158-
CHECK_ARGUMENT(inner_id <= last_inner_id, "the element to be remove is invalid");
167+
CHECK_ARGUMENT(inner_id <= last_inner_id, "the element to be remove is invalid");
159168

160-
const auto last_label = this->label_table_->GetLabelById(last_inner_id);
161-
this->label_table_->Remove(label);
162-
--this->label_table_->total_count_;
169+
const auto last_label = this->label_table_->GetLabelById(last_inner_id);
170+
this->label_table_->MarkRemove(label);
171+
--this->label_table_->total_count_;
163172

164-
if (inner_id < last_inner_id) {
165-
Vector<float> data(dim_, allocator_);
166-
GetVectorByInnerId(last_inner_id, data.data());
173+
if (inner_id < last_inner_id) {
174+
Vector<float> data(dim_, allocator_);
175+
GetVectorByInnerId(last_inner_id, data.data());
167176

168-
this->label_table_->Remove(last_label);
169-
--this->label_table_->total_count_;
177+
this->label_table_->MarkRemove(last_label);
178+
--this->label_table_->total_count_;
170179

171-
this->inner_codes_->InsertVector(data.data(), inner_id);
172-
this->label_table_->Insert(inner_id, last_label);
173-
}
180+
this->inner_codes_->InsertVector(data.data(), inner_id);
181+
this->label_table_->Insert(inner_id, last_label);
182+
}
174183

175-
this->total_count_--;
176-
return true;
184+
this->total_count_--;
185+
}
186+
return 1;
177187
}
178188

179189
DatasetPtr
@@ -199,10 +209,18 @@ BruteForce::SearchWithRequest(const SearchRequest& request) const {
199209
DistHeapPtr heap = nullptr;
200210
ExecutorPtr executor = nullptr;
201211
Filter* attr_filter = nullptr;
202-
Filter* filter = nullptr;
212+
213+
auto combined_filter = std::make_shared<CombinedFilter>();
214+
combined_filter->AppendFilter(this->label_table_->GetDeletedIdsFilter());
203215
if (request.filter_ != nullptr) {
204-
filter = request.filter_.get();
216+
combined_filter->AppendFilter(
217+
std::make_shared<InnerIdWrapperFilter>(request.filter_, *this->label_table_));
205218
}
219+
FilterPtr ft = nullptr;
220+
if (not combined_filter->IsEmpty()) {
221+
ft = combined_filter;
222+
}
223+
206224
if (request.enable_attribute_filter_) {
207225
auto& schema = this->attr_filter_index_->field_type_map_;
208226
auto expr = AstParse(request.attribute_filter_str_, &schema);
@@ -228,7 +246,7 @@ BruteForce::SearchWithRequest(const SearchRequest& request) const {
228246
if (attr_filter != nullptr and not attr_filter->CheckValid(i)) {
229247
continue;
230248
}
231-
if (filter == nullptr or filter->CheckValid(this->label_table_->GetLabelById(i))) {
249+
if (ft == nullptr or ft->CheckValid(i)) {
232250
inner_codes_->Query(&dist, computer, &i, 1);
233251
++dist_cmp_local;
234252
cur_heap->Push(dist, i);

src/algorithm/brute_force.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class BruteForce : public InnerIndexInterface {
4444
~BruteForce() override = default;
4545

4646
std::vector<int64_t>
47-
Add(const DatasetPtr& data) override;
47+
Add(const DatasetPtr& base, AddMode mode = AddMode::DEFAULT) override;
4848

4949
std::vector<int64_t>
5050
Build(const DatasetPtr& data) override;
@@ -78,7 +78,12 @@ class BruteForce : public InnerIndexInterface {
7878

7979
[[nodiscard]] int64_t
8080
GetNumElements() const override {
81-
return this->total_count_;
81+
return this->total_count_ - this->delete_count_;
82+
}
83+
84+
[[nodiscard]] int64_t
85+
GetNumberRemoved() const override {
86+
return this->delete_count_;
8287
}
8388

8489
void
@@ -100,8 +105,8 @@ class BruteForce : public InnerIndexInterface {
100105
const FilterPtr& filter,
101106
int64_t limited_size = -1) const override;
102107

103-
bool
104-
Remove(int64_t label) override;
108+
uint32_t
109+
Remove(const std::vector<int64_t>& ids, RemoveMode mode = RemoveMode::MARK_REMOVE) override;
105110

106111
[[nodiscard]] DatasetPtr
107112
SearchWithRequest(const SearchRequest& request) const override;
@@ -138,6 +143,8 @@ class BruteForce : public InnerIndexInterface {
138143

139144
uint64_t total_count_{0};
140145

146+
uint64_t delete_count_{0};
147+
141148
uint64_t resize_increase_count_bit_{DEFAULT_RESIZE_BIT};
142149

143150
mutable std::shared_mutex global_mutex_;

0 commit comments

Comments
 (0)