Skip to content

Commit 97a15dc

Browse files
chore(search_family): Minor performance improvements to GetAllResults methods. FIRST PR (#5341)
* fix(search_family): Improve performance of GetAllResults methods Signed-off-by: Stepan Bagritsevich <[email protected]> * refactor: address comments Signed-off-by: Stepan Bagritsevich <[email protected]> --------- Signed-off-by: Stepan Bagritsevich <[email protected]>
1 parent 3e3b3c3 commit 97a15dc

File tree

6 files changed

+123
-89
lines changed

6 files changed

+123
-89
lines changed

src/core/search/base.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <string_view>
1414
#include <vector>
1515

16+
#include "absl/container/flat_hash_set.h"
1617
#include "base/pmr/memory_resource.h"
1718
#include "core/string_map.h"
1819

@@ -80,6 +81,11 @@ struct DocumentAccessor {
8081
virtual std::optional<StringList> GetTags(std::string_view active_field) const = 0;
8182
};
8283

84+
// Represents a set of document IDs, used for merging results of inverse indices.
85+
template <typename Allocator = std::allocator<DocId>>
86+
using UniqueDocsList = absl::flat_hash_set<DocId, absl::DefaultHashContainerHash<DocId>,
87+
absl::DefaultHashContainerEq<DocId>, Allocator>;
88+
8389
// Base class for type-specific indices.
8490
//
8591
// Queries should be done directly on subclasses with their distinc
@@ -92,9 +98,8 @@ struct BaseIndex {
9298
virtual void Remove(DocId id, const DocumentAccessor& doc, std::string_view field) = 0;
9399

94100
// Returns documents that have non-null values for this field (used for @field:* queries)
95-
virtual std::optional<std::vector<DocId>> GetAllResults() const {
96-
return std::nullopt;
97-
}
101+
// Result must be sorted
102+
virtual std::vector<DocId> GetAllDocsWithNonNullValues() const = 0;
98103
};
99104

100105
// Base class for type-specific sorting indices.

src/core/search/indices.cc

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,24 @@ vector<DocId> NumericIndex::Range(double l, double r) const {
115115
return out;
116116
}
117117

118+
vector<DocId> NumericIndex::GetAllDocsWithNonNullValues() const {
119+
UniqueDocsList<> unique_docs;
120+
std::vector<DocId> result;
121+
122+
unique_docs.reserve(entries_.size());
123+
result.reserve(entries_.size());
124+
125+
for (const auto& [_, doc_id] : entries_) {
126+
const auto [__, is_new] = unique_docs.insert(doc_id);
127+
if (is_new) {
128+
result.push_back(doc_id);
129+
}
130+
}
131+
132+
std::sort(result.begin(), result.end());
133+
return result;
134+
}
135+
118136
template <typename C>
119137
BaseStringIndex<C>::BaseStringIndex(PMR_NS::memory_resource* mr, bool case_sensitive)
120138
: case_sensitive_{case_sensitive}, entries_{mr} {
@@ -196,6 +214,26 @@ template <typename C> vector<string> BaseStringIndex<C>::GetTerms() const {
196214
return res;
197215
}
198216

217+
template <typename C> vector<DocId> BaseStringIndex<C>::GetAllDocsWithNonNullValues() const {
218+
UniqueDocsList<> unique_docs;
219+
std::vector<DocId> result;
220+
221+
unique_docs.reserve(entries_.size());
222+
result.reserve(entries_.size());
223+
224+
for (const auto& [_, container] : entries_) {
225+
for (const auto& doc_id : container) {
226+
auto [_, is_new] = unique_docs.insert(doc_id);
227+
if (is_new) {
228+
result.push_back(doc_id);
229+
}
230+
}
231+
}
232+
233+
std::sort(result.begin(), result.end());
234+
return result;
235+
}
236+
199237
template struct BaseStringIndex<CompressedSortedSet>;
200238
template struct BaseStringIndex<SortedVector>;
201239

@@ -264,6 +302,36 @@ const float* FlatVectorIndex::Get(DocId doc) const {
264302
return &entries_[doc * dim_];
265303
}
266304

305+
std::vector<DocId> FlatVectorIndex::GetAllDocsWithNonNullValues() const {
306+
std::vector<DocId> result;
307+
308+
size_t num_vectors = entries_.size() / dim_;
309+
result.reserve(num_vectors);
310+
311+
for (DocId id = 0; id < num_vectors; ++id) {
312+
// Check if the vector is not zero (all elements are 0)
313+
// TODO: Valid vector can contain 0s, we should use a better approach
314+
const float* vec = Get(id);
315+
bool is_zero_vector = true;
316+
317+
// TODO: Consider don't use check for zero vector
318+
for (size_t i = 0; i < dim_; ++i) {
319+
if (vec[i] != 0.0f) { // TODO: Consider using a threshold for float comparison
320+
is_zero_vector = false;
321+
break;
322+
}
323+
}
324+
325+
if (!is_zero_vector) {
326+
result.push_back(id);
327+
}
328+
}
329+
330+
// Result is already sorted by id, no need to sort again
331+
// Also it has no duplicates
332+
return result;
333+
}
334+
267335
struct HnswlibAdapter {
268336
// Default setting of hnswlib/hnswalg
269337
constexpr static size_t kDefaultEfRuntime = 10;

src/core/search/indices.h

Lines changed: 8 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,7 @@ struct NumericIndex : public BaseIndex {
3535

3636
std::vector<DocId> Range(double l, double r) const;
3737

38-
std::optional<std::vector<DocId>> GetAllResults() const override {
39-
return Range(-std::numeric_limits<double>::infinity(), std::numeric_limits<double>::infinity());
40-
}
38+
std::vector<DocId> GetAllDocsWithNonNullValues() const override;
4139

4240
private:
4341
using Entry = std::pair<double, DocId>;
@@ -62,19 +60,7 @@ template <typename C> struct BaseStringIndex : public BaseIndex {
6260
// Returns all the terms that appear as keys in the reverse index.
6361
std::vector<std::string> GetTerms() const;
6462

65-
std::optional<std::vector<DocId>> GetAllResults() const override {
66-
absl::flat_hash_set<DocId> unique_docs;
67-
68-
for (const auto& [term, container] : entries_) {
69-
for (const DocId& id : container) {
70-
unique_docs.insert(id);
71-
}
72-
}
73-
74-
auto result = std::vector<DocId>(unique_docs.begin(), unique_docs.end());
75-
std::sort(result.begin(), result.end());
76-
return result;
77-
}
63+
std::vector<DocId> GetAllDocsWithNonNullValues() const override;
7864

7965
protected:
8066
using StringList = DocumentAccessor::StringList;
@@ -152,31 +138,7 @@ struct FlatVectorIndex : public BaseVectorIndex {
152138
const float* Get(DocId doc) const;
153139

154140
// Return all documents that have vectors in this index
155-
std::optional<std::vector<DocId>> GetAllResults() const override {
156-
std::vector<DocId> result;
157-
size_t num_vectors = entries_.size() / dim_;
158-
result.reserve(num_vectors);
159-
160-
for (DocId id = 0; id < num_vectors; ++id) {
161-
// Check if the vector is not zero (all elements are 0)
162-
// TODO: Valid vector can contain 0s, we should use a better approach
163-
const float* vec = Get(id);
164-
bool is_zero_vector = true;
165-
166-
for (size_t i = 0; i < dim_; ++i) {
167-
if (vec[i] != 0.0f) {
168-
is_zero_vector = false;
169-
break;
170-
}
171-
}
172-
173-
if (!is_zero_vector) {
174-
result.push_back(id);
175-
}
176-
}
177-
178-
return result;
179-
}
141+
std::vector<DocId> GetAllDocsWithNonNullValues() const override;
180142

181143
protected:
182144
void AddVector(DocId id, const VectorPtr& vector) override;
@@ -187,10 +149,6 @@ struct FlatVectorIndex : public BaseVectorIndex {
187149

188150
struct HnswlibAdapter;
189151

190-
// This index does't have GetAllResults method
191-
// because it's not possible to get all vectors from the index
192-
// It depends on the Hnswlib implementation
193-
// TODO: Consider adding GetAllResults method in the future
194152
struct HnswVectorIndex : public BaseVectorIndex {
195153
HnswVectorIndex(const SchemaField::VectorParams& params, PMR_NS::memory_resource* mr);
196154
~HnswVectorIndex();
@@ -201,6 +159,11 @@ struct HnswVectorIndex : public BaseVectorIndex {
201159
std::vector<std::pair<float, DocId>> Knn(float* target, size_t k, std::optional<size_t> ef,
202160
const std::vector<DocId>& allowed) const;
203161

162+
// TODO: Implement if needed
163+
std::vector<DocId> GetAllDocsWithNonNullValues() const override {
164+
return std::vector<DocId>{};
165+
}
166+
204167
protected:
205168
void AddVector(DocId id, const VectorPtr& vector) override;
206169

src/core/search/search.cc

Lines changed: 18 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -184,17 +184,25 @@ struct BasicSearch {
184184
profile_builder_ = ProfileBuilder{};
185185
}
186186

187+
BaseIndex* GetBaseIndex(string_view field) {
188+
auto index = indices_->GetIndex(field);
189+
if (!index) {
190+
error_ = absl::StrCat("Invalid field: ", field);
191+
return nullptr;
192+
}
193+
return index;
194+
}
195+
187196
// Get casted sub index by field
188197
template <typename T> T* GetIndex(string_view field) {
189198
static_assert(is_base_of_v<BaseIndex, T>);
190199

191-
auto index = indices_->GetIndex(field);
192-
if (!index) {
193-
error_ = absl::StrCat("Invalid field: ", field);
200+
auto base_index = GetBaseIndex(field);
201+
if (!base_index) {
194202
return nullptr;
195203
}
196204

197-
auto* casted_ptr = dynamic_cast<T*>(index);
205+
auto* casted_ptr = dynamic_cast<T*>(base_index);
198206
if (!casted_ptr) {
199207
error_ = absl::StrCat("Wrong access type for field: ", field);
200208
return nullptr;
@@ -246,7 +254,7 @@ struct BasicSearch {
246254
// Efficiently unify multiple sub results with specified logical op
247255
IndexResult UnifyResults(vector<IndexResult>&& sub_results, LogicOp op) {
248256
if (sub_results.empty())
249-
return vector<DocId>{};
257+
return IndexResult{};
250258

251259
// Unifying from smallest to largest is more efficient.
252260
// AND: the result only shrinks, so starting with the smallest is most optimal.
@@ -283,7 +291,7 @@ struct BasicSearch {
283291
}
284292

285293
IndexResult Search(monostate, string_view) {
286-
return vector<DocId>{};
294+
return IndexResult{};
287295
}
288296

289297
IndexResult Search(const AstStarNode& node, string_view active_field) {
@@ -321,26 +329,12 @@ struct BasicSearch {
321329
// Try to get a sort index first, as `@field:*` might imply wanting sortable behavior
322330
BaseSortIndex* sort_index = indices_->GetSortIndex(active_field);
323331
if (sort_index) {
324-
if (auto result = sort_index->GetAllResults()) {
325-
return std::move(*result);
326-
}
327-
}
328-
329-
// If sort index doesn't exist or doesn't support GetAllResults, try regular index
330-
BaseIndex* base_index = indices_->GetIndex(active_field);
331-
if (base_index) {
332-
if (auto result = base_index->GetAllResults()) {
333-
return std::move(*result);
334-
}
332+
return {sort_index->GetAllDocsWithNonNullValues()};
335333
}
336334

337-
// If we get here, neither index could handle the request
338-
if (!base_index && !sort_index) {
339-
error_ = absl::StrCat("Invalid field: ", active_field);
340-
} else {
341-
error_ = absl::StrCat("Wrong access type for field: ", active_field);
342-
}
343-
return IndexResult{};
335+
// If sort index doesn't exist try regular index
336+
BaseIndex* base_index = GetBaseIndex(active_field);
337+
return base_index ? IndexResult{base_index->GetAllDocsWithNonNullValues()} : IndexResult{};
344338
}
345339

346340
IndexResult Search(const AstPrefixNode& node, string_view active_field) {

src/core/search/sort_indices.cc

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ template <typename T> bool SimpleValueSortIndex<T>::ParsedSortValue::IsNullValue
2929
}
3030

3131
template <typename T>
32-
SimpleValueSortIndex<T>::SimpleValueSortIndex(PMR_NS::memory_resource* mr) : values_{mr} {
32+
SimpleValueSortIndex<T>::SimpleValueSortIndex(PMR_NS::memory_resource* mr)
33+
: values_{mr}, null_values_(mr) {
3334
}
3435

3536
template <typename T> SortableValue SimpleValueSortIndex<T>::Lookup(DocId doc) const {
@@ -90,6 +91,23 @@ void SimpleValueSortIndex<T>::Remove(DocId id, const DocumentAccessor& doc,
9091
values_[id] = T{};
9192
}
9293

94+
template <typename T>
95+
std::vector<DocId> SimpleValueSortIndex<T>::GetAllDocsWithNonNullValues() const {
96+
std::vector<DocId> result;
97+
result.reserve(values_.size());
98+
99+
auto empty_value = T{};
100+
for (DocId id = 0; id < values_.size(); ++id) {
101+
if (values_[id] != empty_value) {
102+
result.push_back(id);
103+
}
104+
}
105+
106+
// Result is already sorted by DocId
107+
// Also it has no duplicates
108+
return result;
109+
}
110+
93111
template <typename T> PMR_NS::memory_resource* SimpleValueSortIndex<T>::GetMemRes() const {
94112
return values_.get_allocator().resource();
95113
}

src/core/search/sort_indices.h

Lines changed: 2 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -42,21 +42,7 @@ template <typename T> struct SimpleValueSortIndex : public BaseSortIndex {
4242
void Remove(DocId id, const DocumentAccessor& doc, std::string_view field) override;
4343

4444
// Override GetAllResults to return all documents with non-null values
45-
std::optional<std::vector<DocId>> GetAllResults() const override {
46-
std::vector<DocId> result;
47-
48-
for (DocId id = 0; id < values_.size(); ++id) {
49-
// Check if id is not present in null_values_
50-
// Also need to handle deleted elements - in them T should be empty
51-
// Different types of T have different "empty" values, but we can check
52-
// if this value is the default for the given type
53-
if (!null_values_.contains(id) && !(values_[id] == T{})) {
54-
result.push_back(id);
55-
}
56-
}
57-
58-
return result;
59-
}
45+
std::vector<DocId> GetAllDocsWithNonNullValues() const override;
6046

6147
protected:
6248
virtual ParsedSortValue Get(const DocumentAccessor& doc, std::string_view field_value) = 0;
@@ -65,7 +51,7 @@ template <typename T> struct SimpleValueSortIndex : public BaseSortIndex {
6551

6652
private:
6753
PMR_NS::vector<T> values_;
68-
absl::flat_hash_set<DocId> null_values_;
54+
UniqueDocsList<PMR_NS::polymorphic_allocator<DocId>> null_values_;
6955
};
7056

7157
struct NumericSortIndex : public SimpleValueSortIndex<double> {

0 commit comments

Comments
 (0)