Skip to content

Commit b6b2247

Browse files
authored
chore: avoid deduplicating docs ids when not needed (#5344)
When an index holds only a single value per document, we do not have to deduplicate doc ids at query time.

Benchmarks

Before the PR:

Benchmark                  Time        CPU   Iterations
------------------------------------------------------------
BM_SearchDocIds/0       7238 ns    7238 ns       388043
BM_SearchDocIds/1      30202 ns   30200 ns        89480
BM_SearchDocIds/2      20986 ns   20984 ns       133450

After:

Benchmark                  Time        CPU   Iterations
------------------------------------------------------------
BM_SearchDocIds/0       7071 ns    7070 ns       406632
BM_SearchDocIds/1      13149 ns   13142 ns       218935
BM_SearchDocIds/2      11870 ns   11868 ns       236922

Signed-off-by: Roman Gershman <[email protected]>
1 parent 6ecbc92 commit b6b2247

File tree

3 files changed

+69
-14
lines changed

3 files changed

+69
-14
lines changed

src/core/search/indices.cc

Lines changed: 38 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,9 @@ bool NumericIndex::Add(DocId id, const DocumentAccessor& doc, string_view field)
8585
return false;
8686
}
8787

88+
if (numbers->size() > 1) {
89+
unique_ids_ = false;
90+
}
8891
for (auto num : numbers.value()) {
8992
entries_.emplace(num, id);
9093
}
@@ -111,22 +114,32 @@ vector<DocId> NumericIndex::Range(double l, double r) const {
111114
out.push_back(it->second);
112115

113116
sort(out.begin(), out.end());
114-
out.erase(unique(out.begin(), out.end()), out.end());
117+
118+
if (!unique_ids_) {
119+
out.erase(unique(out.begin(), out.end()), out.end());
120+
}
115121
return out;
116122
}
117123

118124
vector<DocId> NumericIndex::GetAllDocsWithNonNullValues() const {
119-
UniqueDocsList<> unique_docs;
120125
std::vector<DocId> result;
121126

122-
unique_docs.reserve(entries_.size());
123127
result.reserve(entries_.size());
124128

125-
for (const auto& [_, doc_id] : entries_) {
126-
const auto [__, is_new] = unique_docs.insert(doc_id);
127-
if (is_new) {
129+
if (unique_ids_) {
130+
// If unique_ids_ is true, we can just take the second element of each entry
131+
for (const auto& [_, doc_id] : entries_) {
128132
result.push_back(doc_id);
129133
}
134+
} else {
135+
UniqueDocsList<> unique_docs;
136+
unique_docs.reserve(entries_.size());
137+
for (const auto& [_, doc_id] : entries_) {
138+
const auto [__, is_new] = unique_docs.insert(doc_id);
139+
if (is_new) {
140+
result.push_back(doc_id);
141+
}
142+
}
130143
}
131144

132145
std::sort(result.begin(), result.end());
@@ -181,6 +194,8 @@ bool BaseStringIndex<C>::Add(DocId id, const DocumentAccessor& doc, string_view
181194
for (string_view str : strings_list.value())
182195
tokens.merge(Tokenize(str));
183196

197+
if (tokens.size() > 1)
198+
unique_ids_ = false;
184199
for (string_view token : tokens)
185200
GetOrCreate(token)->Insert(id);
186201
return true;
@@ -215,21 +230,31 @@ template <typename C> vector<string> BaseStringIndex<C>::GetTerms() const {
215230
}
216231

217232
template <typename C> vector<DocId> BaseStringIndex<C>::GetAllDocsWithNonNullValues() const {
218-
UniqueDocsList<> unique_docs;
219233
std::vector<DocId> result;
220234

221-
unique_docs.reserve(entries_.size());
222235
result.reserve(entries_.size());
223236

224-
for (const auto& [_, container] : entries_) {
225-
for (const auto& doc_id : container) {
226-
auto [_, is_new] = unique_docs.insert(doc_id);
227-
if (is_new) {
237+
if (unique_ids_) {
238+
// If unique_ids_ is true, every document contributed at most one token, so no doc id repeats across containers and we can append all ids without deduplication
239+
for (const auto& [_, container] : entries_) {
240+
for (const auto& doc_id : container) {
228241
result.push_back(doc_id);
229242
}
230243
}
231-
}
244+
} else {
245+
UniqueDocsList<> unique_docs;
232246

247+
unique_docs.reserve(entries_.size());
248+
249+
for (const auto& [_, container] : entries_) {
250+
for (const auto& doc_id : container) {
251+
auto [_, is_new] = unique_docs.insert(doc_id);
252+
if (is_new) {
253+
result.push_back(doc_id);
254+
}
255+
}
256+
}
257+
}
233258
std::sort(result.begin(), result.end());
234259
return result;
235260
}

src/core/search/indices.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ struct NumericIndex : public BaseIndex {
3838
std::vector<DocId> GetAllDocsWithNonNullValues() const override;
3939

4040
private:
41+
bool unique_ids_ = true; // If true, docs ids are unique in the index, otherwise they can repeat.
4142
using Entry = std::pair<double, DocId>;
4243
absl::btree_set<Entry, std::less<Entry>, PMR_NS::polymorphic_allocator<Entry>> entries_;
4344
};
@@ -75,6 +76,7 @@ template <typename C> struct BaseStringIndex : public BaseIndex {
7576
Container* GetOrCreate(std::string_view word);
7677

7778
bool case_sensitive_ = false;
79+
bool unique_ids_ = true; // If true, docs ids are unique in the index, otherwise they can repeat.
7880
search::RaxTreeMap<Container> entries_;
7981
};
8082

src/core/search/search_test.cc

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <memory_resource>
1818
#include <random>
1919

20+
#include "absl/base/macros.h"
2021
#include "base/gtest.h"
2122
#include "base/logging.h"
2223
#include "core/search/base.h"
@@ -1088,6 +1089,33 @@ BENCHMARK(BM_SearchByType_Diverse)
10881089
->ArgNames({"docs", "pattern_len", "search_type"})
10891090
->Unit(benchmark::kMicrosecond);
10901091

1091-
} // namespace search
1092+
static void BM_SearchDocIds(benchmark::State& state) {
1093+
auto schema = MakeSimpleSchema({{"score", SchemaField::NUMERIC}, {"tag", SchemaField::TAG}});
1094+
FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};
1095+
1096+
SearchAlgorithm algo;
1097+
QueryParams params;
1098+
default_random_engine rnd;
1099+
const char* tag_vals[] = {"test", "example", "sample", "demo", "demo2"};
1100+
uniform_int_distribution<size_t> tag_dist(0, ABSL_ARRAYSIZE(tag_vals) - 1);
1101+
uniform_int_distribution<size_t> score_dist(0, 100);
1102+
1103+
for (size_t i = 0; i < 1000; i++) {
1104+
MockedDocument doc{
1105+
Map{{"score", std::to_string(score_dist(rnd))}, {"tag", tag_vals[tag_dist(rnd)]}}};
1106+
indices.Add(i, doc);
1107+
}
10921108

1109+
std::string queries[] = {"@tag:{test} @score:[10 50]", "@tag: *", "@score:*"};
1110+
size_t query_type = state.range(0);
1111+
CHECK_LT(query_type, ABSL_ARRAYSIZE(queries));
1112+
CHECK(algo.Init(queries[query_type], &params));
1113+
while (state.KeepRunning()) {
1114+
auto result = algo.Search(&indices);
1115+
CHECK(result.error.empty());
1116+
}
1117+
}
1118+
BENCHMARK(BM_SearchDocIds)->Range(0, 2);
1119+
1120+
} // namespace search
10931121
} // namespace dfly

0 commit comments

Comments
 (0)