chore: avoid deduplicating docs ids when not needed (#5344)

romange · web-flow · commit b6b22479de20 · 2025-06-22T22:46:11.000+03:00
When an index holds only a single value per document,
we do not have to deduplicate doc ids at query time.

Benchmarks:

Before the PR:

Benchmark                  Time             CPU   Iterations
------------------------------------------------------------
BM_SearchDocIds/0       7238 ns         7238 ns       388043
BM_SearchDocIds/1      30202 ns        30200 ns        89480
BM_SearchDocIds/2      20986 ns        20984 ns       133450

After:

Benchmark                  Time             CPU   Iterations
------------------------------------------------------------
BM_SearchDocIds/0       7071 ns         7070 ns       406632
BM_SearchDocIds/1      13149 ns        13142 ns       218935
BM_SearchDocIds/2      11870 ns        11868 ns       236922

Signed-off-by: Roman Gershman <roman@dragonflydb.io>
diff --git a/src/core/search/indices.cc b/src/core/search/indices.cc
@@ -85,6 +85,9 @@ bool NumericIndex::Add(DocId id, const DocumentAccessor& doc, string_view field)
     return false;
   }
 
+  if (numbers->size() > 1) {
+    unique_ids_ = false;
+  }
   for (auto num : numbers.value()) {
     entries_.emplace(num, id);
   }
@@ -111,22 +114,32 @@ vector<DocId> NumericIndex::Range(double l, double r) const {
     out.push_back(it->second);
 
   sort(out.begin(), out.end());
-  out.erase(unique(out.begin(), out.end()), out.end());
+
+  if (!unique_ids_) {
+    out.erase(unique(out.begin(), out.end()), out.end());
+  }
   return out;
 }
 
 vector<DocId> NumericIndex::GetAllDocsWithNonNullValues() const {
-  UniqueDocsList<> unique_docs;
   std::vector<DocId> result;
 
-  unique_docs.reserve(entries_.size());
   result.reserve(entries_.size());
 
-  for (const auto& [_, doc_id] : entries_) {
-    const auto [__, is_new] = unique_docs.insert(doc_id);
-    if (is_new) {
+  if (unique_ids_) {
+    // If unique_ids_ is true, we can just take the second element of each entry
+    for (const auto& [_, doc_id] : entries_) {
       result.push_back(doc_id);
     }
+  } else {
+    UniqueDocsList<> unique_docs;
+    unique_docs.reserve(entries_.size());
+    for (const auto& [_, doc_id] : entries_) {
+      const auto [__, is_new] = unique_docs.insert(doc_id);
+      if (is_new) {
+        result.push_back(doc_id);
+      }
+    }
   }
 
   std::sort(result.begin(), result.end());
@@ -181,6 +194,8 @@ bool BaseStringIndex<C>::Add(DocId id, const DocumentAccessor& doc, string_view
   for (string_view str : strings_list.value())
     tokens.merge(Tokenize(str));
 
+  if (tokens.size() > 1)
+    unique_ids_ = false;
   for (string_view token : tokens)
     GetOrCreate(token)->Insert(id);
   return true;
@@ -215,21 +230,31 @@ template <typename C> vector<string> BaseStringIndex<C>::GetTerms() const {
 }
 
 template <typename C> vector<DocId> BaseStringIndex<C>::GetAllDocsWithNonNullValues() const {
-  UniqueDocsList<> unique_docs;
   std::vector<DocId> result;
 
-  unique_docs.reserve(entries_.size());
   result.reserve(entries_.size());
 
-  for (const auto& [_, container] : entries_) {
-    for (const auto& doc_id : container) {
-      auto [_, is_new] = unique_docs.insert(doc_id);
-      if (is_new) {
+  if (unique_ids_) {
+    // With unique_ids_ set, no doc was added with more than one token, so no dedup is needed.
+    for (const auto& [_, container] : entries_) {
+      for (const auto& doc_id : container) {
         result.push_back(doc_id);
       }
     }
-  }
+  } else {
+    UniqueDocsList<> unique_docs;
 
+    unique_docs.reserve(entries_.size());
+
+    for (const auto& [_, container] : entries_) {
+      for (const auto& doc_id : container) {
+        auto [_, is_new] = unique_docs.insert(doc_id);
+        if (is_new) {
+          result.push_back(doc_id);
+        }
+      }
+    }
+  }
   std::sort(result.begin(), result.end());
   return result;
 }
diff --git a/src/core/search/indices.h b/src/core/search/indices.h
@@ -38,6 +38,7 @@ struct NumericIndex : public BaseIndex {
   std::vector<DocId> GetAllDocsWithNonNullValues() const override;
 
  private:
+  bool unique_ids_ = true;  // If true, docs ids are unique in the index, otherwise they can repeat.
   using Entry = std::pair<double, DocId>;
   absl::btree_set<Entry, std::less<Entry>, PMR_NS::polymorphic_allocator<Entry>> entries_;
 };
@@ -75,6 +76,7 @@ template <typename C> struct BaseStringIndex : public BaseIndex {
   Container* GetOrCreate(std::string_view word);
 
   bool case_sensitive_ = false;
+  bool unique_ids_ = true;  // If true, docs ids are unique in the index, otherwise they can repeat.
   search::RaxTreeMap<Container> entries_;
 };
 
diff --git a/src/core/search/search_test.cc b/src/core/search/search_test.cc
@@ -17,6 +17,7 @@
 #include <memory_resource>
 #include <random>
 
+#include "absl/base/macros.h"
 #include "base/gtest.h"
 #include "base/logging.h"
 #include "core/search/base.h"
@@ -1088,6 +1089,33 @@ BENCHMARK(BM_SearchByType_Diverse)
     ->ArgNames({"docs", "pattern_len", "search_type"})
     ->Unit(benchmark::kMicrosecond);
 
-}  // namespace search
+static void BM_SearchDocIds(benchmark::State& state) {
+  auto schema = MakeSimpleSchema({{"score", SchemaField::NUMERIC}, {"tag", SchemaField::TAG}});
+  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};
+
+  SearchAlgorithm algo;
+  QueryParams params;
+  default_random_engine rnd;
+  const char* tag_vals[] = {"test", "example", "sample", "demo", "demo2"};
+  uniform_int_distribution<size_t> tag_dist(0, ABSL_ARRAYSIZE(tag_vals) - 1);
+  uniform_int_distribution<size_t> score_dist(0, 100);
+
+  for (size_t i = 0; i < 1000; i++) {
+    MockedDocument doc{
+        Map{{"score", std::to_string(score_dist(rnd))}, {"tag", tag_vals[tag_dist(rnd)]}}};
+    indices.Add(i, doc);
+  }
 
+  std::string queries[] = {"@tag:{test} @score:[10 50]", "@tag: *", "@score:*"};
+  size_t query_type = state.range(0);
+  CHECK_LT(query_type, ABSL_ARRAYSIZE(queries));
+  CHECK(algo.Init(queries[query_type], &params));
+  while (state.KeepRunning()) {
+    auto result = algo.Search(&indices);
+    CHECK(result.error.empty());
+  }
+}
+BENCHMARK(BM_SearchDocIds)->Range(0, 2);
+
+}  // namespace search
 }  // namespace dfly