Improve s2region_term_indexer:

MBkkt · MBkkt · commit c1e36a4aa1d7 · 2023-01-06T01:35:07.000+01:00
* Add ability to reuse term buffer
* Add option to optimize index size if queries contain only points
diff --git a/src/s2/s2region_term_indexer.cc b/src/s2/s2region_term_indexer.cc
@@ -126,6 +126,14 @@ string S2RegionTermIndexer::GetTerm(TermType term_type, const S2CellId id,
 
 vector<string> S2RegionTermIndexer::GetIndexTerms(const S2Point& point,
                                                   string_view prefix) {
+  vector<string> terms;
+  GetIndexTerms(point, prefix, &terms);
+  return terms;
+}
+
+void S2RegionTermIndexer::GetIndexTerms(const S2Point& point,
+                                        string_view prefix,
+                                        vector<string>* terms) {
   // See the top of this file for an overview of the indexing strategy.
   //
   // The last cell generated by this loop is effectively the covering for
@@ -136,12 +144,13 @@ vector<string> S2RegionTermIndexer::GetIndexTerms(const S2Point& point,
   // max_level() != true_max_level() (see S2RegionCoverer::Options).
 
   const S2CellId id(point);
-  vector<string> terms;
-  for (int level = options_.min_level(); level <= options_.max_level();
-       level += options_.level_mod()) {
-    terms.push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix));
+  int level = options_.min_level();
+  if (options_.query_contains_points_only()) {
+    level = options_.true_max_level();
+  }
+  for (; level <= options_.max_level(); level += options_.level_mod()) {
+    terms->push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix));
   }
-  return terms;
 }
 
 vector<string> S2RegionTermIndexer::GetIndexTerms(const S2Region& region,
@@ -154,6 +163,13 @@ vector<string> S2RegionTermIndexer::GetIndexTerms(const S2Region& region,
 
 vector<string> S2RegionTermIndexer::GetIndexTermsForCanonicalCovering(
     const S2CellUnion& covering, string_view prefix) {
+  vector<string> terms;
+  GetIndexTermsForCanonicalCovering(covering, prefix, &terms);
+  return terms;
+}
+
+void S2RegionTermIndexer::GetIndexTermsForCanonicalCovering(
+    const S2CellUnion& covering, string_view prefix, vector<string>* terms) {
   // See the top of this file for an overview of the indexing strategy.
   //
   // Cells in the covering are normally indexed as covering terms.  If we are
@@ -168,24 +184,29 @@ vector<string> S2RegionTermIndexer::GetIndexTermsForCanonicalCovering(
     *coverer_.mutable_options() = options_;
     S2_CHECK(coverer_.IsCanonical(covering));
   }
-  vector<string> terms;
   S2CellId prev_id = S2CellId::None();
   int true_max_level = options_.true_max_level();
-  for (S2CellId id : covering) {
+  for (const S2CellId id : covering) {
     // IsCanonical() already checks the following conditions, but we repeat
     // them here for documentation purposes.
     int level = id.level();
     S2_DCHECK_GE(level, options_.min_level());
     S2_DCHECK_LE(level, options_.max_level());
     S2_DCHECK_EQ(0, (level - options_.min_level()) % options_.level_mod());
+    // Assumes level <= options_.true_max_level().
 
-    if (level < true_max_level) {
-      // Add a covering term for this cell.
-      terms.push_back(GetTerm(TermType::COVERING, id, prefix));
-    }
-    if (level == true_max_level || !options_.optimize_for_space()) {
-      // Add an ancestor term for this cell at the constrained level.
-      terms.push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix));
+    const bool is_max_level_cell = level == true_max_level;
+    // Add this cell's term; as an optimization, max-level cells use ANCESTOR.
+    terms->push_back(GetTerm(is_max_level_cell ? TermType::ANCESTOR
+                                               : TermType::COVERING,
+                             id, prefix));
+
+    // If queries contain only points, no other terms are needed.
+    if (options_.query_contains_points_only()) continue;
+
+    if (!options_.optimize_for_space() && !is_max_level_cell) {
+      // Add an ancestor term for this cell.
+      terms->push_back(GetTerm(TermType::ANCESTOR, id, prefix));
     }
     // Finally, add ancestor terms for all the ancestors of this cell.
     while ((level -= options_.level_mod()) >= options_.min_level()) {
@@ -194,29 +215,34 @@ vector<string> S2RegionTermIndexer::GetIndexTermsForCanonicalCovering(
           prev_id.parent(level) == ancestor_id) {
         break;  // We have already processed this cell and its ancestors.
       }
-      terms.push_back(GetTerm(TermType::ANCESTOR, ancestor_id, prefix));
+      terms->push_back(GetTerm(TermType::ANCESTOR, ancestor_id, prefix));
     }
     prev_id = id;
   }
-  return terms;
 }
 
 vector<string> S2RegionTermIndexer::GetQueryTerms(const S2Point& point,
                                                   string_view prefix) {
+  vector<string> terms;
+  GetQueryTerms(point, prefix, &terms);
+  return terms;
+}
+
+void S2RegionTermIndexer::GetQueryTerms(const S2Point& point,
+                                        string_view prefix,
+                                        vector<string>* terms) {
   // See the top of this file for an overview of the indexing strategy.
 
   const S2CellId id(point);
-  vector<string> terms;
   // Recall that all true_max_level() cells are indexed only as ancestor terms.
   int level = options_.true_max_level();
-  terms.push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix));
-  if (options_.index_contains_points_only()) return terms;
+  terms->push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix));
+  if (options_.index_contains_points_only()) return;
 
   // Add covering terms for all the ancestor cells.
   for (; level >= options_.min_level(); level -= options_.level_mod()) {
-    terms.push_back(GetTerm(TermType::COVERING, id.parent(level), prefix));
+    terms->push_back(GetTerm(TermType::COVERING, id.parent(level), prefix));
   }
-  return terms;
 }
 
 vector<string> S2RegionTermIndexer::GetQueryTerms(const S2Region& region,
@@ -229,13 +255,20 @@ vector<string> S2RegionTermIndexer::GetQueryTerms(const S2Region& region,
 
 vector<string> S2RegionTermIndexer::GetQueryTermsForCanonicalCovering(
     const S2CellUnion& covering, string_view prefix) {
+  vector<string> terms;
+  GetQueryTermsForCanonicalCovering(covering, prefix, &terms);
+  return terms;
+}
+
+void S2RegionTermIndexer::GetQueryTermsForCanonicalCovering(
+    const S2CellUnion& covering, string_view prefix, vector<string>* terms) {
   // See the top of this file for an overview of the indexing strategy.
 
+  S2_CHECK(!options_.query_contains_points_only());
   if (google::DEBUG_MODE) {
     *coverer_.mutable_options() = options_;
     S2_CHECK(coverer_.IsCanonical(covering));
   }
-  vector<string> terms;
   S2CellId prev_id = S2CellId::None();
   int true_max_level = options_.true_max_level();
   for (S2CellId id : covering) {
@@ -245,18 +278,19 @@ vector<string> S2RegionTermIndexer::GetQueryTermsForCanonicalCovering(
     S2_DCHECK_GE(level, options_.min_level());
     S2_DCHECK_LE(level, options_.max_level());
     S2_DCHECK_EQ(0, (level - options_.min_level()) % options_.level_mod());
+    // Assumes level <= options_.true_max_level().
 
     // Cells in the covering are always queried as ancestor terms.
-    terms.push_back(GetTerm(TermType::ANCESTOR, id, prefix));
+    terms->push_back(GetTerm(TermType::ANCESTOR, id, prefix));
 
     // If the index only contains points, there are no covering terms.
     if (options_.index_contains_points_only()) continue;
 
     // If we are optimizing for index space rather than query time, cells are
     // also queried as covering terms (except for true_max_level() cells,
     // which are indexed and queried as ancestor cells only).
-    if (options_.optimize_for_space() && level < true_max_level) {
-      terms.push_back(GetTerm(TermType::COVERING, id, prefix));
+    if (options_.optimize_for_space() && level != true_max_level) {
+      terms->push_back(GetTerm(TermType::COVERING, id, prefix));
     }
     // Finally, add covering terms for all the ancestors of this cell.
     while ((level -= options_.level_mod()) >= options_.min_level()) {
@@ -265,9 +299,8 @@ vector<string> S2RegionTermIndexer::GetQueryTermsForCanonicalCovering(
           prev_id.parent(level) == ancestor_id) {
         break;  // We have already processed this cell and its ancestors.
       }
-      terms.push_back(GetTerm(TermType::COVERING, ancestor_id, prefix));
+      terms->push_back(GetTerm(TermType::COVERING, ancestor_id, prefix));
     }
     prev_id = id;
   }
-  return terms;
 }
diff --git a/src/s2/s2region_term_indexer.h b/src/s2/s2region_term_indexer.h
@@ -196,8 +196,21 @@ class S2RegionTermIndexer {
     // this flag if your index consists entirely of points.)
     //
     // DEFAULT: false
-    bool index_contains_points_only() const { return points_only_; }
-    void set_index_contains_points_only(bool value) { points_only_ = value; }
+    bool index_contains_points_only() const { return index_points_only_; }
+    void set_index_contains_points_only(bool value) { index_points_only_ = value; }
+
+    // If your queries will only contain points (rather than regions), be sure
+    // to set this flag.  This will generate a smaller and faster index that
+    // is specialized for the points-only case.
+    //
+    // With the default quality settings, this flag reduces the number of
+    // index terms by about a factor of two.  (The improvement gets smaller
+    // as max_cells() is increased, but there is really no reason not to use
+    // this flag if your queries consist entirely of points.)
+    //
+    // DEFAULT: false
+    bool query_contains_points_only() const { return query_points_only_; }
+    void set_query_contains_points_only(bool value) { query_points_only_ = value; }
 
     // If true, the index will be optimized for space rather than for query
     // time.  With the default quality settings, this flag reduces the number
@@ -221,7 +234,8 @@ class S2RegionTermIndexer {
     void set_marker_character(char ch);
 
    private:
-    bool points_only_ = false;
+    bool index_points_only_ = false;
+    bool query_points_only_ = false;
     bool optimize_for_space_ = false;
     std::string marker_ = std::string(1, '$');
   };
@@ -293,6 +307,21 @@ class S2RegionTermIndexer {
   std::vector<std::string> GetQueryTermsForCanonicalCovering(
       const S2CellUnion& covering, absl::string_view prefix);
 
+  // Same as above, but appends to `terms`, so one buffer can be reused across
+  // points or shared by multiple points (a common case: GeoJSON MultiPoint).
+  void GetIndexTerms(const S2Point& point, absl::string_view prefix,
+                     std::vector<std::string>* terms);
+  void GetQueryTerms(const S2Point& point, absl::string_view prefix,
+                     std::vector<std::string>* terms);
+
+  // Same as above, but appends to `terms` (buffer reusable across coverings).
+  void GetIndexTermsForCanonicalCovering(const S2CellUnion &covering,
+                                         absl::string_view prefix,
+                                         std::vector<std::string> *terms);
+  void GetQueryTermsForCanonicalCovering(const S2CellUnion &covering,
+                                         absl::string_view prefix,
+                                         std::vector<std::string> *terms);
+
  private:
   enum TermType { ANCESTOR, COVERING };