diff --git a/src/s2/s2region_term_indexer.cc b/src/s2/s2region_term_indexer.cc index a870d7570..43d298128 100644 --- a/src/s2/s2region_term_indexer.cc +++ b/src/s2/s2region_term_indexer.cc @@ -61,6 +61,12 @@ // never be any document regions larger than the query region. This can // significantly reduce the size of queries. // +// + If the query will contain only points (rather than general regions), then +// we can skip all the ancestor terms mentioned above (except last cell see +// `GetIndexTerms(const S2Point& point...` for details) because there will +// never be any document regions larger than the index region. This can +// significantly reduce the size of index. +// // + If it is more important to optimize index size rather than query speed, // the number of index terms can be reduced by creating ancestor terms only // for the *proper* ancestors of the cells in a document region, and @@ -130,6 +136,14 @@ string S2RegionTermIndexer::GetTerm(TermType term_type, const S2CellId id, vector S2RegionTermIndexer::GetIndexTerms(const S2Point& point, string_view prefix) { + vector terms; + GetIndexTerms(point, prefix, &terms); + return terms; +} + +void S2RegionTermIndexer::GetIndexTerms(const S2Point& point, + string_view prefix, + vector* terms) { // See the top of this file for an overview of the indexing strategy. // // The last cell generated by this loop is effectively the covering for @@ -140,15 +154,16 @@ vector S2RegionTermIndexer::GetIndexTerms(const S2Point& point, // max_level() != true_max_level() (see S2RegionCoverer::Options). const S2CellId id(point); - vector terms; terms.reserve((options_.true_max_level() - options_.min_level()) / options_.level_mod() + 1); - for (int level = options_.min_level(); level <= options_.max_level(); - level += options_.level_mod()) { - terms.push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix)); + int level = options_.min_level(); + if (options_.query_contains_points_only()) { + level = options_.true_max_level(); + } + for (; level <= options_.max_level(); level += options_.level_mod()) { + terms->push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix)); } - return terms; } vector S2RegionTermIndexer::GetIndexTerms(const S2Region& region, @@ -161,6 +176,13 @@ vector S2RegionTermIndexer::GetIndexTerms(const S2Region& region, vector S2RegionTermIndexer::GetIndexTermsForCanonicalCovering( const S2CellUnion& covering, string_view prefix) { + vector terms; + GetIndexTermsForCanonicalCovering(covering, prefix, &terms); + return terms; +} + +void S2RegionTermIndexer::GetIndexTermsForCanonicalCovering( + const S2CellUnion& covering, string_view prefix, vector* terms) { // See the top of this file for an overview of the indexing strategy. // // Cells in the covering are normally indexed as covering terms. If we are @@ -175,7 +197,6 @@ vector S2RegionTermIndexer::GetIndexTermsForCanonicalCovering( *coverer_.mutable_options() = options_; ABSL_CHECK(coverer_.IsCanonical(covering)); } - vector terms; // `covering.size()` is necessary. Double it because we'll probably add // more. This could probably reasonably be even higher. terms.reserve(2 * covering.size()); @@ -188,14 +209,19 @@ vector S2RegionTermIndexer::GetIndexTermsForCanonicalCovering( ABSL_DCHECK_GE(level, options_.min_level()); ABSL_DCHECK_LE(level, options_.max_level()); ABSL_DCHECK_EQ(0, (level - options_.min_level()) % options_.level_mod()); + ABSL_DCHECK_LE(level, options_.true_max_level()); - if (level < true_max_level) { - // Add a covering term for this cell. - terms.push_back(GetTerm(TermType::COVERING, id, prefix)); - } - if (level == true_max_level || !options_.optimize_for_space()) { - // Add an ancestor term for this cell at the constrained level. - terms.push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix)); + const bool is_max_level_cell = level == true_max_level; + // Add a term for this cell, max_level cell ANCESTOR is optimization + terms->push_back(GetTerm(is_max_level_cell ? TermType::ANCESTOR + : TermType::COVERING, + id, prefix)); + + if (options_.query_contains_points_only()) continue; + + if (!options_.optimize_for_space() && !is_max_level_cell) { + // Add an ancestor term for this cell. + terms->push_back(GetTerm(TermType::ANCESTOR, id, prefix)); } // Finally, add ancestor terms for all the ancestors of this cell. while ((level -= options_.level_mod()) >= options_.min_level()) { @@ -204,19 +230,25 @@ vector S2RegionTermIndexer::GetIndexTermsForCanonicalCovering( prev_id.parent(level) == ancestor_id) { break; // We have already processed this cell and its ancestors. } - terms.push_back(GetTerm(TermType::ANCESTOR, ancestor_id, prefix)); + terms->push_back(GetTerm(TermType::ANCESTOR, ancestor_id, prefix)); } prev_id = id; } - return terms; } vector S2RegionTermIndexer::GetQueryTerms(const S2Point& point, string_view prefix) { + vector terms; + GetQueryTerms(point, prefix, &terms); + return terms; +} + +void S2RegionTermIndexer::GetQueryTerms(const S2Point& point, + string_view prefix, + vector* terms) { // See the top of this file for an overview of the indexing strategy. const S2CellId id(point); - vector terms; terms.reserve(options_.index_contains_points_only() ? 1 : ((options_.true_max_level() - options_.min_level()) / @@ -224,14 +256,13 @@ vector S2RegionTermIndexer::GetQueryTerms(const S2Point& point, 2)); // Recall that all true_max_level() cells are indexed only as ancestor terms. int level = options_.true_max_level(); - terms.push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix)); - if (options_.index_contains_points_only()) return terms; + terms->push_back(GetTerm(TermType::ANCESTOR, id.parent(level), prefix)); + if (options_.index_contains_points_only()) return; // Add covering terms for all the ancestor cells. for (; level >= options_.min_level(); level -= options_.level_mod()) { - terms.push_back(GetTerm(TermType::COVERING, id.parent(level), prefix)); + terms->push_back(GetTerm(TermType::COVERING, id.parent(level), prefix)); } - return terms; } vector S2RegionTermIndexer::GetQueryTerms(const S2Region& region, @@ -244,13 +275,20 @@ vector S2RegionTermIndexer::GetQueryTerms(const S2Region& region, vector S2RegionTermIndexer::GetQueryTermsForCanonicalCovering( const S2CellUnion& covering, string_view prefix) { + vector terms; + GetQueryTermsForCanonicalCovering(covering, prefix, &terms); + return terms; +} + +void S2RegionTermIndexer::GetQueryTermsForCanonicalCovering( + const S2CellUnion& covering, string_view prefix, vector* terms) { // See the top of this file for an overview of the indexing strategy. + S2_CHECK(!options_.query_contains_points_only()); if (google::DEBUG_MODE) { *coverer_.mutable_options() = options_; ABSL_CHECK(coverer_.IsCanonical(covering)); } - vector terms; terms.reserve(2 * covering.size()); S2CellId prev_id = S2CellId::None(); int true_max_level = options_.true_max_level(); @@ -261,9 +299,10 @@ vector S2RegionTermIndexer::GetQueryTermsForCanonicalCovering( ABSL_DCHECK_GE(level, options_.min_level()); ABSL_DCHECK_LE(level, options_.max_level()); ABSL_DCHECK_EQ(0, (level - options_.min_level()) % options_.level_mod()); + ABSL_DCHECK_LE(level, options_.true_max_level()); // Cells in the covering are always queried as ancestor terms. - terms.push_back(GetTerm(TermType::ANCESTOR, id, prefix)); + terms->push_back(GetTerm(TermType::ANCESTOR, id, prefix)); // If the index only contains points, there are no covering terms. if (options_.index_contains_points_only()) continue; @@ -271,8 +310,8 @@ vector S2RegionTermIndexer::GetQueryTermsForCanonicalCovering( // If we are optimizing for index space rather than query time, cells are // also queried as covering terms (except for true_max_level() cells, // which are indexed and queried as ancestor cells only). - if (options_.optimize_for_space() && level < true_max_level) { - terms.push_back(GetTerm(TermType::COVERING, id, prefix)); + if (options_.optimize_for_space() && level != true_max_level) { + terms->push_back(GetTerm(TermType::COVERING, id, prefix)); } // Finally, add covering terms for all the ancestors of this cell. while ((level -= options_.level_mod()) >= options_.min_level()) { @@ -281,9 +320,8 @@ vector S2RegionTermIndexer::GetQueryTermsForCanonicalCovering( prev_id.parent(level) == ancestor_id) { break; // We have already processed this cell and its ancestors. } - terms.push_back(GetTerm(TermType::COVERING, ancestor_id, prefix)); + terms->push_back(GetTerm(TermType::COVERING, ancestor_id, prefix)); } prev_id = id; } - return terms; } diff --git a/src/s2/s2region_term_indexer.h b/src/s2/s2region_term_indexer.h index de2c409a1..58ed8916a 100644 --- a/src/s2/s2region_term_indexer.h +++ b/src/s2/s2region_term_indexer.h @@ -198,8 +198,21 @@ class S2RegionTermIndexer { // this flag if your index consists entirely of points.) // // DEFAULT: false - bool index_contains_points_only() const { return points_only_; } - void set_index_contains_points_only(bool value) { points_only_ = value; } + bool index_contains_points_only() const { return index_points_only_; } + void set_index_contains_points_only(bool value) { index_points_only_ = value; } + + // If your query will only contain points (rather than regions), be sure + // to set this flag. This will generate smaller and faster index that + // are specialized for the points-only case. + // + // With the default quality settings, this flag reduces the number of + // index terms by about a factor of two. (The improvement gets smaller + // as max_cells() is increased, but there is really no reason not to use + // this flag if your query consist entirely of points.) + // + // DEFAULT: false + bool query_contains_points_only() const { return query_points_only_; } + void set_query_contains_points_only(bool value) { query_points_only_ = value; } // If true, the index will be optimized for space rather than for query // time. With the default quality settings, this flag reduces the number @@ -223,7 +236,8 @@ class S2RegionTermIndexer { void set_marker_character(char ch); private: - bool points_only_ = false; + bool index_points_only_ = false; + bool query_points_only_ = false; bool optimize_for_space_ = false; char marker_ = '$'; }; @@ -289,6 +303,21 @@ class S2RegionTermIndexer { std::vector GetQueryTermsForCanonicalCovering( const S2CellUnion& covering, absl::string_view prefix); + // Same as above but allows to reuse same buffer for different points or use + // single buffer for multiple points (common case is GeoJson MultiPoint) + void GetIndexTerms(const S2Point& point, absl::string_view prefix, + std::vector* terms); + void GetQueryTerms(const S2Point& point, absl::string_view prefix, + std::vector* terms); + + // Same as above but allows to reuse same buffer for different covering + void GetIndexTermsForCanonicalCovering(const S2CellUnion &covering, + absl::string_view prefix, + std::vector *terms); + void GetQueryTermsForCanonicalCovering(const S2CellUnion &covering, + absl::string_view prefix, + std::vector *terms); + private: enum TermType { ANCESTOR, COVERING }; diff --git a/src/s2/s2region_term_indexer_test.cc b/src/s2/s2region_term_indexer_test.cc index 2aa9f4186..8f8809d89 100644 --- a/src/s2/s2region_term_indexer_test.cc +++ b/src/s2/s2region_term_indexer_test.cc @@ -26,6 +26,7 @@ #include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "absl/flags/flag.h" +#include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "s2/base/commandlineflags.h" @@ -48,7 +49,7 @@ namespace { enum class QueryType { POINT, CAP }; void TestRandomCaps(const S2RegionTermIndexer::Options& options, - QueryType query_type) { + DataType index_type, DataType query_type) { // This function creates an index consisting either of points (if // options.index_contains_points_only() is true) or S2Caps of random size. // It then executes queries consisting of points (if query_type == POINT) @@ -64,7 +65,7 @@ void TestRandomCaps(const S2RegionTermIndexer::Options& options, // of random size (up to a full sphere). S2Cap cap; vector terms; - if (options.index_contains_points_only()) { + if (index_type == DataType::POINT) { cap = S2Cap::FromPoint(S2Testing::RandomPoint()); terms = indexer.GetIndexTerms(cap.center(), ""); } else { @@ -113,62 +114,80 @@ void TestRandomCaps(const S2RegionTermIndexer::Options& options, static_cast(query_terms) / absl::GetFlag(FLAGS_iters)); } -// We run one test case for each combination of space vs. time optimization, -// and indexing regions vs. only points. +using TestCase = std::tuple; + +class S2RegionTermIndexerTest : public testing::TestWithParam { +protected: + void SetUp() override { + index_type = std::get<0>(GetParam()); + query_type = std::get<1>(GetParam()); + options.set_optimize_for_space(std::get<2>(GetParam())); + options.set_index_contains_points_only(std::get<3>(GetParam())); + options.set_query_contains_points_only(std::get<4>(GetParam())); + if (index_type != DataType::POINT && options.index_contains_points_only()) { + GTEST_SKIP() << "Case index_type != DataType::POINT && " + "options.index_contains_points_only() is invalid."; + } + if (query_type != DataType::POINT && options.query_contains_points_only()) { + GTEST_SKIP() << "Case query_type != DataType::POINT && " + "options.query_contains_points_only() is invalid."; + } + } -TEST(S2RegionTermIndexer, IndexRegionsQueryRegionsOptimizeTime) { S2RegionTermIndexer::Options options; - options.set_optimize_for_space(false); // Optimize for time. - options.set_min_level(0); // Use face cells. - options.set_max_level(16); - options.set_max_cells(20); - TestRandomCaps(options, QueryType::CAP); + DataType index_type{}; + DataType query_type{}; +}; + +// We run one test case for each combination: +// index_type: POINT, CAP +// query_type: POINT, CAP +// optimize_for_space: false, true +// index_contains_points_only: false, true +// query_contains_points_only: false, true +INSTANTIATE_TEST_CASE_P( + S2RegionTermIndexerTests, S2RegionTermIndexerTest, + testing::Combine(testing::Values(DataType::POINT, DataType::CAP), + testing::Values(DataType::POINT, DataType::CAP), + testing::Bool(), testing::Bool(), testing::Bool()), + [](const testing::TestParamInfo& info) { + return absl::StrCat( + "Index", std::get<0>(info.param), "Query", std::get<1>(info.param), + "SpaceOpt", std::get<2>(info.param), "IndexOpt", + std::get<3>(info.param), "QueryOpt", std::get<4>(info.param)); + }); + +TEST_P(S2RegionTermIndexerTest, DefaultParametersValues) { + TestRandomCaps(options, index_type, query_type); } -TEST(S2RegionTermIndexer, IndexRegionsQueryPointsOptimizeTime) { - S2RegionTermIndexer::Options options; - options.set_optimize_for_space(false); // Optimize for time. - options.set_min_level(0); // Use face cells. +TEST_P(S2RegionTermIndexerTest, UseFaceCells) { + options.set_min_level(0); options.set_max_level(16); options.set_max_cells(20); - TestRandomCaps(options, QueryType::POINT); + TestRandomCaps(options, index_type, query_type); } -TEST(S2RegionTermIndexer, IndexRegionsQueryRegionsOptimizeTimeWithLevelMod) { - S2RegionTermIndexer::Options options; - options.set_optimize_for_space(false); // Optimize for time. - options.set_min_level(6); // Constrain min/max levels. +TEST_P(S2RegionTermIndexerTest, ConstrainMinMaxLevels) { + options.set_min_level(6); options.set_max_level(12); options.set_level_mod(3); - TestRandomCaps(options, QueryType::CAP); + TestRandomCaps(options, index_type, query_type); } -TEST(S2RegionTermIndexer, IndexRegionsQueryRegionsOptimizeSpace) { - S2RegionTermIndexer::Options options; - options.set_optimize_for_space(true); // Optimize for space. +TEST_P(S2RegionTermIndexerTest, UseLeafCells) { options.set_min_level(4); options.set_max_level(S2CellId::kMaxLevel); // Use leaf cells. options.set_max_cells(8); - TestRandomCaps(options, QueryType::CAP); + TestRandomCaps(options, index_type, query_type); } -TEST(S2RegionTermIndexer, IndexPointsQueryRegionsOptimizeTime) { - S2RegionTermIndexer::Options options; - options.set_optimize_for_space(false); // Optimize for time. - options.set_min_level(0); // Use face cells. +TEST_P(S2RegionTermIndexerTest, UseFaceCellsCustomLevelMode) { + options.set_min_level(0); options.set_max_level(S2CellId::kMaxLevel); options.set_level_mod(2); options.set_max_cells(20); - options.set_index_contains_points_only(true); - TestRandomCaps(options, QueryType::CAP); -} - -TEST(S2RegionTermIndexer, IndexPointsQueryRegionsOptimizeSpace) { - S2RegionTermIndexer::Options options; - options.set_optimize_for_space(true); // Optimize for space. - options.set_index_contains_points_only(true); - // Use default parameter values. - TestRandomCaps(options, QueryType::CAP); + TestRandomCaps(options, index_type, query_type); } TEST(S2RegionTermIndexer, MarkerCharacter) {