Skip to content

Commit 8e90f7d

Browse files
authored
test(search): add unified benchmarks for prefix/suffix/infix search (#5319)
Fixed: #4984
1 parent 060add0 commit 8e90f7d

File tree

1 file changed

+181
-0
lines changed

1 file changed

+181
-0
lines changed

src/core/search/search_test.cc

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,187 @@ TEST_F(SearchTest, NotImplementedSearchTypes) {
907907
<< "Infix search should return a not implemented error";
908908
}
909909

910+
// Enumeration for different search types
911+
// Wildcard query shapes exercised by the search benchmarks. The numeric values are spelled
// out because they are passed through the benchmark's integer argument list and cast back.
enum class SearchType {
  PREFIX = 0,  // query is "<pattern>*"
  SUFFIX = 1,  // query is "*<pattern>"
  INFIX = 2,   // query is "*<pattern>*"
};
912+
913+
// Helper function to generate content with ASCII characters
914+
// Builds a deterministic, space-separated sequence of `word_count` lowercase ASCII words.
//
// Word i starts at letter ((doc_offset + i) % 26) of the alphabet, is 3 + (i % 5) characters
// long, and continues with consecutive letters, wrapping from 'z' back to 'a'.
// Returns an empty string when `word_count` is 0.
static std::string GenerateWordSequence(size_t word_count, size_t doc_offset = 0) {
  constexpr size_t kAlphabetSize = 26;

  std::string content;
  for (size_t i = 0; i < word_count; ++i) {
    const size_t first_letter = (doc_offset + i) % kAlphabetSize;
    const size_t word_len = 3 + (i % 5);  // Word length 3-7 chars

    if (i > 0)
      content += ' ';

    for (size_t j = 0; j < word_len; ++j) {
      // Wrap on the letter index rather than on the char value: 'z' + 6 is 128, which does
      // not fit in a signed char, so a "c > 'z'" check on the char itself can be skipped and
      // leave a non-ASCII byte in the output.
      content += static_cast<char>('a' + (first_letter + j) % kAlphabetSize);
    }
  }
  return content;
}
934+
935+
// Helper function to generate pattern with variety
936+
// Produces the search pattern of `pattern_len` characters for `search_type`.
//
// Each search type has its own seed letter: 'p' for prefix, 's' for suffix, 'i' for infix.
//  - use_uniform == true:  the seed letter repeated `pattern_len` times; an enum value
//    outside the three known types yields an empty string.
//  - use_uniform == false: letters seed, seed+1, ... seed+9 repeating with a period of 10,
//    wrapping past 'z' back to 'a'; any non-prefix/non-suffix type uses the infix seed.
static std::string GeneratePattern(SearchType search_type, size_t pattern_len, bool use_uniform) {
  if (!use_uniform) {
    // Diverse ASCII pattern
    char seed = 'i';
    if (search_type == SearchType::PREFIX) {
      seed = 'p';
    } else if (search_type == SearchType::SUFFIX) {
      seed = 's';
    }

    std::string diverse;
    for (size_t pos = 0; pos < pattern_len; ++pos) {
      char next = seed + (pos % 10);  // Use variety of chars
      if (next > 'z') {
        next = 'a' + (next - 'z' - 1);
      }
      diverse += next;
    }
    return diverse;
  }

  // Original uniform pattern for comparison
  if (search_type == SearchType::PREFIX) {
    return std::string(pattern_len, 'p');
  }
  if (search_type == SearchType::SUFFIX) {
    return std::string(pattern_len, 's');
  }
  if (search_type == SearchType::INFIX) {
    return std::string(pattern_len, 'i');
  }
  return "";
}
964+
965+
// Shared body of the prefix/suffix/infix search benchmarks.
//
// Benchmark arguments: range(0) = number of documents, range(1) = pattern length,
// range(2) = SearchType as an integer. `use_diverse_pattern` selects between the repeated
// single-letter pattern (false) and the multi-letter pattern (true).
//
// The first half of the documents get the pattern glued onto generated text at the position
// matching the search type; the second half is generated text only.
// NOTE(review): the multi-letter pattern is a run of consecutive letters, as are the generated
// words, so documents in the "no pattern" half may still match it - confirm if that matters.
static void BM_SearchByTypeImpl(benchmark::State& state, bool use_diverse_pattern) {
  const size_t doc_count = state.range(0);
  const size_t pattern_length = state.range(1);
  const SearchType kind = static_cast<SearchType>(state.range(2));

  auto schema = MakeSimpleSchema({{"title", SchemaField::TEXT}});
  FieldIndices indices{schema, kEmptyOptions, PMR_NS::get_default_resource(), nullptr};

  const std::string needle = GeneratePattern(kind, pattern_length, !use_diverse_pattern);

  // Human-readable name used in error messages and in the benchmark label.
  std::string kind_name = "infix";
  if (kind == SearchType::PREFIX) {
    kind_name = "prefix";
  } else if (kind == SearchType::SUFFIX) {
    kind_name = "suffix";
  }

  // Populate the index.
  const size_t docs_with_pattern = doc_count / 2;
  for (size_t doc_id = 0; doc_id < doc_count; doc_id++) {
    std::string text;
    if (doc_id >= docs_with_pattern) {
      // Second half: generated words only, taken from a shifted offset.
      text = GenerateWordSequence(8 + (doc_id % 3), doc_id + 1000);
    } else {
      const std::string filler = GenerateWordSequence(5 + (doc_id % 5), doc_id);
      if (kind == SearchType::PREFIX) {
        text = needle + filler;
      } else if (kind == SearchType::SUFFIX) {
        text = filler + needle;
      } else if (kind == SearchType::INFIX) {
        // Insert the pattern at the midpoint of the generated text instead of adding it as a
        // separate space-delimited word.
        const size_t midpoint = filler.length() / 2;
        text = filler.substr(0, midpoint) + needle + filler.substr(midpoint);
      }
    }
    MockedDocument doc{Map{{"title", text}}};
    indices.Add(doc_id, doc);
  }

  SearchAlgorithm algo{};
  QueryParams params;
  std::string query;

  // Wildcard placement encodes the search type.
  if (kind == SearchType::PREFIX) {
    query = needle + "*";
  } else if (kind == SearchType::SUFFIX) {
    query = "*" + needle;
  } else if (kind == SearchType::INFIX) {
    query = "*" + needle + "*";
  }

  const bool query_ok = algo.Init(query, &params);
  if (!query_ok) {
    state.SkipWithError("Failed to initialize " + kind_name + " search");
    return;
  }

  while (state.KeepRunning()) {
    auto result = algo.Search(&indices);
    benchmark::DoNotOptimize(result);

    // A search error invalidates the measurement, so abandon the run.
    if (!result.error.empty()) {
      state.SkipWithError(kind_name + " search returned error: " + result.error);
      return;
    }
  }

  // Expose the run parameters as counters for later analysis.
  state.counters["docs_total"] = doc_count;
  state.counters["pattern_length"] = pattern_length;
  state.counters["diverse_pattern"] = use_diverse_pattern ? 1 : 0;

  const char* label_suffix = use_diverse_pattern ? "_diverse" : "_uniform";
  state.SetLabel(kind_name + label_suffix);
}
1046+
1047+
// Plain (non-template) wrappers that fix use_diverse_pattern, so each variant can be registered as its own benchmark
1048+
// Benchmark entry point for the uniform variant: runs the shared implementation with
// use_diverse_pattern disabled.
static void BM_SearchByType_Uniform(benchmark::State& state) {
  constexpr bool kUseDiversePattern = false;
  BM_SearchByTypeImpl(state, kUseDiversePattern);
}
1051+
1052+
// Benchmark entry point for the diverse variant: runs the shared implementation with
// use_diverse_pattern enabled.
static void BM_SearchByType_Diverse(benchmark::State& state) {
  constexpr bool kUseDiversePattern = true;
  BM_SearchByTypeImpl(state, kUseDiversePattern);
}
1055+
1056+
// Benchmark to compare all search types - removed 100K docs per romange's suggestion
1057+
// Uniform patterns (original test).
// Registers every combination of search type x {1000, 10000} docs x {3, 5} pattern length.
BENCHMARK(BM_SearchByType_Uniform)
    ->Apply([](auto* bench) {
      // Emission order: search type outermost, then corpus size, then pattern length.
      for (SearchType type : {SearchType::PREFIX, SearchType::SUFFIX, SearchType::INFIX}) {
        for (int64_t docs : {1000, 10000}) {
          for (int64_t pattern_len : {3, 5}) {
            bench->Args({docs, pattern_len, static_cast<int64_t>(type)});
          }
        }
      }
    })
    ->ArgNames({"docs", "pattern_len", "search_type"})
    ->Unit(benchmark::kMicrosecond);
1073+
1074+
// Diverse patterns (new test with ASCII variety).
// Registers every combination of search type x {1000, 10000} docs x {3, 5} pattern length.
BENCHMARK(BM_SearchByType_Diverse)
    ->Apply([](auto* bench) {
      // Emission order: search type outermost, then corpus size, then pattern length.
      for (SearchType type : {SearchType::PREFIX, SearchType::SUFFIX, SearchType::INFIX}) {
        for (int64_t docs : {1000, 10000}) {
          for (int64_t pattern_len : {3, 5}) {
            bench->Args({docs, pattern_len, static_cast<int64_t>(type)});
          }
        }
      }
    })
    ->ArgNames({"docs", "pattern_len", "search_type"})
    ->Unit(benchmark::kMicrosecond);
1090+
9101091
} // namespace search
9111092

9121093
} // namespace dfly

0 commit comments

Comments
 (0)