Skip to content

Commit 9c6353f

Browse files
authored
[refact](inverted index) use inverted index context for query (apache#58981)
1 parent 11f0a7a commit 9c6353f

39 files changed

+412 -292 lines changed

be/src/olap/comparison_predicate.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,7 @@ class ComparisonPredicateBase final : public ColumnPredicate {
104104
param.query_type = query_type;
105105
param.num_rows = num_rows;
106106
param.roaring = std::make_shared<roaring::Roaring>();
107-
RETURN_IF_ERROR(iterator->read_from_index(&param));
107+
RETURN_IF_ERROR(iterator->read_from_index(segment_v2::IndexParam {&param}));
108108

109109
// mask out null_bitmap, since NULL cmp VALUE will produce NULL
110110
// and be treated as false in WHERE

be/src/olap/inverted_index_parser.h

Lines changed: 22 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -45,18 +45,34 @@ enum class InvertedIndexParserType {
4545

4646
using CharFilterMap = std::map<std::string, std::string>;
4747

48-
struct InvertedIndexCtx {
48+
// Configuration for creating analyzer (SRP: only used during analyzer creation)
49+
// This is typically a stack-allocated temporary object, discarded after use
50+
struct InvertedIndexAnalyzerConfig {
4951
std::string analyzer_name;
50-
InvertedIndexParserType parser_type;
52+
InvertedIndexParserType parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
5153
std::string parser_mode;
52-
std::string support_phrase;
53-
CharFilterMap char_filter_map;
5454
std::string lower_case;
5555
std::string stop_words;
56-
lucene::analysis::Analyzer* analyzer = nullptr;
56+
CharFilterMap char_filter_map;
5757
};
5858

59-
using InvertedIndexCtxSPtr = std::shared_ptr<InvertedIndexCtx>;
59+
// Runtime context for analyzer
60+
// Contains only the fields needed at runtime
61+
struct InvertedIndexAnalyzerCtx {
62+
// Used by execute_column path to determine if tokenization should be skipped
63+
std::string analyzer_name;
64+
InvertedIndexParserType parser_type = InvertedIndexParserType::PARSER_UNKNOWN;
65+
66+
// Used for creating reader and tokenization
67+
CharFilterMap char_filter_map;
68+
lucene::analysis::Analyzer* analyzer = nullptr;
69+
70+
// Helper method: returns true if tokenization should be performed
71+
bool should_tokenize() const {
72+
return !(parser_type == InvertedIndexParserType::PARSER_NONE && analyzer_name.empty());
73+
}
74+
};
75+
using InvertedIndexAnalyzerCtxSPtr = std::shared_ptr<InvertedIndexAnalyzerCtx>;
6076

6177
const std::string INVERTED_INDEX_PARSER_TRUE = "true";
6278
const std::string INVERTED_INDEX_PARSER_FALSE = "false";

be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp

Lines changed: 30 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,8 @@
1717

1818
#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
1919

20+
#include <glog/logging.h>
21+
2022
#include "CLucene.h"
2123
#include "CLucene/analysis/LanguageBasedAnalyzer.h"
2224

@@ -42,14 +44,19 @@
4244
namespace doris::segment_v2::inverted_index {
4345
#include "common/compile_check_begin.h"
4446

45-
ReaderPtr InvertedIndexAnalyzer::create_reader(CharFilterMap& char_filter_map) {
47+
ReaderPtr InvertedIndexAnalyzer::create_reader(const CharFilterMap& char_filter_map) {
4648
ReaderPtr reader = std::make_shared<lucene::util::SStringReader<char>>();
4749
if (!char_filter_map.empty()) {
48-
if (char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE] ==
49-
INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) {
50-
reader = std::make_shared<CharReplaceCharFilter>(
51-
reader, char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN],
52-
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT]);
50+
auto it_type = char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE);
51+
if (it_type != char_filter_map.end() &&
52+
it_type->second == INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) {
53+
auto it_pattern = char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN);
54+
auto it_replacement =
55+
char_filter_map.find(INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT);
56+
if (it_pattern != char_filter_map.end() && it_replacement != char_filter_map.end()) {
57+
reader = std::make_shared<CharReplaceCharFilter>(reader, it_pattern->second,
58+
it_replacement->second);
59+
}
5360
}
5461
}
5562
return reader;
@@ -123,21 +130,19 @@ AnalyzerPtr InvertedIndexAnalyzer::create_builtin_analyzer(InvertedIndexParserTy
123130
return analyzer;
124131
}
125132

126-
std::shared_ptr<lucene::analysis::Analyzer> InvertedIndexAnalyzer::create_analyzer(
127-
const InvertedIndexCtx* inverted_index_ctx) {
128-
const std::string& analyzer_name = inverted_index_ctx->analyzer_name;
133+
AnalyzerPtr InvertedIndexAnalyzer::create_analyzer(const InvertedIndexAnalyzerConfig* config) {
134+
DCHECK(config != nullptr);
135+
const std::string& analyzer_name = config->analyzer_name;
129136
if (analyzer_name.empty()) {
130-
return create_builtin_analyzer(
131-
inverted_index_ctx->parser_type, inverted_index_ctx->parser_mode,
132-
inverted_index_ctx->lower_case, inverted_index_ctx->stop_words);
137+
return create_builtin_analyzer(config->parser_type, config->parser_mode, config->lower_case,
138+
config->stop_words);
133139
}
134140

135141
if (is_builtin_analyzer(analyzer_name)) {
136142
InvertedIndexParserType parser_type =
137143
get_inverted_index_parser_type_from_string(analyzer_name);
138-
return create_builtin_analyzer(parser_type, inverted_index_ctx->parser_mode,
139-
inverted_index_ctx->lower_case,
140-
inverted_index_ctx->stop_words);
144+
return create_builtin_analyzer(parser_type, config->parser_mode, config->lower_case,
145+
config->stop_words);
141146
}
142147

143148
auto* index_policy_mgr = doris::ExecEnv::GetInstance()->index_policy_mgr();
@@ -176,18 +181,16 @@ std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
176181

177182
std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
178183
const std::string& search_str, const std::map<std::string, std::string>& properties) {
179-
InvertedIndexCtxSPtr inverted_index_ctx = std::make_shared<InvertedIndexCtx>(
180-
get_analyzer_name_from_properties(properties),
181-
get_inverted_index_parser_type_from_string(
182-
get_parser_string_from_properties(properties)),
183-
get_parser_mode_string_from_properties(properties),
184-
get_parser_phrase_support_string_from_properties(properties),
185-
get_parser_char_filter_map_from_properties(properties),
186-
get_parser_lowercase_from_properties(properties),
187-
get_parser_stopwords_from_properties(properties));
188-
auto analyzer = create_analyzer(inverted_index_ctx.get());
189-
inverted_index_ctx->analyzer = analyzer.get();
190-
auto reader = create_reader(inverted_index_ctx->char_filter_map);
184+
InvertedIndexAnalyzerConfig config;
185+
config.analyzer_name = get_analyzer_name_from_properties(properties);
186+
config.parser_type = get_inverted_index_parser_type_from_string(
187+
get_parser_string_from_properties(properties));
188+
config.parser_mode = get_parser_mode_string_from_properties(properties);
189+
config.lower_case = get_parser_lowercase_from_properties(properties);
190+
config.stop_words = get_parser_stopwords_from_properties(properties);
191+
config.char_filter_map = get_parser_char_filter_map_from_properties(properties);
192+
auto analyzer = create_analyzer(&config);
193+
auto reader = create_reader(config.char_filter_map);
191194
reader->init(search_str.data(), static_cast<int32_t>(search_str.size()), true);
192195
return get_analyse_result(reader, analyzer.get());
193196
}

be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -41,14 +41,14 @@ using AnalyzerPtr = std::shared_ptr<lucene::analysis::Analyzer>;
4141

4242
class InvertedIndexAnalyzer {
4343
public:
44-
static ReaderPtr create_reader(CharFilterMap& char_filter_map);
44+
static ReaderPtr create_reader(const CharFilterMap& char_filter_map);
4545

4646
static bool is_builtin_analyzer(const std::string& analyzer_name);
4747
static AnalyzerPtr create_builtin_analyzer(InvertedIndexParserType parser_type,
4848
const std::string& parser_mode,
4949
const std::string& lower_case,
5050
const std::string& stop_words);
51-
static AnalyzerPtr create_analyzer(const InvertedIndexCtx* inverted_index_ctx);
51+
static AnalyzerPtr create_analyzer(const InvertedIndexAnalyzerConfig* config);
5252

5353
static std::vector<TermInfo> get_analyse_result(ReaderPtr reader,
5454
lucene::analysis::Analyzer* analyzer);

be/src/olap/rowset/segment_v2/inverted_index_iterator.cpp

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -65,9 +65,10 @@ Status InvertedIndexIterator::read_from_index(const IndexParam& param) {
6565
}
6666
}
6767

68+
// Note: analyzer_ctx is now passed via i_param->analyzer_ctx
6869
auto execute_query = [&]() {
6970
return reader->query(_context, i_param->column_name, i_param->query_value,
70-
i_param->query_type, i_param->roaring);
71+
i_param->query_type, i_param->roaring, i_param->analyzer_ctx);
7172
};
7273

7374
if (runtime_state->query_options().enable_profile) {

be/src/olap/rowset/segment_v2/inverted_index_iterator.h

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717

1818
#pragma once
1919

20+
#include "olap/inverted_index_parser.h"
2021
#include "olap/rowset/segment_v2/index_iterator.h"
2122
#include "olap/rowset/segment_v2/inverted_index_reader.h"
2223

@@ -30,6 +31,10 @@ struct InvertedIndexParam {
3031
uint32_t num_rows;
3132
std::shared_ptr<roaring::Roaring> roaring;
3233
bool skip_try = false;
34+
35+
// Pointer to analyzer context (can be nullptr if not needed)
36+
// Used by FullTextIndexReader for tokenization
37+
const InvertedIndexAnalyzerCtx* analyzer_ctx = nullptr;
3338
};
3439

3540
class InvertedIndexIterator : public IndexIterator {
@@ -39,6 +44,7 @@ class InvertedIndexIterator : public IndexIterator {
3944

4045
void add_reader(InvertedIndexReaderType type, const InvertedIndexReaderPtr& reader);
4146

47+
// Note: analyzer_ctx is now passed via InvertedIndexParam.analyzer_ctx
4248
Status read_from_index(const IndexParam& param) override;
4349

4450
Status read_null_bitmap(InvertedIndexQueryCacheHandle* cache_handle) override;

be/src/olap/rowset/segment_v2/inverted_index_reader.cpp

Lines changed: 16 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -292,7 +292,8 @@ Status FullTextIndexReader::new_iterator(std::unique_ptr<IndexIterator>* iterato
292292
Status FullTextIndexReader::query(const IndexQueryContextPtr& context,
293293
const std::string& column_name, const void* query_value,
294294
InvertedIndexQueryType query_type,
295-
std::shared_ptr<roaring::Roaring>& bit_map) {
295+
std::shared_ptr<roaring::Roaring>& bit_map,
296+
const InvertedIndexAnalyzerCtx* analyzer_ctx) {
296297
SCOPED_RAW_TIMER(&context->stats->inverted_index_query_timer);
297298

298299
std::string search_str = *reinterpret_cast<const std::string*>(query_value);
@@ -313,8 +314,16 @@ Status FullTextIndexReader::query(const IndexQueryContextPtr& context,
313314
query_info);
314315
} else {
315316
SCOPED_RAW_TIMER(&context->stats->inverted_index_analyzer_timer);
316-
query_info.term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result(
317-
search_str, _index_meta.properties());
317+
if (analyzer_ctx != nullptr && analyzer_ctx->analyzer != nullptr) {
318+
auto reader = inverted_index::InvertedIndexAnalyzer::create_reader(
319+
analyzer_ctx->char_filter_map);
320+
reader->init(search_str.data(), static_cast<int32_t>(search_str.size()), true);
321+
query_info.term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result(
322+
reader, analyzer_ctx->analyzer);
323+
} else {
324+
query_info.term_infos = inverted_index::InvertedIndexAnalyzer::get_analyse_result(
325+
search_str, _index_meta.properties());
326+
}
318327
}
319328

320329
if (query_info.term_infos.empty()) {
@@ -394,7 +403,8 @@ Status StringTypeInvertedIndexReader::new_iterator(std::unique_ptr<IndexIterator
394403
Status StringTypeInvertedIndexReader::query(const IndexQueryContextPtr& context,
395404
const std::string& column_name, const void* query_value,
396405
InvertedIndexQueryType query_type,
397-
std::shared_ptr<roaring::Roaring>& bit_map) {
406+
std::shared_ptr<roaring::Roaring>& bit_map,
407+
const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/) {
398408
SCOPED_RAW_TIMER(&context->stats->inverted_index_query_timer);
399409

400410
std::string search_str = *reinterpret_cast<const std::string*>(query_value);
@@ -688,7 +698,8 @@ Status BkdIndexReader::try_query(const IndexQueryContextPtr& context,
688698

689699
Status BkdIndexReader::query(const IndexQueryContextPtr& context, const std::string& column_name,
690700
const void* query_value, InvertedIndexQueryType query_type,
691-
std::shared_ptr<roaring::Roaring>& bit_map) {
701+
std::shared_ptr<roaring::Roaring>& bit_map,
702+
const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/) {
692703
SCOPED_RAW_TIMER(&context->stats->inverted_index_query_timer);
693704

694705
try {

be/src/olap/rowset/segment_v2/inverted_index_reader.h

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -194,7 +194,8 @@ class InvertedIndexReader : public IndexReader {
194194

195195
virtual Status query(const IndexQueryContextPtr& context, const std::string& column_name,
196196
const void* query_value, InvertedIndexQueryType query_type,
197-
std::shared_ptr<roaring::Roaring>& bit_map) = 0;
197+
std::shared_ptr<roaring::Roaring>& bit_map,
198+
const InvertedIndexAnalyzerCtx* analyzer_ctx = nullptr) = 0;
198199
virtual Status try_query(const IndexQueryContextPtr& context, const std::string& column_name,
199200
const void* query_value, InvertedIndexQueryType query_type,
200201
size_t* count) = 0;
@@ -255,7 +256,8 @@ class FullTextIndexReader : public InvertedIndexReader {
255256
Status new_iterator(std::unique_ptr<IndexIterator>* iterator) override;
256257
Status query(const IndexQueryContextPtr& context, const std::string& column_name,
257258
const void* query_value, InvertedIndexQueryType query_type,
258-
std::shared_ptr<roaring::Roaring>& bit_map) override;
259+
std::shared_ptr<roaring::Roaring>& bit_map,
260+
const InvertedIndexAnalyzerCtx* analyzer_ctx = nullptr) override;
259261
Status try_query(const IndexQueryContextPtr& context, const std::string& column_name,
260262
const void* query_value, InvertedIndexQueryType query_type,
261263
size_t* count) override {
@@ -279,7 +281,8 @@ class StringTypeInvertedIndexReader : public InvertedIndexReader {
279281
Status new_iterator(std::unique_ptr<IndexIterator>* iterator) override;
280282
Status query(const IndexQueryContextPtr& context, const std::string& column_name,
281283
const void* query_value, InvertedIndexQueryType query_type,
282-
std::shared_ptr<roaring::Roaring>& bit_map) override;
284+
std::shared_ptr<roaring::Roaring>& bit_map,
285+
const InvertedIndexAnalyzerCtx* analyzer_ctx = nullptr) override;
283286
Status try_query(const IndexQueryContextPtr& context, const std::string& column_name,
284287
const void* query_value, InvertedIndexQueryType query_type,
285288
size_t* count) override {
@@ -338,7 +341,8 @@ class BkdIndexReader : public InvertedIndexReader {
338341
Status new_iterator(std::unique_ptr<IndexIterator>* iterator) override;
339342
Status query(const IndexQueryContextPtr& context, const std::string& column_name,
340343
const void* query_value, InvertedIndexQueryType query_type,
341-
std::shared_ptr<roaring::Roaring>& bit_map) override;
344+
std::shared_ptr<roaring::Roaring>& bit_map,
345+
const InvertedIndexAnalyzerCtx* analyzer_ctx = nullptr) override;
342346
Status try_query(const IndexQueryContextPtr& context, const std::string& column_name,
343347
const void* query_value, InvertedIndexQueryType query_type,
344348
size_t* count) override;

be/src/olap/rowset/segment_v2/inverted_index_writer.cpp

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -191,9 +191,9 @@ Status InvertedIndexColumnWriter<field_type>::create_field(lucene::document::Fie
191191
template <FieldType field_type>
192192
Result<std::shared_ptr<lucene::analysis::Analyzer>>
193193
InvertedIndexColumnWriter<field_type>::create_analyzer(
194-
std::shared_ptr<InvertedIndexCtx>& inverted_index_ctx) {
194+
const InvertedIndexAnalyzerConfig& analyzer_config) {
195195
try {
196-
return inverted_index::InvertedIndexAnalyzer::create_analyzer(inverted_index_ctx.get());
196+
return inverted_index::InvertedIndexAnalyzer::create_analyzer(&analyzer_config);
197197
} catch (CLuceneError& e) {
198198
return ResultError(Status::Error<doris::ErrorCode::INVERTED_INDEX_ANALYZER_ERROR>(
199199
"inverted index create analyzer failed: {}", e.what()));
@@ -205,20 +205,20 @@ InvertedIndexColumnWriter<field_type>::create_analyzer(
205205

206206
template <FieldType field_type>
207207
Status InvertedIndexColumnWriter<field_type>::init_fulltext_index() {
208-
_inverted_index_ctx = std::make_shared<InvertedIndexCtx>(
209-
get_analyzer_name_from_properties(_index_meta->properties()),
210-
get_inverted_index_parser_type_from_string(
211-
get_parser_string_from_properties(_index_meta->properties())),
212-
get_parser_mode_string_from_properties(_index_meta->properties()),
213-
get_parser_phrase_support_string_from_properties(_index_meta->properties()),
214-
get_parser_char_filter_map_from_properties(_index_meta->properties()),
215-
get_parser_lowercase_from_properties<true>(_index_meta->properties()),
216-
get_parser_stopwords_from_properties(_index_meta->properties()));
208+
_analyzer_config.analyzer_name = get_analyzer_name_from_properties(_index_meta->properties());
209+
_analyzer_config.parser_type = get_inverted_index_parser_type_from_string(
210+
get_parser_string_from_properties(_index_meta->properties()));
211+
_analyzer_config.parser_mode =
212+
get_parser_mode_string_from_properties(_index_meta->properties());
213+
_analyzer_config.char_filter_map =
214+
get_parser_char_filter_map_from_properties(_index_meta->properties());
215+
_analyzer_config.lower_case =
216+
get_parser_lowercase_from_properties<true>(_index_meta->properties());
217+
_analyzer_config.stop_words = get_parser_stopwords_from_properties(_index_meta->properties());
217218
RETURN_IF_ERROR(open_index_directory());
218-
_char_string_reader =
219-
DORIS_TRY(create_char_string_reader(_inverted_index_ctx->char_filter_map));
219+
_char_string_reader = DORIS_TRY(create_char_string_reader(_analyzer_config.char_filter_map));
220220
if (_should_analyzer) {
221-
_analyzer = DORIS_TRY(create_analyzer(_inverted_index_ctx));
221+
_analyzer = DORIS_TRY(create_analyzer(_analyzer_config));
222222
}
223223
_similarity = std::make_unique<lucene::search::LengthSimilarity>();
224224
_index_writer = create_index_writer();
@@ -448,7 +448,7 @@ Status InvertedIndexColumnWriter<field_type>::add_array_values(size_t field_size
448448
// stream can not reuse for different field
449449
bool own_token_stream = true;
450450
ReaderPtr char_string_reader = DORIS_TRY(
451-
create_char_string_reader(_inverted_index_ctx->char_filter_map));
451+
create_char_string_reader(_analyzer_config.char_filter_map));
452452
char_string_reader->init(v->get_data(), cast_set<int32_t>(v->get_size()),
453453
false);
454454
ts = _analyzer->tokenStream(new_field->name(), char_string_reader);

0 commit comments

Comments
 (0)