Skip to content

Commit ebb637b

Browse files
SharafMohamed, davidlion, kirkrodrigues
authored
build: Update to the latest version of log-surgeon. (#1033)
Co-authored-by: SharafMohamed <[email protected]>
Co-authored-by: davidlion <[email protected]>
Co-authored-by: kirkrodrigues <[email protected]>
1 parent d450194 commit ebb637b

File tree

11 files changed

+133
-347
lines changed

11 files changed

+133
-347
lines changed

components/core/src/clp/Grep.cpp

Lines changed: 18 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -502,8 +502,7 @@ std::optional<Query> Grep::process_raw_query(
502502
epochtime_t search_begin_ts,
503503
epochtime_t search_end_ts,
504504
bool ignore_case,
505-
log_surgeon::lexers::ByteLexer& forward_lexer,
506-
log_surgeon::lexers::ByteLexer& reverse_lexer,
505+
log_surgeon::lexers::ByteLexer& lexer,
507506
bool use_heuristic
508507
) {
509508
// Add prefix and suffix '*' to make the search a sub-string match
@@ -546,8 +545,7 @@ std::optional<Query> Grep::process_raw_query(
546545
begin_pos,
547546
end_pos,
548547
is_var,
549-
forward_lexer,
550-
reverse_lexer
548+
lexer
551549
))
552550
{
553551
query_tokens.emplace_back(search_string_for_sub_queries, begin_pos, end_pos, is_var);
@@ -752,8 +750,7 @@ bool Grep::get_bounds_of_next_potential_var(
752750
size_t& begin_pos,
753751
size_t& end_pos,
754752
bool& is_var,
755-
log_surgeon::lexers::ByteLexer& forward_lexer,
756-
log_surgeon::lexers::ByteLexer& reverse_lexer
753+
log_surgeon::lexers::ByteLexer& lexer
757754
) {
758755
size_t const value_length = value.length();
759756
if (end_pos >= value_length) {
@@ -774,7 +771,7 @@ bool Grep::get_bounds_of_next_potential_var(
774771
if (is_escaped) {
775772
is_escaped = false;
776773

777-
if (false == forward_lexer.is_delimiter(c)) {
774+
if (false == lexer.is_delimiter(c)) {
778775
// Found escaped non-delimiter, so reverse the index to retain the escape
779776
// character
780777
--begin_pos;
@@ -788,7 +785,7 @@ bool Grep::get_bounds_of_next_potential_var(
788785
contains_wildcard = true;
789786
break;
790787
}
791-
if (false == forward_lexer.is_delimiter(c)) {
788+
if (false == lexer.is_delimiter(c)) {
792789
break;
793790
}
794791
}
@@ -803,7 +800,7 @@ bool Grep::get_bounds_of_next_potential_var(
803800
if (is_escaped) {
804801
is_escaped = false;
805802

806-
if (forward_lexer.is_delimiter(c)) {
803+
if (lexer.is_delimiter(c)) {
807804
// Found escaped delimiter, so reverse the index to retain the escape character
808805
--end_pos;
809806
break;
@@ -814,7 +811,7 @@ bool Grep::get_bounds_of_next_potential_var(
814811
} else {
815812
if (is_wildcard(c)) {
816813
contains_wildcard = true;
817-
} else if (forward_lexer.is_delimiter(c)) {
814+
} else if (lexer.is_delimiter(c)) {
818815
// Found delimiter that's not also a wildcard
819816
break;
820817
}
@@ -832,7 +829,7 @@ bool Grep::get_bounds_of_next_potential_var(
832829
}
833830
}
834831
SearchToken search_token;
835-
if (has_wildcard_in_middle || (has_prefix_wildcard && has_suffix_wildcard)) {
832+
if (has_wildcard_in_middle || has_prefix_wildcard) {
836833
// DO NOTHING
837834
} else {
838835
StringReader string_reader;
@@ -844,43 +841,22 @@ bool Grep::get_bounds_of_next_potential_var(
844841
// string, should be improved when adding a SearchParser to log_surgeon
845842
string_reader.open(value.substr(begin_pos, end_pos - begin_pos - 1));
846843
parser_input_buffer.read_if_safe(reader_wrapper);
847-
forward_lexer.reset();
848-
forward_lexer.scan_with_wildcard(
849-
parser_input_buffer,
850-
value[end_pos - 1],
851-
search_token
852-
);
853-
} else if (has_prefix_wildcard) { // *text
854-
std::string value_reverse
855-
= value.substr(begin_pos + 1, end_pos - begin_pos - 1);
856-
std::reverse(value_reverse.begin(), value_reverse.end());
857-
string_reader.open(value_reverse);
858-
parser_input_buffer.read_if_safe(reader_wrapper);
859-
reverse_lexer.reset();
860-
reverse_lexer.scan_with_wildcard(
861-
parser_input_buffer,
862-
value[begin_pos],
863-
search_token
864-
);
844+
lexer.reset();
845+
lexer.scan_with_wildcard(parser_input_buffer, value[end_pos - 1], search_token);
865846
} else { // no wildcards
866847
string_reader.open(value.substr(begin_pos, end_pos - begin_pos));
867848
parser_input_buffer.read_if_safe(reader_wrapper);
868-
forward_lexer.reset();
869-
forward_lexer.scan(parser_input_buffer, search_token);
849+
lexer.reset();
850+
auto [err, token] = lexer.scan(parser_input_buffer);
851+
if (log_surgeon::ErrorCode::Success != err) {
852+
return false;
853+
}
854+
search_token = SearchToken{token.value()};
870855
search_token.m_type_ids_set.insert(search_token.m_type_ids_ptr->at(0));
871856
}
872-
// TODO: use a set so its faster
873-
// auto const& set = search_token.m_type_ids_set;
874-
// if (set.find(static_cast<int>(log_surgeon::SymbolID::TokenUncaughtStringID))
875-
// == set.end()
876-
// && set.find(static_cast<int>(log_surgeon::SymbolID::TokenEndID))
877-
// == set.end())
878-
// {
879-
// is_var = true;
880-
// }
881857
auto const& type = search_token.m_type_ids_ptr->at(0);
882-
if (type != static_cast<int>(log_surgeon::SymbolID::TokenUncaughtStringID)
883-
&& type != static_cast<int>(log_surgeon::SymbolID::TokenEndID))
858+
if (type != static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString)
859+
&& type != static_cast<int>(log_surgeon::SymbolId::TokenEnd))
884860
{
885861
is_var = true;
886862
}

components/core/src/clp/Grep.hpp

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,7 @@ class Grep {
3737
* @param search_begin_ts
3838
* @param search_end_ts
3939
* @param ignore_case
40-
* @param forward_lexer DFA for determining if input is in the schema
41-
* @param reverse_lexer DFA for determining if reverse of input is in the schema
40+
* @param lexer DFA for determining if input is in the schema
4241
* @param use_heuristic
4342
* @return Query if it may match a message, std::nullopt otherwise
4443
*/
@@ -48,8 +47,7 @@ class Grep {
4847
epochtime_t search_begin_ts,
4948
epochtime_t search_end_ts,
5049
bool ignore_case,
51-
log_surgeon::lexers::ByteLexer& forward_lexer,
52-
log_surgeon::lexers::ByteLexer& reverse_lexer,
50+
log_surgeon::lexers::ByteLexer& lexer,
5351
bool use_heuristic
5452
);
5553

@@ -76,17 +74,15 @@ class Grep {
7674
* @param begin_pos Begin position of last token, changes to begin position of next token
7775
* @param end_pos End position of last token, changes to end position of next token
7876
* @param is_var Whether the token is definitely a variable
79-
* @param forward_lexer DFA for determining if input is in the schema
80-
* @param reverse_lexer DFA for determining if reverse of input is in the schema
77+
* @param lexer DFA for determining if input is in the schema
8178
* @return true if another potential variable was found, false otherwise
8279
*/
8380
static bool get_bounds_of_next_potential_var(
8481
std::string const& value,
8582
size_t& begin_pos,
8683
size_t& end_pos,
8784
bool& is_var,
88-
log_surgeon::lexers::ByteLexer& forward_lexer,
89-
log_surgeon::lexers::ByteLexer& reverse_lexer
85+
log_surgeon::lexers::ByteLexer& lexer
9086
);
9187
/**
9288
* Marks which sub-queries in each query are relevant to the given file

components/core/src/clp/Utils.cpp

Lines changed: 22 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include <boost/algorithm/string.hpp>
1313
#include <boost/lexical_cast.hpp>
14+
#include <log_surgeon/Constants.hpp>
1415
#include <log_surgeon/SchemaParser.hpp>
1516
#include <spdlog/spdlog.h>
1617
#include <string_utils/string_utils.hpp>
@@ -120,12 +121,8 @@ ErrorCode read_list_of_paths(string const& list_path, vector<string>& paths) {
120121
// TODO: duplicates code in log_surgeon/parser.tpp, should implement a
121122
// SearchParser in log_surgeon instead and use it here. Specifically, initialization of
122123
// lexer.m_symbol_id, contains_delimiter error, and add_rule logic.
123-
void load_lexer_from_file(
124-
std::string const& schema_file_path,
125-
bool reverse,
126-
log_surgeon::lexers::ByteLexer& lexer
127-
) {
128-
log_surgeon::SchemaParser sp;
124+
void
125+
load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::ByteLexer& lexer) {
129126
std::unique_ptr<log_surgeon::SchemaAST> schema_ast
130127
= log_surgeon::SchemaParser::try_schema_file(schema_file_path);
131128
if (!lexer.m_symbol_id.empty()) {
@@ -134,52 +131,52 @@ void load_lexer_from_file(
134131

135132
// cTokenEnd and cTokenUncaughtString never need to be added as a rule to the lexer as they are
136133
// not parsed
137-
lexer.m_symbol_id[log_surgeon::cTokenEnd] = static_cast<int>(log_surgeon::SymbolID::TokenEndID);
134+
lexer.m_symbol_id[log_surgeon::cTokenEnd] = static_cast<int>(log_surgeon::SymbolId::TokenEnd);
138135
lexer.m_symbol_id[log_surgeon::cTokenUncaughtString]
139-
= static_cast<int>(log_surgeon::SymbolID::TokenUncaughtStringID);
136+
= static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString);
140137
// cTokenInt, cTokenFloat, cTokenFirstTimestamp, and cTokenNewlineTimestamp each have unknown
141138
// rule(s) until specified by the user so can't be explicitly added and are done by looping over
142139
// schema_vars (user schema)
143-
lexer.m_symbol_id[log_surgeon::cTokenInt] = static_cast<int>(log_surgeon::SymbolID::TokenIntId);
140+
lexer.m_symbol_id[log_surgeon::cTokenInt] = static_cast<int>(log_surgeon::SymbolId::TokenInt);
144141
lexer.m_symbol_id[log_surgeon::cTokenFloat]
145-
= static_cast<int>(log_surgeon::SymbolID::TokenFloatId);
142+
= static_cast<int>(log_surgeon::SymbolId::TokenFloat);
146143
lexer.m_symbol_id[log_surgeon::cTokenFirstTimestamp]
147-
= static_cast<int>(log_surgeon::SymbolID::TokenFirstTimestampId);
144+
= static_cast<int>(log_surgeon::SymbolId::TokenFirstTimestamp);
148145
lexer.m_symbol_id[log_surgeon::cTokenNewlineTimestamp]
149-
= static_cast<int>(log_surgeon::SymbolID::TokenNewlineTimestampId);
146+
= static_cast<int>(log_surgeon::SymbolId::TokenNewlineTimestamp);
150147
// cTokenNewline is not added in schema_vars and can be explicitly added as '\n' to catch the
151148
// end of non-timestamped log messages
152149
lexer.m_symbol_id[log_surgeon::cTokenNewline]
153-
= static_cast<int>(log_surgeon::SymbolID::TokenNewlineId);
150+
= static_cast<int>(log_surgeon::SymbolId::TokenNewline);
154151

155-
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolID::TokenEndID)] = log_surgeon::cTokenEnd;
156-
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolID::TokenUncaughtStringID)]
152+
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolId::TokenEnd)] = log_surgeon::cTokenEnd;
153+
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString)]
157154
= log_surgeon::cTokenUncaughtString;
158-
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolID::TokenIntId)] = log_surgeon::cTokenInt;
159-
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolID::TokenFloatId)]
155+
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolId::TokenInt)] = log_surgeon::cTokenInt;
156+
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolId::TokenFloat)]
160157
= log_surgeon::cTokenFloat;
161-
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolID::TokenFirstTimestampId)]
158+
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolId::TokenFirstTimestamp)]
162159
= log_surgeon::cTokenFirstTimestamp;
163-
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolID::TokenNewlineTimestampId)]
160+
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolId::TokenNewlineTimestamp)]
164161
= log_surgeon::cTokenNewlineTimestamp;
165-
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolID::TokenNewlineId)]
162+
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolId::TokenNewline)]
166163
= log_surgeon::cTokenNewline;
167164

168165
lexer.add_rule(
169166
lexer.m_symbol_id["newLine"],
170167
std::move(
171168
std::make_unique<log_surgeon::finite_automata::RegexASTLiteral<
172-
log_surgeon::finite_automata::RegexNFAByteState>>(
169+
log_surgeon::finite_automata::ByteNfaState>>(
173170
log_surgeon::finite_automata::RegexASTLiteral<
174-
log_surgeon::finite_automata::RegexNFAByteState>('\n')
171+
log_surgeon::finite_automata::ByteNfaState>('\n')
175172
)
176173
)
177174
);
178175

179176
for (auto const& delimiters_ast : schema_ast->m_delimiters) {
180177
auto* delimiters_ptr = dynamic_cast<log_surgeon::DelimiterStringAST*>(delimiters_ast.get());
181178
if (delimiters_ptr != nullptr) {
182-
lexer.add_delimiters(delimiters_ptr->m_delimiters);
179+
lexer.set_delimiters(delimiters_ptr->m_delimiters);
183180
}
184181
}
185182
vector<uint32_t> delimiters;
@@ -203,7 +200,7 @@ void load_lexer_from_file(
203200
// transform '.' from any-character into any non-delimiter character
204201
rule->m_regex_ptr->remove_delimiters_from_wildcard(delimiters);
205202

206-
bool is_possible_input[log_surgeon::cUnicodeMax] = {false};
203+
std::array<bool, log_surgeon::cSizeOfUnicode> is_possible_input{};
207204
rule->m_regex_ptr->set_possible_inputs_to_true(is_possible_input);
208205
bool contains_delimiter = false;
209206
uint32_t delimiter_name;
@@ -242,10 +239,6 @@ void load_lexer_from_file(
242239
}
243240
lexer.add_rule(lexer.m_symbol_id[rule->m_name], std::move(rule->m_regex_ptr));
244241
}
245-
if (reverse) {
246-
lexer.generate_reverse();
247-
} else {
248-
lexer.generate();
249-
}
242+
lexer.generate();
250243
}
251244
} // namespace clp

components/core/src/clp/Utils.hpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -47,13 +47,11 @@ ErrorCode read_list_of_paths(std::string const& list_path, std::vector<std::stri
4747
/**
4848
* Loads a lexer from a file
4949
* @param schema_file_path
50-
* @param done
51-
* @param forward_lexer_ptr
50+
* @param lexer_ptr
5251
*/
5352
void load_lexer_from_file(
5453
std::string const& schema_file_path,
55-
bool done,
56-
log_surgeon::lexers::ByteLexer& forward_lexer_ptr
54+
log_surgeon::lexers::ByteLexer& lexer_ptr
5755
);
5856
} // namespace clp
5957

0 commit comments

Comments
 (0)