From f8d06b2935b2a870bfcb28ae2d4d2d6b1ccdb042 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 17 Sep 2025 18:08:21 -0400 Subject: [PATCH 001/164] Add dynamic programming to CLP. --- components/core/src/clp/GrepCore.cpp | 201 ++++----- components/core/src/clp/GrepCore.hpp | 405 ++++++++++++++---- .../search/test/test_QueryHandlerImpl.cpp | 1 + .../clp/ffi/ir_stream/search/test/utils.cpp | 1 + components/core/tests/test-GrepCore.cpp | 78 +--- taskfiles/deps/main.yaml | 12 +- 6 files changed, 434 insertions(+), 264 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index 1a4bf499e2..58c8b1d2f9 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -1,35 +1,30 @@ #include "GrepCore.hpp" #include -#include #include #include +#include +#include #include #include #include "ir/parsing.hpp" -#include "LogSurgeonReader.hpp" -#include "QueryToken.hpp" #include "StringReader.hpp" using clp::ir::is_delim; using clp::string_utils::is_alphabet; using clp::string_utils::is_wildcard; +using log_surgeon::SymbolId::TokenFloat; +using log_surgeon::SymbolId::TokenInt; +using log_surgeon::wildcard_query_parser::QueryInterpretation; +using log_surgeon::wildcard_query_parser::StaticQueryToken; +using log_surgeon::wildcard_query_parser::VariableQueryToken; using std::string; +using std::unordered_map; +using std::vector; namespace clp { -namespace { -/** - * Wraps the tokens returned from the log_surgeon lexer, and stores the variable ids of the tokens - * in a search query in a set. This allows for optimized search performance. 
- */ -class SearchToken : public log_surgeon::Token { -public: - std::set m_type_ids_set; -}; -} // namespace - bool GrepCore::get_bounds_of_next_potential_var( string const& value, size_t& begin_pos, @@ -150,124 +145,90 @@ bool GrepCore::get_bounds_of_next_potential_var( return (value_length != begin_pos); } -bool GrepCore::get_bounds_of_next_potential_var( - string const& value, - size_t& begin_pos, - size_t& end_pos, - bool& is_var, - log_surgeon::lexers::ByteLexer& lexer -) { - size_t const value_length = value.length(); - if (end_pos >= value_length) { - return false; +auto GrepCore::get_wildcard_encodable_positions(QueryInterpretation const& interpretation) +-> vector { + vector wildcard_encodable_positions; + for (size_t i{0}; i < interpretation.get_logtype().size(); ++i) { + auto const& token{interpretation.get_logtype()[i]}; + if (std::holds_alternative(token)) { + auto const& variable_token{std::get(token)}; + auto const var_type{variable_token.get_variable_type()}; + bool const is_int{static_cast(TokenInt) == var_type}; + bool const is_float{static_cast(TokenFloat) == var_type}; + if (variable_token.get_contains_wildcard() && (is_int || is_float)) { + wildcard_encodable_positions.push_back(i); + } + } } + return wildcard_encodable_positions; +} - is_var = false; - bool contains_wildcard = false; - while (false == is_var && false == contains_wildcard && begin_pos < value_length) { - // Start search at end of last token - begin_pos = end_pos; - - // Find variable begin or wildcard - bool is_escaped = false; - for (; begin_pos < value_length; ++begin_pos) { - char c = value[begin_pos]; - - if (is_escaped) { - is_escaped = false; - - if (false == lexer.is_delimiter(c)) { - // Found escaped non-delimiter, so reverse the index to retain the escape - // character - --begin_pos; - break; - } - } else if ('\\' == c) { - // Escape character - is_escaped = true; - } else { - if (is_wildcard(c)) { - contains_wildcard = true; - break; - } - if (false == 
lexer.is_delimiter(c)) { - break; - } - } +auto GrepCore::generate_logtype_string( + QueryInterpretation const& interpretation, + unordered_map const& wildcard_mask_map +) -> std::string { + std::string logtype_string; + + // Reserve size for `logtype_string`. + size_t logtype_string_size{0}; + for (auto const& token : interpretation.get_logtype()) { + if (std::holds_alternative(token)) { + auto const& static_token{std::get(token)}; + logtype_string_size += static_token.get_query_substring().size(); + } else { + logtype_string_size++; + } + } + logtype_string.reserve(logtype_string_size); + + // Generete `logtype_string`. + for (size_t i{0}; i < interpretation.get_logtype().size(); ++i) { + auto const& token{interpretation.get_logtype()[i]}; + if (std::holds_alternative(token)) { + logtype_string += std::get(token).get_query_substring(); + continue; } - // Find next delimiter - is_escaped = false; - end_pos = begin_pos; - for (; end_pos < value_length; ++end_pos) { - char c = value[end_pos]; + auto const& var_token{std::get(token)}; + auto const& raw_string{var_token.get_query_substring()}; + auto const var_type{var_token.get_variable_type()}; - if (is_escaped) { - is_escaped = false; + bool const is_int{static_cast(TokenInt) == var_type}; + bool const is_float{static_cast(TokenFloat) == var_type}; - if (lexer.is_delimiter(c)) { - // Found escaped delimiter, so reverse the index to retain the escape character - --end_pos; - break; + if (wildcard_mask_map.contains(i)) { + bool const use_encoded{wildcard_mask_map.at(i)}; + if (use_encoded) { + if (is_int) { + EncodedVariableInterpreter::add_int_var(logtype_string); + } else { + EncodedVariableInterpreter::add_float_var(logtype_string); } - } else if ('\\' == c) { - // Escape character - is_escaped = true; } else { - if (is_wildcard(c)) { - contains_wildcard = true; - } else if (lexer.is_delimiter(c)) { - // Found delimiter that's not also a wildcard - break; - } + 
EncodedVariableInterpreter::add_dict_var(logtype_string); } + continue; } - if (end_pos > begin_pos) { - bool has_prefix_wildcard = ('*' == value[begin_pos]) || ('?' == value[begin_pos]); - bool has_suffix_wildcard = ('*' == value[end_pos - 1]) || ('?' == value[end_pos - 1]); - bool has_wildcard_in_middle = false; - for (size_t i = begin_pos + 1; i < end_pos - 1; ++i) { - if (('*' == value[i] || '?' == value[i]) && value[i - 1] != '\\') { - has_wildcard_in_middle = true; - break; - } - } - SearchToken search_token; - if (has_wildcard_in_middle || has_prefix_wildcard) { - // DO NOTHING - } else { - StringReader string_reader; - LogSurgeonReader reader_wrapper(string_reader); - log_surgeon::ParserInputBuffer parser_input_buffer; - if (has_suffix_wildcard) { // text* - // TODO: creating a string reader, setting it equal to a string, to read it into - // the ParserInputBuffer, seems like a convoluted way to set a string equal to a - // string, should be improved when adding a SearchParser to log_surgeon - string_reader.open(value.substr(begin_pos, end_pos - begin_pos - 1)); - parser_input_buffer.read_if_safe(reader_wrapper); - lexer.reset(); - lexer.scan_with_wildcard(parser_input_buffer, value[end_pos - 1], search_token); - } else { // no wildcards - string_reader.open(value.substr(begin_pos, end_pos - begin_pos)); - parser_input_buffer.read_if_safe(reader_wrapper); - lexer.reset(); - auto [err, token] = lexer.scan(parser_input_buffer); - if (log_surgeon::ErrorCode::Success != err) { - return false; - } - search_token = SearchToken{token.value()}; - search_token.m_type_ids_set.insert(search_token.m_type_ids_ptr->at(0)); - } - auto const& type = search_token.m_type_ids_ptr->at(0); - if (type != static_cast(log_surgeon::SymbolId::TokenUncaughtString) - && type != static_cast(log_surgeon::SymbolId::TokenEnd)) - { - is_var = true; - } - } + encoded_variable_t encoded_var{0}; + if (is_int + && EncodedVariableInterpreter::convert_string_to_representable_integer_var( + 
raw_string, + encoded_var + )) + { + EncodedVariableInterpreter::add_int_var(logtype_string); + } else if (is_float + && EncodedVariableInterpreter::convert_string_to_representable_float_var( + raw_string, + encoded_var + )) + { + EncodedVariableInterpreter::add_float_var(logtype_string); + } else { + EncodedVariableInterpreter::add_dict_var(logtype_string); } } - return (value_length != begin_pos); + return logtype_string; } } // namespace clp diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 042d8f14a1..6ad11114fa 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -6,10 +6,11 @@ #include #include #include +#include #include #include -#include +#include #include #include @@ -70,24 +71,6 @@ class GrepCore { bool& is_var ); - /** - * Returns bounds of next potential variable (either a definite variable or a token with - * wildcards) - * @param value String containing token - * @param begin_pos Begin position of last token, changes to begin position of next token - * @param end_pos End position of last token, changes to end position of next token - * @param is_var Whether the token is definitely a variable - * @param lexer DFA for determining if input is in the schema - * @return true if another potential variable was found, false otherwise - */ - static bool get_bounds_of_next_potential_var( - std::string const& value, - size_t& begin_pos, - size_t& end_pos, - bool& is_var, - log_surgeon::lexers::ByteLexer& lexer - ); - private: // Types enum class SubQueryMatchabilityResult : uint8_t { @@ -143,6 +126,115 @@ class GrepCore { bool ignore_case, SubQuery& sub_query ); + + /** + * Compare all log-surgeon interpretations against the dictionaries to determine the sub queries + * to search for within the archive. + * + * A. For each interpretation we must consider encodable wildcard variables (e.g. (*1)). 
+ * Each such variable introduces a binary choice: + * - 0: treat as a dictionary variable (\d) + * - 1: treat as an encoded variable (\i for integers, \f for floats) + * + * If there are k encodable wildcard variables, then 2^k logtype strings are possible. Each bit + * in the mask corresponds to one variable. + * + * Example: + * Search query: "a *1 *2 b", + * Interpretation (one of many): "a (*1) (*2) b" + * Possible logtypes (for the above interpretation): + * mask 00 -> "a \d \d b" + * mask 01 ->"a \d \f b" + * mask 10 ->"a \i \d b" + * mask 11 ->"a \i \f b" + * + * B. Each candidate combination becomes a useful subquery only if: + * 1. The logtype exists in the logtype dictionary, and + * 2. Each variable is either: + * a) resolvable in the variable dictionary (for dictionary vars), or + * b) encoded (always assumed valid). + * + * Note: Encoded variables are always assumed to exist in the segment. This is a performance + * trade-off: checking the archive would be slower than decompressing. + * + * @tparam LogTypeDictionaryReaderType Logtype dictionary reader type. + * @tparam VariableDictionaryReaderType Variable dictionary reader type. + * @tparam LogTypeDictionaryEntryType Logtype dictionary entry type. + * @param interpretations Log-surgeon's interpretations of the search query. + * @param logtype_dict The logtype dictionary. + * @param var_dict The variable dictionary. + * @param ignore_case Flag indicating if search is case sensitive. + * @param sub_queries Returns the subqueries to compare against CLP's archives. + * @throw std::runtime_error If there are too many candidate combinations. 
+ */ + template < + typename LogTypeDictionaryReaderType, + typename VariableDictionaryReaderType, + typename LogTypeDictionaryEntryType = typename LogTypeDictionaryReaderType::entry_t> + static void generate_schema_sub_queries( + std::set const& + interpretations, + LogTypeDictionaryReaderType const& logtype_dict, + VariableDictionaryReaderType const& var_dict, + bool ignore_case, + std::vector& sub_queries + ); + + /** + * Scans the interpretation and returns the indicies of all encodable * wildcard variables. + * + * An encodable variable is a variable token than: + * - Contains a wildcard (e.g. *1). + * - Is of an encodable type (integer or float). + * + * @param interpretation The `QueryInterpretation` to scan. + * @return A vector of positions of encodabe wildcard variables. + */ + static auto get_wildcard_encodable_positions( + log_surgeon::wildcard_query_parser::QueryInterpretation const& interpretation + ) -> std::vector; + + /** + * Generates a logtype string from an interpretation, applying a mask to determine which + * encodable wildcard positions are treated as encoded vs dictionary variables. + * - 0: Treat as dictionary variable. + * - 1: Treat as an encoded variable. + * + * @param interpretation The interpetation to convert to a logtype string. + * @param wildcard_mask_map A map indicating the state of encodable wildcard variables. + * @return The logtype string corresponding to this combination of encoded variables. + */ + static auto generate_logtype_string( + log_surgeon::wildcard_query_parser::QueryInterpretation const& interpretation, + std::unordered_map const& wildcard_mask_map + ) -> std::string; + + /** + * Process a single variable token for schema subquery generation. + * + * Determines if the variable can be treated as: + * - an encoded variable, + * - a dictionary variable, + * - or requires wildcard dictionary search. + * + * Updates `sub_query` with the appropriate variable encodings. 
+ * + * @tparam VariableDictionaryReaderType Variable dictionary reader type. + * @param variable_token The variable token to process. + * @param var_dict The variable dictionary. + * @param ignore_case If the search is case sensitive. + * @param is_wildcard_mask_encoded If the token is an encodable wildcard and is to be encoded. + * @param sub_query Returns the updated sub query object. + * @return True if the variable is encoded or is in the variable dictionary, false otherwise. + */ + template + static auto process_schema_var_token( + log_surgeon::wildcard_query_parser::VariableQueryToken const& variable_token, + VariableDictionaryReaderType const& var_dict, + bool ignore_case, + bool is_wildcard_mask_encoded, + SubQuery& sub_query + ) -> bool; }; template @@ -156,13 +248,15 @@ std::optional GrepCore::process_raw_query( log_surgeon::lexers::ByteLexer& lexer, bool use_heuristic ) { - // Split search_string into tokens with wildcards - std::vector query_tokens; - size_t begin_pos = 0; - size_t end_pos = 0; - bool is_var; - std::string search_string_for_sub_queries{search_string}; + std::vector sub_queries; if (use_heuristic) { + // Split search_string into tokens with wildcards + std::vector query_tokens; + size_t begin_pos = 0; + size_t end_pos = 0; + bool is_var; + std::string search_string_for_sub_queries{search_string}; + // Replace unescaped '?' wildcards with '*' wildcards since we currently have no support for // generating sub-queries with '?' wildcards. The final wildcard match on the decompressed // message uses the original wildcards, so correctness will be maintained. 
@@ -185,72 +279,79 @@ std::optional GrepCore::process_raw_query( { query_tokens.emplace_back(search_string_for_sub_queries, begin_pos, end_pos, is_var); } - } else { - while (get_bounds_of_next_potential_var( - search_string_for_sub_queries, - begin_pos, - end_pos, - is_var, - lexer - )) - { - query_tokens.emplace_back(search_string_for_sub_queries, begin_pos, end_pos, is_var); + // Get pointers to all ambiguous tokens. Exclude tokens with wildcards in the middle since + // we fall-back to decompression + wildcard matching for those. + std::vector ambiguous_tokens; + for (auto& query_token : query_tokens) { + if (!query_token.has_greedy_wildcard_in_middle() && query_token.is_ambiguous_token()) { + ambiguous_tokens.push_back(&query_token); + } } - } - // Get pointers to all ambiguous tokens. Exclude tokens with wildcards in the middle since we - // fall-back to decompression + wildcard matching for those. - std::vector ambiguous_tokens; - for (auto& query_token : query_tokens) { - if (!query_token.has_greedy_wildcard_in_middle() && query_token.is_ambiguous_token()) { - ambiguous_tokens.push_back(&query_token); - } - } + // Generate a sub-query for each combination of ambiguous tokens + // E.g., if there are two ambiguous tokens each of which could be a logtype or variable, we + // need to create: + // - (token1 as logtype) (token2 as logtype) + // - (token1 as logtype) (token2 as var) + // - (token1 as var) (token2 as logtype) + // - (token1 as var) (token2 as var) + std::string logtype; + bool type_of_one_token_changed = true; + while (type_of_one_token_changed) { + SubQuery sub_query; - // Generate a sub-query for each combination of ambiguous tokens - // E.g., if there are two ambiguous tokens each of which could be a logtype or variable, we need - // to create: - // - (token1 as logtype) (token2 as logtype) - // - (token1 as logtype) (token2 as var) - // - (token1 as var) (token2 as logtype) - // - (token1 as var) (token2 as var) - std::vector sub_queries; - 
std::string logtype; - bool type_of_one_token_changed = true; - while (type_of_one_token_changed) { - SubQuery sub_query; + // Compute logtypes and variables for query + auto matchability = generate_logtypes_and_vars_for_subquery( + logtype_dict, + var_dict, + search_string_for_sub_queries, + query_tokens, + ignore_case, + sub_query + ); + switch (matchability) { + case SubQueryMatchabilityResult::SupercedesAllSubQueries: + // Since other sub-queries will be superceded by this one, we can stop + // processing now + return Query{search_begin_ts, search_end_ts, ignore_case, search_string, {}}; + case SubQueryMatchabilityResult::MayMatch: + sub_queries.push_back(std::move(sub_query)); + break; + case SubQueryMatchabilityResult::WontMatch: + default: + // Do nothing + break; + } - // Compute logtypes and variables for query - auto matchability = generate_logtypes_and_vars_for_subquery( + // Update combination of ambiguous tokens + type_of_one_token_changed = false; + for (auto* ambiguous_token : ambiguous_tokens) { + if (ambiguous_token->change_to_next_possible_type()) { + type_of_one_token_changed = true; + break; + } + } + } + } else { + static bool interpretations_generated{false}; + static std::set interpretations; + + // TODO: This needs to be done for every archive until we have per schema logic. + constexpr bool cExecuteForEveryArchive{true}; + if (cExecuteForEveryArchive || false == interpretations_generated) { + log_surgeon::wildcard_query_parser::Query const query(search_string); + interpretations.clear(); + interpretations = query.get_all_multi_token_interpretations(lexer); + interpretations_generated = true; + } + // Transfrom log-surgeon interpretations into CLP sub-queries. 
+ generate_schema_sub_queries( + interpretations, logtype_dict, var_dict, - search_string_for_sub_queries, - query_tokens, ignore_case, - sub_query + sub_queries ); - switch (matchability) { - case SubQueryMatchabilityResult::SupercedesAllSubQueries: - // Since other sub-queries will be superceded by this one, we can stop processing - // now - return Query{search_begin_ts, search_end_ts, ignore_case, search_string, {}}; - case SubQueryMatchabilityResult::MayMatch: - sub_queries.push_back(std::move(sub_query)); - break; - case SubQueryMatchabilityResult::WontMatch: - default: - // Do nothing - break; - } - - // Update combination of ambiguous tokens - type_of_one_token_changed = false; - for (auto* ambiguous_token : ambiguous_tokens) { - if (ambiguous_token->change_to_next_possible_type()) { - type_of_one_token_changed = true; - break; - } - } } if (sub_queries.empty()) { @@ -422,6 +523,144 @@ GrepCore::SubQueryMatchabilityResult GrepCore::generate_logtypes_and_vars_for_su return SubQueryMatchabilityResult::MayMatch; } + +template < + typename LogTypeDictionaryReaderType, + typename VariableDictionaryReaderType, + typename LogTypeDictionaryEntryType> +void GrepCore::generate_schema_sub_queries( + std::set const& interpretations, + LogTypeDictionaryReaderType const& logtype_dict, + VariableDictionaryReaderType const& var_dict, + bool const ignore_case, + std::vector& sub_queries +) { + for (auto const& interpretation : interpretations) { + auto wildcard_encodable_positions{get_wildcard_encodable_positions(interpretation)}; + if (wildcard_encodable_positions.size() > 32) { + throw std::runtime_error("Too many encodable variables."); + } + size_t const num_combos{static_cast(1) << wildcard_encodable_positions.size()}; + for (size_t mask{0}; mask < num_combos; ++mask) { + std::unordered_map wildcard_mask_map; + for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { + wildcard_mask_map[wildcard_encodable_positions[i]] = mask >> i & 1; + } + + auto const& 
logtype_string{generate_logtype_string(interpretation, wildcard_mask_map)}; + + std::unordered_set logtype_entries; + logtype_dict.get_entries_matching_wildcard_string( + logtype_string, + ignore_case, + logtype_entries + ); + if (logtype_entries.empty()) { + continue; + } + + SubQuery sub_query; + bool has_vars{true}; + for (size_t i{0}; i < interpretation.get_logtype().size(); ++i) { + auto const& token{interpretation.get_logtype()[i]}; + if (std::holds_alternative( + token)) + { + bool is_wildcard_mask_encoded{false}; + if (wildcard_mask_map.contains(i)) { + is_wildcard_mask_encoded = wildcard_mask_map.at(i); + } + + has_vars = process_schema_var_token( + std::get(token), + var_dict, + ignore_case, + is_wildcard_mask_encoded, + sub_query + ); + } + if (false == has_vars) { + break; + } + } + if (false == has_vars) { + continue; + } + + std::unordered_set possible_logtype_ids; + for (auto const* entry : logtype_entries) { + possible_logtype_ids.emplace(entry->get_id()); + } + sub_query.set_possible_logtypes(possible_logtype_ids); + sub_queries.push_back(std::move(sub_query)); + } + } +} + +template +auto GrepCore::process_schema_var_token( + log_surgeon::wildcard_query_parser::VariableQueryToken const& variable_token, + VariableDictionaryReaderType const& var_dict, + bool const ignore_case, + bool const is_wildcard_mask_encoded, + SubQuery& sub_query +) -> bool { + auto const& raw_string{variable_token.get_query_substring()}; + auto const var_has_wildcard{variable_token.get_contains_wildcard()}; + auto const var_type{variable_token.get_variable_type()}; + + bool const is_int{static_cast(log_surgeon::SymbolId::TokenInt) == var_type}; + bool const is_float{static_cast(log_surgeon::SymbolId::TokenFloat) == var_type}; + + if (is_wildcard_mask_encoded) { + sub_query.mark_wildcard_match_required(); + return true; + } + + if (var_has_wildcard) { + return EncodedVariableInterpreter::wildcard_search_dictionary_and_get_encoded_matches( + raw_string, + var_dict, + 
ignore_case, + sub_query + ); + } + + encoded_variable_t encoded_var{}; + if ((is_int + && EncodedVariableInterpreter::convert_string_to_representable_integer_var( + raw_string, + encoded_var + )) + || (is_float + && EncodedVariableInterpreter::convert_string_to_representable_float_var( + raw_string, + encoded_var + ))) + { + sub_query.add_non_dict_var(encoded_var); + return true; + } + + auto entries = var_dict.get_entry_matching_value(raw_string, ignore_case); + if (entries.empty()) { + return false; + } + if (1 == entries.size()) { + auto const entry_id{entries[0]->get_id()}; + sub_query.add_dict_var(EncodedVariableInterpreter::encode_var_dict_id(entry_id), entry_id); + return true; + } + std::unordered_set encoded_vars; + std::unordered_set var_dict_ids; + encoded_vars.reserve(entries.size()); + for (auto const* entry: entries) { + encoded_vars.emplace(EncodedVariableInterpreter::encode_var_dict_id(entry->get_id())); + var_dict_ids.emplace(entry->get_id()); + } + sub_query.add_imprecise_dict_var(encoded_vars, var_dict_ids); + return true; +} } // namespace clp #endif // CLP_GREPCORE_HPP diff --git a/components/core/src/clp/ffi/ir_stream/search/test/test_QueryHandlerImpl.cpp b/components/core/src/clp/ffi/ir_stream/search/test/test_QueryHandlerImpl.cpp index 160b88cd66..f4eb58e6bf 100644 --- a/components/core/src/clp/ffi/ir_stream/search/test/test_QueryHandlerImpl.cpp +++ b/components/core/src/clp/ffi/ir_stream/search/test/test_QueryHandlerImpl.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include "../../../../../clp_s/archive_constants.hpp" diff --git a/components/core/src/clp/ffi/ir_stream/search/test/utils.cpp b/components/core/src/clp/ffi/ir_stream/search/test/utils.cpp index e4cc603bda..f109b59f0e 100644 --- a/components/core/src/clp/ffi/ir_stream/search/test/utils.cpp +++ b/components/core/src/clp/ffi/ir_stream/search/test/utils.cpp @@ -12,6 +12,7 @@ #include #include +#include #include #include 
"../../../../../clp_s/search/ast/Literal.hpp" diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 4f78304068..317d4a73b6 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -1,89 +1,66 @@ #include #include -#include -#include #include "../src/clp/GrepCore.hpp" -#include "../src/clp/Utils.hpp" using clp::GrepCore; -using clp::load_lexer_from_file; -using log_surgeon::DelimiterStringAST; -using log_surgeon::lexers::ByteLexer; -using log_surgeon::ParserAST; -using log_surgeon::SchemaAST; -using log_surgeon::SchemaParser; -using log_surgeon::SchemaVarAST; using std::string; TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var]") { - ByteLexer lexer; - load_lexer_from_file("../tests/test_schema_files/search_schema.txt", lexer); - string str; - size_t begin_pos; - size_t end_pos; - bool is_var; + size_t begin_pos{}; + size_t end_pos{}; + bool is_var{}; // m_end_pos past the end of the string str = ""; begin_pos = string::npos; end_pos = string::npos; - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == false); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); // Empty string str = ""; begin_pos = 0; end_pos = 0; - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == false); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); // No tokens str = "="; begin_pos = 0; end_pos = 0; - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == false); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); // No wildcards str = " MAC address 95: ad ff 95 24 0d ff =-abc- "; begin_pos = 0; end_pos = 0; - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == 
true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("95" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("ad" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("95" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("24" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("0d" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE("ff" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); @@ -91,8 +68,7 @@ TEST_CASE("get_bounds_of_next_potential_var", 
"[get_bounds_of_next_potential_var REQUIRE("-abc-" == str.substr(begin_pos, end_pos - begin_pos)); REQUIRE(true == is_var); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == false); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); REQUIRE(str.length() == begin_pos); // With wildcards @@ -100,33 +76,25 @@ TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var begin_pos = 0; end_pos = 0; - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); - REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "1\\*x"); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); + REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "1"); REQUIRE(is_var == true); - // REQUIRE(is_var == true); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "abc*123"); - REQUIRE(is_var == false); - // REQUIRE(is_var == true); + REQUIRE(is_var == true); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "1.2"); REQUIRE(is_var == true); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); - REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "+394/-"); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); + REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "+394"); REQUIRE(is_var == true); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == true); + 
REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == true); REQUIRE(str.substr(begin_pos, end_pos - begin_pos) == "-*abc-"); REQUIRE(is_var == false); - REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var, lexer) - == false); + REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); } diff --git a/taskfiles/deps/main.yaml b/taskfiles/deps/main.yaml index ee687a5590..3689f6d888 100644 --- a/taskfiles/deps/main.yaml +++ b/taskfiles/deps/main.yaml @@ -250,8 +250,8 @@ tasks: - "-DFMT_DOC=OFF" - "-DFMT_TEST=OFF" LIB_NAME: "{{.G_FMT_LIB_NAME}}" - TARBALL_SHA256: "1250e4cc58bf06ee631567523f48848dc4596133e163f02615c97f78bab6c811" - TARBALL_URL: "https://github.com/fmtlib/fmt/archive/refs/tags/10.2.1.tar.gz" + TARBALL_SHA256: "bc23066d87ab3168f27cef3e97d545fa63314f5c79df5ea444d41d56f962c6af" + TARBALL_URL: "https://github.com/fmtlib/fmt/archive/refs/tags/11.2.0.tar.gz" liblzma: internal: true @@ -317,8 +317,8 @@ tasks: - "-DCMAKE_INSTALL_MESSAGE=LAZY" - "-Dlog_surgeon_BUILD_TESTING=OFF" LIB_NAME: "log_surgeon" - TARBALL_SHA256: "6053f7e26ff21aef0c4cf409502d3abb0cfcf76d8f76786c3bb1bcc03e8f5df2" - TARBALL_URL: "https://github.com/y-scope/log-surgeon/archive/a82ad13.tar.gz" + TARBALL_SHA256: "69a99e0804a52c6b6397c5e7eabecc9bb4915d0145632c66fc63ad13678ff56a" + TARBALL_URL: "https://github.com/y-scope/log-surgeon/archive/a722d07.tar.gz" lz4: internal: true @@ -440,11 +440,11 @@ tasks: - "-DSPDLOG_BUILD_EXAMPLE_HO=OFF" - "-DSPDLOG_FMT_EXTERNAL=ON" LIB_NAME: "spdlog" - TARBALL_SHA256: "1586508029a7d0670dfcb2d97575dcdc242d3868a259742b69f100801ab4e16b" + TARBALL_SHA256: "15a04e69c222eb6c01094b5c7ff8a249b36bb22788d72519646fb85feb267e67" # NOTE: Since spdlog depends on fmt, we need to choose a version of spdlog that's # compatible with the version of fmt we use. 
- TARBALL_URL: "https://github.com/gabime/spdlog/archive/refs/tags/v1.14.1.tar.gz" + TARBALL_URL: "https://github.com/gabime/spdlog/archive/refs/tags/v1.15.3.tar.gz" sqlite3: internal: true From add11a20b2df99d73f8bd2abefecf57b1da1d34e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 19 Sep 2025 10:28:50 -0400 Subject: [PATCH 002/164] Format. --- components/core/src/clp/GrepCore.cpp | 2 +- components/core/src/clp/GrepCore.hpp | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index 58c8b1d2f9..bda3b04264 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -146,7 +146,7 @@ bool GrepCore::get_bounds_of_next_potential_var( } auto GrepCore::get_wildcard_encodable_positions(QueryInterpretation const& interpretation) --> vector { + -> vector { vector wildcard_encodable_positions; for (size_t i{0}; i < interpretation.get_logtype().size(); ++i) { auto const& token{interpretation.get_logtype()[i]}; diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 6ad11114fa..3511ee940f 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -136,8 +136,8 @@ class GrepCore { * - 0: treat as a dictionary variable (\d) * - 1: treat as an encoded variable (\i for integers, \f for floats) * - * If there are k encodable wildcard variables, then 2^k logtype strings are possible. Each bit - * in the mask corresponds to one variable. + * If there are k encodable wildcard variables, then 2^k logtype strings are possible. Each + * bit in the mask corresponds to one variable. 
* * Example: * Search query: "a *1 *2 b", @@ -564,7 +564,8 @@ void GrepCore::generate_schema_sub_queries( for (size_t i{0}; i < interpretation.get_logtype().size(); ++i) { auto const& token{interpretation.get_logtype()[i]}; if (std::holds_alternative( - token)) + token + )) { bool is_wildcard_mask_encoded{false}; if (wildcard_mask_map.contains(i)) { @@ -654,7 +655,7 @@ auto GrepCore::process_schema_var_token( std::unordered_set encoded_vars; std::unordered_set var_dict_ids; encoded_vars.reserve(entries.size()); - for (auto const* entry: entries) { + for (auto const* entry : entries) { encoded_vars.emplace(EncodedVariableInterpreter::encode_var_dict_id(entry->get_id())); var_dict_ids.emplace(entry->get_id()); } From 70f357b7e2fa3848f7673f2764b9c5abd198531f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 26 Sep 2025 10:31:14 -0400 Subject: [PATCH 003/164] Add unit-tests. --- components/core/src/clp/GrepCore.cpp | 6 +- components/core/src/clp/GrepCore.hpp | 14 +- components/core/src/clp/Query.hpp | 2 +- components/core/tests/test-GrepCore.cpp | 924 ++++++++++++++++++++++++ 4 files changed, 937 insertions(+), 9 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index bda3b04264..267bd67cef 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -149,7 +149,7 @@ auto GrepCore::get_wildcard_encodable_positions(QueryInterpretation const& inter -> vector { vector wildcard_encodable_positions; for (size_t i{0}; i < interpretation.get_logtype().size(); ++i) { - auto const& token{interpretation.get_logtype()[i]}; + auto const token{interpretation.get_logtype()[i]}; if (std::holds_alternative(token)) { auto const& variable_token{std::get(token)}; auto const var_type{variable_token.get_variable_type()}; @@ -171,7 +171,7 @@ auto GrepCore::generate_logtype_string( // Reserve size for `logtype_string`. 
size_t logtype_string_size{0}; - for (auto const& token : interpretation.get_logtype()) { + for (auto const token : interpretation.get_logtype()) { if (std::holds_alternative(token)) { auto const& static_token{std::get(token)}; logtype_string_size += static_token.get_query_substring().size(); @@ -183,7 +183,7 @@ auto GrepCore::generate_logtype_string( // Generete `logtype_string`. for (size_t i{0}; i < interpretation.get_logtype().size(); ++i) { - auto const& token{interpretation.get_logtype()[i]}; + auto const token{interpretation.get_logtype()[i]}; if (std::holds_alternative(token)) { logtype_string += std::get(token).get_query_substring(); continue; diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 9e8ede4781..8bf38d0d32 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -24,7 +24,11 @@ #include "VariableDictionaryReaderReq.hpp" namespace clp { +class GrepCoreTest; + class GrepCore { +friend class GrepCoreTest; + public: // Methods /** @@ -181,7 +185,7 @@ class GrepCore { ); /** - * Scans the interpretation and returns the indicies of all encodable * wildcard variables. + * Scans the interpretation and returns the indicies of all encodable wildcard variables. * * An encodable variable is a variable token than: * - Contains a wildcard (e.g. *1). @@ -227,7 +231,7 @@ class GrepCore { * @param sub_query Returns the updated sub query object. * @return True if the variable is encoded or is in the variable dictionary, false otherwise. 
*/ - template + template static auto process_schema_var_token( log_surgeon::wildcard_query_parser::VariableQueryToken const& variable_token, VariableDictionaryReaderType const& var_dict, @@ -547,7 +551,7 @@ void GrepCore::generate_schema_sub_queries( wildcard_mask_map[wildcard_encodable_positions[i]] = mask >> i & 1; } - auto const& logtype_string{generate_logtype_string(interpretation, wildcard_mask_map)}; + auto logtype_string{generate_logtype_string(interpretation, wildcard_mask_map)}; std::unordered_set logtype_entries; logtype_dict.get_entries_matching_wildcard_string( @@ -562,7 +566,7 @@ void GrepCore::generate_schema_sub_queries( SubQuery sub_query; bool has_vars{true}; for (size_t i{0}; i < interpretation.get_logtype().size(); ++i) { - auto const& token{interpretation.get_logtype()[i]}; + auto const token{interpretation.get_logtype()[i]}; if (std::holds_alternative( token )) @@ -598,7 +602,7 @@ void GrepCore::generate_schema_sub_queries( } } -template +template auto GrepCore::process_schema_var_token( log_surgeon::wildcard_query_parser::VariableQueryToken const& variable_token, VariableDictionaryReaderType const& var_dict, diff --git a/components/core/src/clp/Query.hpp b/components/core/src/clp/Query.hpp index f28989096b..021868e2ec 100644 --- a/components/core/src/clp/Query.hpp +++ b/components/core/src/clp/Query.hpp @@ -155,7 +155,7 @@ class SubQuery { std::unordered_set m_possible_logtypes; std::set m_ids_of_matching_segments; std::vector m_vars; - bool m_wildcard_match_required; + bool m_wildcard_match_required{false}; }; /** diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index fc4fcc8fd7..40ee4b5f26 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -1,13 +1,937 @@ +#include +#include #include +#include #include +#include +#include +#include +#include +#include #include +#include +#include +#include + +#include "../src/clp/Defs.h" +#include 
"../src/clp/EncodedVariableInterpreter.hpp" #include "../src/clp/GrepCore.hpp" +#include "../src/clp/LogTypeDictionaryReaderReq.hpp" +#include "../src/clp/Query.hpp" +#include "../src/clp/string_utils/string_utils.hpp" +#include "../src/clp/VariableDictionaryReaderReq.hpp" +using clp::EncodedVariableInterpreter; using clp::GrepCore; +using clp::LogTypeDictionaryReaderReq; +using clp::string_utils::wildcard_match_unsafe_case_sensitive; +using clp::SubQuery; +using clp::variable_dictionary_id_t; +using clp::VariableDictionaryReaderReq; +using log_surgeon::lexers::ByteLexer; +using log_surgeon::SymbolId::TokenFloat; +using log_surgeon::SymbolId::TokenInt; +using log_surgeon::wildcard_query_parser::QueryInterpretation; +using log_surgeon::wildcard_query_parser::VariableQueryToken; +using std::make_unique; +using std::set; using std::string; +using std::string_view; +using std::unique_ptr; +using std::unordered_map; +using std::unordered_set; +using std::vector; + +class clp::GrepCoreTest { +public: + static auto get_wildcard_encodable_positions( + QueryInterpretation const& interpretation + ) -> vector { + return GrepCore::get_wildcard_encodable_positions(interpretation); + } + + static auto generate_logtype_string( + QueryInterpretation const& interpretation, + unordered_map const& wildcard_mask_map + ) -> string { + return GrepCore::generate_logtype_string(interpretation, wildcard_mask_map); + } + + template + static auto process_token( + VariableQueryToken const& var_token, + VariableDictionaryReaderType const& var_dict, + SubQuery& sub_query + ) -> bool { + return GrepCore::process_schema_var_token( + var_token, + var_dict, + false, + false, + sub_query + ); + } + + template + static auto process_encoded_token( + VariableQueryToken const& var_token, + VariableDictionaryReaderType const& var_dict, + SubQuery& sub_query + ) -> bool { + return GrepCore::process_schema_var_token( + var_token, + var_dict, + false, + true, + sub_query + ); + } + + template < + 
LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, + VariableDictionaryReaderReq VariableDictionaryReaderType> + static void generate_schema_sub_queries( + std::set const& interpretations, + LogTypeDictionaryReaderType const& logtype_dict, + VariableDictionaryReaderType const& var_dict, + std::vector& sub_queries + ) { + GrepCore::generate_schema_sub_queries( + interpretations, + logtype_dict, + var_dict, + false, + sub_queries + ); + } +}; + +namespace { +class FakeVarEntry { +public: + explicit FakeVarEntry(variable_dictionary_id_t const id, string value) : + m_id{id}, + m_value{value} {} + + [[nodiscard]] auto get_id() const -> variable_dictionary_id_t {return m_id;} + + [[nodiscard]] auto get_value() const -> string const& {return m_value;} + +private: + variable_dictionary_id_t m_id; + string m_value; +}; + +class FakeVarDict { +public: + using Entry = FakeVarEntry; + using dictionary_id_t = variable_dictionary_id_t; + + auto add_entry(dictionary_id_t const id, string value) -> void{ + m_storage.emplace(id, Entry{id, std::move(value)}); + } + + [[nodiscard]] auto get_value(dictionary_id_t const id) const -> string const& { + static const string empty{}; + if (m_storage.contains(id)) { + return m_storage.at(id).get_value(); + } + return empty; + } + + auto get_entry_matching_value( + string_view const val, + bool ignore_case + ) const -> vector { + vector results; + for(auto const& [id, entry] : m_storage) { + if (val == entry.get_value()) { + results.push_back(&entry); + } + } + return results; + } + + auto get_entries_matching_wildcard_string( + string_view const val, + bool ignore_case, + unordered_set& results + ) const -> void { + for(auto const& [id, entry] : m_storage) { + if (wildcard_match_unsafe_case_sensitive(entry.get_value(), val)) { + results.insert(&entry); + } + } + } + +private: + unordered_map m_storage; +}; + +class FakeLogTypeEntry { +public: + FakeLogTypeEntry(string const value, clp::logtype_dictionary_id_t const id) : + 
m_value(value), + m_id(id) {} + + auto clear() -> void { + m_value.clear(); + } + + auto reserve_constant_length(size_t length) -> void { + m_value.reserve(length); + } + + auto parse_next_var(string_view msg, size_t begin, size_t end, string_view& parsed) -> bool { + return false; + } + + auto add_constant(string_view const msg, size_t const begin_pos, size_t const length) -> void { + m_value.append(msg.substr(begin_pos, length)); + } + + auto add_int_var() -> void { + EncodedVariableInterpreter::add_int_var(m_value); + } + + auto add_float_var() -> void { + EncodedVariableInterpreter::add_float_var(m_value); + } + + auto add_dictionary_var() -> void { + EncodedVariableInterpreter::add_dict_var(m_value); + } + + [[nodiscard]] auto get_value() const -> string const& { + return m_value; + } + + [[nodiscard]] auto get_num_variables() const -> size_t { + return 0; + } + + [[nodiscard]] auto get_num_placeholders() const -> size_t { + return 0; + } + + [[nodiscard]] auto get_placeholder_info(size_t idx, auto& ref) const -> size_t { + return SIZE_MAX; + } + + [[nodiscard]] auto get_id() const -> clp::logtype_dictionary_id_t { + return m_id; + } + +private: + string m_value; + clp::logtype_dictionary_id_t m_id{0}; +}; + + +class FakeLogTypeDict{ +public: + using Entry = FakeLogTypeEntry; + using dictionary_id_t = clp::logtype_dictionary_id_t; + + auto add_entry(string const& value, dictionary_id_t id) -> void { + m_storage.emplace_back(value, id); + } + + auto get_entry_matching_value( + string_view const logtype, + bool ignore_case + ) const -> vector { + vector results; + for(auto const& entry : m_storage) { + if (logtype == entry.get_value()) { + results.push_back(&entry); + } + } + return results; + } + + auto get_entries_matching_wildcard_string( + string_view const logtype, + bool ignore_case, + unordered_set& results + ) const -> void { + for(auto const& entry : m_storage) { + if (wildcard_match_unsafe_case_sensitive(entry.get_value(), logtype)) { + 
results.insert(&entry); + } + } + } + +private: + vector m_storage; +}; +} // namespace + +// Tests: `get_wildcard_encodable_positions` +TEST_CASE("get_wildcard_encodable_positions_for_empty_interpretation", "[dfa_search]") { + QueryInterpretation const interpretation{}; + + auto const positions{clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation)}; + REQUIRE(positions.empty()); +} + +TEST_CASE("get_wildcard_encodable_positions_for_multi_variable_interpretation", "[dfa_search]") { + constexpr uint32_t cHasNumberId{100}; + + QueryInterpretation interpretation{}; + interpretation.append_static_token("static_text"); + interpretation.append_variable_token(static_cast(TokenInt), "100", false); + interpretation.append_variable_token(static_cast(TokenFloat), "32.2", false); + interpretation.append_variable_token(static_cast(TokenInt), "10?", true); + interpretation.append_variable_token(static_cast(TokenFloat), "3.14*", true); + interpretation.append_variable_token(cHasNumberId, "3.14*", true); + + auto const positions{clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation)}; + REQUIRE(2 == positions.size()); + REQUIRE(3 == positions[0]); + REQUIRE(4 == positions[1]); +} + + +// Tests: `generate_logtype_string` +TEST_CASE("generate_logtype_string_for_empty_interpretation", "[dfa_search]") { + QueryInterpretation const interpretation{}; + + auto const wildcard_encodable_positions{ + clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) + }; + size_t const num_combos{static_cast(1) << wildcard_encodable_positions.size()}; + REQUIRE(1 == num_combos); + for (size_t mask{0}; mask < num_combos; ++mask) { + std::unordered_map wildcard_mask_map; + for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { + wildcard_mask_map[wildcard_encodable_positions[i]] = mask >> i & 1; + } + auto logtype_string{ + clp::GrepCoreTest::generate_logtype_string(interpretation, wildcard_mask_map) + }; + REQUIRE("" == logtype_string); + } +} + 
+TEST_CASE("generate_logtype_string_for_single_variable_interpretation", "[dfa_search]") { + string expected_logtype_string; + EncodedVariableInterpreter::add_int_var(expected_logtype_string); + + QueryInterpretation interpretation{}; + interpretation.append_variable_token(static_cast(TokenInt), "100", false); + + auto const wildcard_encodable_positions{ + clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) + }; + size_t const num_combos{static_cast(1) << wildcard_encodable_positions.size()}; + REQUIRE(1 == num_combos); + + std::unordered_map const wildcard_mask_map{false}; + auto logtype_string{ + clp::GrepCoreTest::generate_logtype_string(interpretation, wildcard_mask_map) + }; + REQUIRE(expected_logtype_string == logtype_string); +} + +TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_search]") { + constexpr uint32_t cHasNumberId{100}; + + vector expected_logtype_strings; + expected_logtype_strings.push_back("static_text"); + EncodedVariableInterpreter::add_int_var(expected_logtype_strings.back()); + EncodedVariableInterpreter::add_float_var(expected_logtype_strings.back()); + EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); + EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); + EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); + + expected_logtype_strings.push_back("static_text"); + EncodedVariableInterpreter::add_int_var(expected_logtype_strings.back()); + EncodedVariableInterpreter::add_float_var(expected_logtype_strings.back()); + EncodedVariableInterpreter::add_int_var(expected_logtype_strings.back()); + EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); + EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); + + expected_logtype_strings.push_back("static_text"); + EncodedVariableInterpreter::add_int_var(expected_logtype_strings.back()); + 
EncodedVariableInterpreter::add_float_var(expected_logtype_strings.back()); + EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); + EncodedVariableInterpreter::add_float_var(expected_logtype_strings.back()); + EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); + + expected_logtype_strings.push_back("static_text"); + EncodedVariableInterpreter::add_int_var(expected_logtype_strings.back()); + EncodedVariableInterpreter::add_float_var(expected_logtype_strings.back()); + EncodedVariableInterpreter::add_int_var(expected_logtype_strings.back()); + EncodedVariableInterpreter::add_float_var(expected_logtype_strings.back()); + EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); + + QueryInterpretation interpretation{}; + interpretation.append_static_token("static_text"); + interpretation.append_variable_token(static_cast(TokenInt), "100", false); + interpretation.append_variable_token(static_cast(TokenFloat), "32.2", false); + interpretation.append_variable_token(static_cast(TokenInt), "10?", true); + interpretation.append_variable_token(static_cast(TokenFloat), "3.14*", true); + interpretation.append_variable_token(cHasNumberId, "3.14*", true); + + auto const wildcard_encodable_positions{ + clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) + }; + + size_t const num_combos{static_cast(1) << wildcard_encodable_positions.size()}; + REQUIRE(num_combos == 4); + for (size_t mask{0}; mask < num_combos; ++mask) { + unordered_map wildcard_mask_map; + for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { + wildcard_mask_map[wildcard_encodable_positions[i]] = mask >> i & 1; + } + auto logtype_string{ + clp::GrepCoreTest::generate_logtype_string(interpretation, wildcard_mask_map) + }; + CAPTURE(mask); + REQUIRE(expected_logtype_strings[mask] == logtype_string); + } +} + +// Tests: `process_schema_var_token` +TEST_CASE("process_schema_empty_token ", "[dfa_search]") { + FakeVarDict 
var_dict; + var_dict.add_entry(0, "100"); + + SubQuery sub_query; + VariableQueryToken const static_token{0, "", false}; + REQUIRE(false == clp::GrepCoreTest::process_token(static_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); +} + +TEST_CASE("process_schema_unmatched_token ", "[dfa_search]") { + FakeVarDict var_dict; + var_dict.add_entry(0, "100"); + + SubQuery sub_query; + VariableQueryToken const static_token{0, "200", false}; + REQUIRE(false == clp::GrepCoreTest::process_token(static_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); +} + +TEST_CASE("process_schema_int_token ", "[dfa_search]") { + FakeVarDict var_dict; + var_dict.add_entry(0, "100"); + + SubQuery sub_query; + VariableQueryToken const int_token{0, "100", false}; + REQUIRE(clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(var.is_precise_var()); + REQUIRE(0 == var.get_var_dict_id()); + REQUIRE(var.get_possible_var_dict_ids().empty()); +} + +TEST_CASE("process_schema_encoded_non_greedy_wildcard_token ", "[dfa_search]") { + FakeVarDict var_dict; + var_dict.add_entry(0, "10a0"); + var_dict.add_entry(1, "10b0"); + + SECTION("interpret_as_int") { + SubQuery sub_query; + VariableQueryToken const int_token{0, "10?0", true}; + REQUIRE(clp::GrepCoreTest::process_encoded_token(int_token, var_dict, sub_query)); + REQUIRE(sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); + } + + SECTION("interpret_as_float") { + SubQuery sub_query; + VariableQueryToken const float_token{1, "10?0", true}; + REQUIRE(clp::GrepCoreTest::process_encoded_token(float_token, var_dict, sub_query)); + 
REQUIRE(sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); + } + + SECTION("interpret_as_precise_has_number") { + SubQuery sub_query; + VariableQueryToken const has_number_token{2, "10a?", true}; + REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(var.is_precise_var()); + REQUIRE(0 == var.get_var_dict_id()); + REQUIRE(var.get_possible_var_dict_ids().empty()); + } + + SECTION("interpret_as_imprecise_has_number") { + SubQuery sub_query; + VariableQueryToken const has_number_token{2, "10?0", true}; + REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(2 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } +} + +// NOTE: CLP currently treats all non-encoded variables as the same, so the below test demonstrates +// this. In the future if CLP is more sophisticated, the two sections behave differently. 
+TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token ", "[dfa_search]") { + FakeVarDict var_dict; + var_dict.add_entry(0, "100000000000000000000000010"); + var_dict.add_entry(1, "100000000000000000000000020"); + var_dict.add_entry(2, "100000000000000000000000030"); + var_dict.add_entry(3, "1000000000000000000000000.0"); + var_dict.add_entry(4, "1000000000000000000000000a0"); + + SECTION("interpret_as_int") { + SubQuery sub_query; + VariableQueryToken const int_token{0, "1000000000000000000000000?0", true}; + REQUIRE(clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(5 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } + + SECTION("interpret_as_float") { + SubQuery sub_query; + VariableQueryToken const float_token{1, "1000000000000000000000000?0", true}; + REQUIRE(clp::GrepCoreTest::process_token(float_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(5 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } + + SECTION("interpret_as_has_number") { + SubQuery sub_query; + VariableQueryToken const has_number_token{2, "1000000000000000000000000?0", true}; + REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto 
const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(5 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } +} + +TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { + FakeVarDict var_dict; + var_dict.add_entry(0, "10a0"); + var_dict.add_entry(1, "10b0"); + var_dict.add_entry(2, "100000000000000000000000010"); + var_dict.add_entry(3, "100000000000000000000000020"); + var_dict.add_entry(4, "100000000000000000000000030"); + var_dict.add_entry(5, "1000000000000000000000000.0"); + var_dict.add_entry(6, "1000000000000000000000000a0"); + + SECTION("interpret_as_non_encoded_int") { + SubQuery sub_query; + VariableQueryToken const int_token{0, "10*0", true}; + REQUIRE(clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(7 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } + + SECTION("interpret_as_non_encoded_float") { + SubQuery sub_query; + VariableQueryToken const float_token{0, "10*0", true}; + REQUIRE(clp::GrepCoreTest::process_token(float_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(7 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } + + 
SECTION("interpret_as_non_encoded_imprecise_has_number") { + SubQuery sub_query; + VariableQueryToken const has_number_token{0, "10*0", true}; + REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(7 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } + + SECTION("interpret_as_non_encoded_precise_has_number") { + SubQuery sub_query; + VariableQueryToken const has_number_token{0, "10b*", true}; + REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(var.is_precise_var()); + REQUIRE(1 == var.get_var_dict_id()); + REQUIRE(var.get_possible_var_dict_ids().empty()); + } + + SECTION("interpret_as_encoded_int") { + SubQuery sub_query; + VariableQueryToken const int_token{0, "10*0", true}; + REQUIRE(clp::GrepCoreTest::process_encoded_token(int_token, var_dict, sub_query)); + REQUIRE(sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); + } + + SECTION("interpret_as_encoded_float") { + SubQuery sub_query; + VariableQueryToken const float_token{1, "10*0", true}; + REQUIRE(clp::GrepCoreTest::process_encoded_token(float_token, var_dict, sub_query)); + REQUIRE(sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); + } +} + +// Tests: `generate_schema_sub_queries` +TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { + constexpr uint32_t cFloatId{static_cast(TokenFloat)}; + constexpr uint32_t 
cHasNumberId{100}; + constexpr uint32_t cIntId{static_cast(TokenInt)}; + + FakeVarDict var_dict; + var_dict.add_entry(0, "10a"); + var_dict.add_entry(1, "1a3"); + + FakeLogTypeDict logtype_dict; + + string logtype_string{"static_text "}; + EncodedVariableInterpreter::add_int_var(logtype_string); + logtype_string += " "; + EncodedVariableInterpreter::add_int_var(logtype_string); + logtype_string += " "; + EncodedVariableInterpreter::add_float_var(logtype_string); + logtype_dict.add_entry(logtype_string, 0); + + logtype_string = "static_text "; + EncodedVariableInterpreter::add_int_var(logtype_string); + logtype_string += " "; + EncodedVariableInterpreter::add_dict_var(logtype_string); + logtype_string += " "; + EncodedVariableInterpreter::add_float_var(logtype_string); + logtype_dict.add_entry(logtype_string, 1); + + logtype_string = "static_text "; + EncodedVariableInterpreter::add_int_var(logtype_string); + logtype_string += " "; + EncodedVariableInterpreter::add_dict_var(logtype_string); + logtype_string += " 3.14ab'"; + logtype_dict.add_entry(logtype_string, 2); + + logtype_string = "static_text "; + EncodedVariableInterpreter::add_int_var(logtype_string); + logtype_string += " "; + EncodedVariableInterpreter::add_dict_var(logtype_string); + logtype_string += " 3.15ab'"; + logtype_dict.add_entry(logtype_string, 3); + + logtype_string = "static_text "; + EncodedVariableInterpreter::add_int_var(logtype_string); + logtype_string += " 10' "; + EncodedVariableInterpreter::add_float_var(logtype_string); + logtype_dict.add_entry(logtype_string, 4); + + set interpretations; + + QueryInterpretation interpretation1{}; + interpretation1.append_static_token("static_text "); + interpretation1.append_variable_token(cIntId, "100", false); + interpretation1.append_static_token(" "); + interpretation1.append_variable_token(cIntId, "10?", true); + interpretation1.append_static_token(" "); + interpretation1.append_variable_token(cFloatId, "3.14*", true); + 
interpretations.insert(interpretation1); + + QueryInterpretation interpretation2{}; + interpretation2.append_static_token("static_text "); + interpretation2.append_variable_token(cIntId, "100", false); + interpretation2.append_static_token(" "); + interpretation2.append_variable_token(cIntId, "10?", true); + interpretation2.append_static_token(" "); + interpretation2.append_variable_token(cHasNumberId, "3.14*", true); + interpretations.insert(interpretation2); + + QueryInterpretation interpretation3{}; + interpretation3.append_static_token("static_text "); + interpretation3.append_variable_token(cIntId, "100", false); + interpretation3.append_static_token(" "); + interpretation3.append_variable_token(cIntId, "10?", true); + interpretation3.append_static_token(" 3.14*"); + interpretations.insert(interpretation3); + + QueryInterpretation interpretation4{}; + interpretation4.append_static_token("static_text "); + interpretation4.append_variable_token(cIntId, "100", false); + interpretation4.append_static_token(" "); + interpretation4.append_variable_token(cHasNumberId, "10?", true); + interpretation4.append_static_token(" "); + interpretation4.append_variable_token(cFloatId, "3.14*", true); + interpretations.insert(interpretation4); + + QueryInterpretation interpretation5{}; + interpretation5.append_static_token("static_text "); + interpretation5.append_variable_token(cIntId, "100", false); + interpretation5.append_static_token(" "); + interpretation5.append_variable_token(cHasNumberId, "10?", true); + interpretation5.append_static_token(" "); + interpretation5.append_variable_token(cHasNumberId, "3.14*", true); + interpretations.insert(interpretation5); + + QueryInterpretation interpretation6{}; + interpretation6.append_static_token("static_text "); + interpretation6.append_variable_token(cIntId, "100", false); + interpretation6.append_static_token(" "); + interpretation6.append_variable_token(cHasNumberId, "10?", true); + interpretation6.append_static_token(" 
3.14*"); + interpretations.insert(interpretation6); + + QueryInterpretation interpretation7{}; + interpretation7.append_static_token("static_text "); + interpretation7.append_variable_token(cIntId, "100", false); + interpretation7.append_static_token(" 10? "); + interpretation7.append_variable_token(cFloatId, "3.14*", true); + interpretations.insert(interpretation7); + + QueryInterpretation interpretation8{}; + interpretation8.append_static_token("static_text "); + interpretation8.append_variable_token(cIntId, "100", false); + interpretation8.append_static_token(" 10? "); + interpretation8.append_variable_token(cHasNumberId, "3.14*", true); + interpretations.insert(interpretation8); + + QueryInterpretation interpretation9{}; + interpretation9.append_static_token("static_text "); + interpretation9.append_variable_token(cIntId, "100", false); + interpretation9.append_static_token(" 10? 3.14*"); + interpretations.insert(interpretation9); + + vector sub_queries; + clp::GrepCoreTest::generate_schema_sub_queries( + interpretations, + logtype_dict, + var_dict, + sub_queries + ); + + REQUIRE(6 == sub_queries.size()); + + REQUIRE(sub_queries[0].wildcard_match_required()); + REQUIRE(2 == sub_queries[0].get_num_possible_vars()); + { + auto const& var{sub_queries[0].get_vars()[0]}; + REQUIRE(false == var.is_dict_var()); + REQUIRE(var.is_precise_var()); + } + { + auto const& var{sub_queries[0].get_vars()[1]}; + REQUIRE(var.is_dict_var()); + REQUIRE(var.is_precise_var()); + REQUIRE(0 == var.get_var_dict_id()); + REQUIRE(var.get_possible_var_dict_ids().empty()); + } + { + auto logtype_ids{sub_queries[1].get_possible_logtypes()}; + REQUIRE(1 == logtype_ids.size()); + CAPTURE(logtype_ids); + REQUIRE(logtype_ids.contains(0)); + } + + REQUIRE(sub_queries[1].wildcard_match_required()); + REQUIRE(1 == sub_queries[1].get_num_possible_vars()); + { + auto const& var{sub_queries[1].get_vars()[0]}; + REQUIRE(false == var.is_dict_var()); + REQUIRE(var.is_precise_var()); + } + { + auto 
logtype_ids{sub_queries[1].get_possible_logtypes()}; + REQUIRE(1 == logtype_ids.size()); + CAPTURE(logtype_ids); + REQUIRE(logtype_ids.contains(0)); + } + + REQUIRE(false == sub_queries[2].wildcard_match_required()); + REQUIRE(2 == sub_queries[2].get_num_possible_vars()); + { + auto const& var{sub_queries[2].get_vars()[0]}; + REQUIRE(false == var.is_dict_var()); + REQUIRE(var.is_precise_var()); + } + { + auto const& var{sub_queries[2].get_vars()[1]}; + REQUIRE(var.is_dict_var()); + REQUIRE(var.is_precise_var()); + } + { + auto logtype_ids{sub_queries[2].get_possible_logtypes()}; + REQUIRE(1 == logtype_ids.size()); + CAPTURE(logtype_ids); + REQUIRE(logtype_ids.contains(2)); + } + + REQUIRE(sub_queries[3].wildcard_match_required()); + REQUIRE(2 == sub_queries[3].get_num_possible_vars()); + { + auto const& var{sub_queries[3].get_vars()[0]}; + REQUIRE(false == var.is_dict_var()); + REQUIRE(var.is_precise_var()); + } + { + auto const& var{sub_queries[3].get_vars()[1]}; + REQUIRE(var.is_dict_var()); + REQUIRE(var.is_precise_var()); + } + { + auto logtype_ids{sub_queries[3].get_possible_logtypes()}; + REQUIRE(1 == logtype_ids.size()); + CAPTURE(logtype_ids); + REQUIRE(logtype_ids.contains(1)); + } + + REQUIRE(false == sub_queries[4].wildcard_match_required()); + REQUIRE(2 == sub_queries[4].get_num_possible_vars()); + { + auto const& var{sub_queries[4].get_vars()[0]}; + REQUIRE(false == var.is_dict_var()); + REQUIRE(var.is_precise_var()); + } + { + auto const& var{sub_queries[4].get_vars()[1]}; + REQUIRE(var.is_dict_var()); + REQUIRE(var.is_precise_var()); + } + { + auto logtype_ids{sub_queries[4].get_possible_logtypes()}; + REQUIRE(1 == logtype_ids.size()); + CAPTURE(logtype_ids); + REQUIRE(logtype_ids.contains(2)); + } + + REQUIRE(sub_queries[5].wildcard_match_required()); + REQUIRE(1 == sub_queries[5].get_num_possible_vars()); + { + auto const& var{sub_queries[5].get_vars()[0]}; + REQUIRE(false == var.is_dict_var()); + REQUIRE(var.is_precise_var()); + } + { + auto 
logtype_ids{sub_queries[5].get_possible_logtypes()}; + REQUIRE(1 == logtype_ids.size()); + CAPTURE(logtype_ids); + REQUIRE(logtype_ids.contains(4)); + } +} + +/* +// Tests: `process_raw_query` +TEST_CASE("process_raw_query", "[dfa_search]") { + constexpr uint32_t cFloatId{static_cast(TokenFloat)}; + constexpr uint32_t cHasNumberId{100}; + constexpr uint32_t cIntId{static_cast(TokenInt)}; + + FakeVarDict var_dict; + var_dict.add_entry(0, "10a"); + var_dict.add_entry(1, "1a3"); + + FakeLogTypeDict logtype_dict; + + string logtype_string{"static_text "}; + EncodedVariableInterpreter::add_int_var(logtype_string); + logtype_string += " "; + EncodedVariableInterpreter::add_int_var(logtype_string); + logtype_string += " "; + EncodedVariableInterpreter::add_float_var(logtype_string); + logtype_dict.add_entry(logtype_string, 0); + + logtype_string = "static_text "; + EncodedVariableInterpreter::add_int_var(logtype_string); + logtype_string += " "; + EncodedVariableInterpreter::add_dict_var(logtype_string); + logtype_string += " "; + EncodedVariableInterpreter::add_float_var(logtype_string); + logtype_dict.add_entry(logtype_string, 0); + + logtype_string = "static_text "; + EncodedVariableInterpreter::add_int_var(logtype_string); + logtype_string += " "; + EncodedVariableInterpreter::add_dict_var(logtype_string); + logtype_string += " 3.14ab'"; + logtype_dict.add_entry(logtype_string, 0); + + logtype_string = "static_text "; + EncodedVariableInterpreter::add_int_var(logtype_string); + logtype_string += " "; + EncodedVariableInterpreter::add_dict_var(logtype_string); + logtype_string += " 3.15ab'"; + logtype_dict.add_entry(logtype_string, 0); + + logtype_string = "static_text "; + EncodedVariableInterpreter::add_int_var(logtype_string); + logtype_string += " 10' "; + EncodedVariableInterpreter::add_float_var(logtype_string); + logtype_dict.add_entry(logtype_string, 0); + + string raw_query{"static_text 100 10? 
3.14*"}; + + auto const query{GrepCore::process_raw_query( + logtype_dict, + var_dict, + raw_query, + 0, + 0, + false, + lexer, + false + )}; + + auto const& sub_queries{query.get_sub_queries()}; + REQUIRE(6 == sub_queries.size()); +} +*/ +// Tests: `get_bounds_of_next_potential_var` TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var]") { string str; size_t begin_pos{}; From 9d3e15270fbfd3e0b2634b66513e335d9bb859c4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 26 Sep 2025 10:50:24 -0400 Subject: [PATCH 004/164] Format. --- components/core/src/clp/GrepCore.hpp | 2 +- components/core/tests/test-GrepCore.cpp | 130 +++++++++--------------- 2 files changed, 47 insertions(+), 85 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 8bf38d0d32..64e137bfb7 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -27,7 +27,7 @@ namespace clp { class GrepCoreTest; class GrepCore { -friend class GrepCoreTest; + friend class GrepCoreTest; public: // Methods diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 40ee4b5f26..c23b9fa076 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -10,7 +10,6 @@ #include #include - #include #include #include @@ -46,15 +45,14 @@ using std::vector; class clp::GrepCoreTest { public: - static auto get_wildcard_encodable_positions( - QueryInterpretation const& interpretation - ) -> vector { + static auto get_wildcard_encodable_positions(QueryInterpretation const& interpretation) + -> vector { return GrepCore::get_wildcard_encodable_positions(interpretation); } static auto generate_logtype_string( - QueryInterpretation const& interpretation, - unordered_map const& wildcard_mask_map + QueryInterpretation const& interpretation, + unordered_map const& wildcard_mask_map ) -> string { return 
GrepCore::generate_logtype_string(interpretation, wildcard_mask_map); } @@ -65,13 +63,7 @@ class clp::GrepCoreTest { VariableDictionaryReaderType const& var_dict, SubQuery& sub_query ) -> bool { - return GrepCore::process_schema_var_token( - var_token, - var_dict, - false, - false, - sub_query - ); + return GrepCore::process_schema_var_token(var_token, var_dict, false, false, sub_query); } template @@ -80,13 +72,7 @@ class clp::GrepCoreTest { VariableDictionaryReaderType const& var_dict, SubQuery& sub_query ) -> bool { - return GrepCore::process_schema_var_token( - var_token, - var_dict, - false, - true, - sub_query - ); + return GrepCore::process_schema_var_token(var_token, var_dict, false, true, sub_query); } template < @@ -110,15 +96,15 @@ class clp::GrepCoreTest { namespace { class FakeVarEntry { -public: - explicit FakeVarEntry(variable_dictionary_id_t const id, string value) : - m_id{id}, - m_value{value} {} - - [[nodiscard]] auto get_id() const -> variable_dictionary_id_t {return m_id;} - - [[nodiscard]] auto get_value() const -> string const& {return m_value;} - +public: + explicit FakeVarEntry(variable_dictionary_id_t const id, string value) + : m_id{id}, + m_value{value} {} + + [[nodiscard]] auto get_id() const -> variable_dictionary_id_t { return m_id; } + + [[nodiscard]] auto get_value() const -> string const& { return m_value; } + private: variable_dictionary_id_t m_id; string m_value; @@ -129,24 +115,22 @@ class FakeVarDict { using Entry = FakeVarEntry; using dictionary_id_t = variable_dictionary_id_t; - auto add_entry(dictionary_id_t const id, string value) -> void{ + auto add_entry(dictionary_id_t const id, string value) -> void { m_storage.emplace(id, Entry{id, std::move(value)}); } - + [[nodiscard]] auto get_value(dictionary_id_t const id) const -> string const& { - static const string empty{}; + static string const empty{}; if (m_storage.contains(id)) { return m_storage.at(id).get_value(); } return empty; } - auto get_entry_matching_value( - 
string_view const val, - bool ignore_case - ) const -> vector { + auto get_entry_matching_value(string_view const val, bool ignore_case) const + -> vector { vector results; - for(auto const& [id, entry] : m_storage) { + for (auto const& [id, entry] : m_storage) { if (val == entry.get_value()) { results.push_back(&entry); } @@ -159,7 +143,7 @@ class FakeVarDict { bool ignore_case, unordered_set& results ) const -> void { - for(auto const& [id, entry] : m_storage) { + for (auto const& [id, entry] : m_storage) { if (wildcard_match_unsafe_case_sensitive(entry.get_value(), val)) { results.insert(&entry); } @@ -172,17 +156,13 @@ class FakeVarDict { class FakeLogTypeEntry { public: - FakeLogTypeEntry(string const value, clp::logtype_dictionary_id_t const id) : - m_value(value), - m_id(id) {} - - auto clear() -> void { - m_value.clear(); - } + FakeLogTypeEntry(string const value, clp::logtype_dictionary_id_t const id) + : m_value(value), + m_id(id) {} - auto reserve_constant_length(size_t length) -> void { - m_value.reserve(length); - } + auto clear() -> void { m_value.clear(); } + + auto reserve_constant_length(size_t length) -> void { m_value.reserve(length); } auto parse_next_var(string_view msg, size_t begin, size_t end, string_view& parsed) -> bool { return false; @@ -192,45 +172,30 @@ class FakeLogTypeEntry { m_value.append(msg.substr(begin_pos, length)); } - auto add_int_var() -> void { - EncodedVariableInterpreter::add_int_var(m_value); - } + auto add_int_var() -> void { EncodedVariableInterpreter::add_int_var(m_value); } - auto add_float_var() -> void { - EncodedVariableInterpreter::add_float_var(m_value); - } + auto add_float_var() -> void { EncodedVariableInterpreter::add_float_var(m_value); } - auto add_dictionary_var() -> void { - EncodedVariableInterpreter::add_dict_var(m_value); - } + auto add_dictionary_var() -> void { EncodedVariableInterpreter::add_dict_var(m_value); } - [[nodiscard]] auto get_value() const -> string const& { - return m_value; - } + 
[[nodiscard]] auto get_value() const -> string const& { return m_value; } - [[nodiscard]] auto get_num_variables() const -> size_t { - return 0; - } + [[nodiscard]] auto get_num_variables() const -> size_t { return 0; } - [[nodiscard]] auto get_num_placeholders() const -> size_t { - return 0; - } + [[nodiscard]] auto get_num_placeholders() const -> size_t { return 0; } [[nodiscard]] auto get_placeholder_info(size_t idx, auto& ref) const -> size_t { return SIZE_MAX; } - [[nodiscard]] auto get_id() const -> clp::logtype_dictionary_id_t { - return m_id; - } + [[nodiscard]] auto get_id() const -> clp::logtype_dictionary_id_t { return m_id; } private: string m_value; clp::logtype_dictionary_id_t m_id{0}; }; - -class FakeLogTypeDict{ +class FakeLogTypeDict { public: using Entry = FakeLogTypeEntry; using dictionary_id_t = clp::logtype_dictionary_id_t; @@ -239,12 +204,10 @@ class FakeLogTypeDict{ m_storage.emplace_back(value, id); } - auto get_entry_matching_value( - string_view const logtype, - bool ignore_case - ) const -> vector { + auto get_entry_matching_value(string_view const logtype, bool ignore_case) const + -> vector { vector results; - for(auto const& entry : m_storage) { + for (auto const& entry : m_storage) { if (logtype == entry.get_value()) { results.push_back(&entry); } @@ -257,17 +220,17 @@ class FakeLogTypeDict{ bool ignore_case, unordered_set& results ) const -> void { - for(auto const& entry : m_storage) { + for (auto const& entry : m_storage) { if (wildcard_match_unsafe_case_sensitive(entry.get_value(), logtype)) { results.insert(&entry); } } - } + } private: vector m_storage; }; -} // namespace +} // namespace // Tests: `get_wildcard_encodable_positions` TEST_CASE("get_wildcard_encodable_positions_for_empty_interpretation", "[dfa_search]") { @@ -294,7 +257,6 @@ TEST_CASE("get_wildcard_encodable_positions_for_multi_variable_interpretation", REQUIRE(4 == positions[1]); } - // Tests: `generate_logtype_string` 
TEST_CASE("generate_logtype_string_for_empty_interpretation", "[dfa_search]") { QueryInterpretation const interpretation{}; @@ -674,7 +636,7 @@ TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { logtype_dict.add_entry(logtype_string, 4); set interpretations; - + QueryInterpretation interpretation1{}; interpretation1.append_static_token("static_text "); interpretation1.append_variable_token(cIntId, "100", false); @@ -700,7 +662,7 @@ TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { interpretation3.append_variable_token(cIntId, "10?", true); interpretation3.append_static_token(" 3.14*"); interpretations.insert(interpretation3); - + QueryInterpretation interpretation4{}; interpretation4.append_static_token("static_text "); interpretation4.append_variable_token(cIntId, "100", false); @@ -726,7 +688,7 @@ TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { interpretation6.append_variable_token(cHasNumberId, "10?", true); interpretation6.append_static_token(" 3.14*"); interpretations.insert(interpretation6); - + QueryInterpretation interpretation7{}; interpretation7.append_static_token("static_text "); interpretation7.append_variable_token(cIntId, "100", false); @@ -777,7 +739,7 @@ TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { CAPTURE(logtype_ids); REQUIRE(logtype_ids.contains(0)); } - + REQUIRE(sub_queries[1].wildcard_match_required()); REQUIRE(1 == sub_queries[1].get_num_possible_vars()); { From b587085a5b0ae0a7376dbba1ad8c9fe639783d60 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 26 Sep 2025 11:12:52 -0400 Subject: [PATCH 005/164] Add missing header; Flip cast for comparison to SymbolID over unit32_t. 
--- components/core/src/clp/GrepCore.cpp | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index 267bd67cef..82c83d7776 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -151,11 +152,11 @@ auto GrepCore::get_wildcard_encodable_positions(QueryInterpretation const& inter for (size_t i{0}; i < interpretation.get_logtype().size(); ++i) { auto const token{interpretation.get_logtype()[i]}; if (std::holds_alternative(token)) { - auto const& variable_token{std::get(token)}; - auto const var_type{variable_token.get_variable_type()}; - bool const is_int{static_cast(TokenInt) == var_type}; - bool const is_float{static_cast(TokenFloat) == var_type}; - if (variable_token.get_contains_wildcard() && (is_int || is_float)) { + auto const& var_token{std::get(token)}; + auto const var_type{static_cast(var_token.get_variable_type())}; + bool const is_int{TokenInt == var_type}; + bool const is_float{TokenFloat == var_type}; + if (var_token.get_contains_wildcard() && (is_int || is_float)) { wildcard_encodable_positions.push_back(i); } } @@ -191,11 +192,9 @@ auto GrepCore::generate_logtype_string( auto const& var_token{std::get(token)}; auto const& raw_string{var_token.get_query_substring()}; - auto const var_type{var_token.get_variable_type()}; - - bool const is_int{static_cast(TokenInt) == var_type}; - bool const is_float{static_cast(TokenFloat) == var_type}; - + auto const var_type{static_cast(var_token.get_variable_type())}; + bool const is_int{TokenInt == var_type}; + bool const is_float{TokenFloat == var_type}; if (wildcard_mask_map.contains(i)) { bool const use_encoded{wildcard_mask_map.at(i)}; if (use_encoded) { From 9d5dd94090d1bcb7e26f161dd4851bfd928d2b43 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 26 Sep 2025 11:16:05 -0400 Subject: [PATCH 
006/164] Fix typos. --- components/core/src/clp/GrepCore.cpp | 2 +- components/core/src/clp/GrepCore.hpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index 82c83d7776..d2ad52ce3a 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -182,7 +182,7 @@ auto GrepCore::generate_logtype_string( } logtype_string.reserve(logtype_string_size); - // Generete `logtype_string`. + // Generate `logtype_string`. for (size_t i{0}; i < interpretation.get_logtype().size(); ++i) { auto const token{interpretation.get_logtype()[i]}; if (std::holds_alternative(token)) { diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 64e137bfb7..74569691c3 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -185,7 +185,7 @@ class GrepCore { ); /** - * Scans the interpretation and returns the indicies of all encodable wildcard variables. + * Scans the interpretation and returns the indices of all encodable wildcard variables. * * An encodable variable is a variable token than: * - Contains a wildcard (e.g. *1). From c5bd43b7d5f8efed5913fdc6a4cdb20c19f6591d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 26 Sep 2025 11:17:21 -0400 Subject: [PATCH 007/164] Remove magic number. 
--- components/core/src/clp/GrepCore.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 74569691c3..f9c53ca9aa 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -539,9 +539,10 @@ void GrepCore::generate_schema_sub_queries( bool const ignore_case, std::vector& sub_queries ) { + constexpr size_t cMaxEncodableWildcardVariables{32}; for (auto const& interpretation : interpretations) { auto wildcard_encodable_positions{get_wildcard_encodable_positions(interpretation)}; - if (wildcard_encodable_positions.size() > 32) { + if (wildcard_encodable_positions.size() > cMaxEncodableWildcardVariables) { throw std::runtime_error("Too many encodable variables."); } size_t const num_combos{static_cast(1) << wildcard_encodable_positions.size()}; From d7cfb0ddfbdfce724af4c24fbd4124a5ab0a4625 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 26 Sep 2025 11:19:59 -0400 Subject: [PATCH 008/164] Add missing header; Fix typo; Fix if statement. --- components/core/src/clp/GrepCore.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index f9c53ca9aa..6b84aaecfe 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -204,7 +205,7 @@ class GrepCore { * - 0: Treat as dictionary variable. * - 1: Treat as an encoded variable. * - * @param interpretation The interpetation to convert to a logtype string. + * @param interpretation The interpretation to convert to a logtype string. * @param wildcard_mask_map A map indicating the state of encodable wildcard variables. * @return The logtype string corresponding to this combination of encoded variables. 
*/ @@ -289,7 +290,9 @@ std::optional GrepCore::process_raw_query( // we fall-back to decompression + wildcard matching for those. std::vector ambiguous_tokens; for (auto& query_token : query_tokens) { - if (!query_token.has_greedy_wildcard_in_middle() && query_token.is_ambiguous_token()) { + if (false == query_token.has_greedy_wildcard_in_middle() + && query_token.is_ambiguous_token()) + { ambiguous_tokens.push_back(&query_token); } } From 27ebc0fcd548f23a42b7c4b4e58e5956e4ebd59a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 26 Sep 2025 11:25:40 -0400 Subject: [PATCH 009/164] Remove thread-unsafe statics. --- components/core/src/clp/GrepCore.hpp | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 6b84aaecfe..aabb9010ac 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -342,18 +342,9 @@ std::optional GrepCore::process_raw_query( } } } else { - static bool interpretations_generated{false}; - static std::set interpretations; - - // TODO: This needs to be done for every archive until we have per schema logic. - constexpr bool cExecuteForEveryArchive{true}; - if (cExecuteForEveryArchive || false == interpretations_generated) { - log_surgeon::wildcard_query_parser::Query const query(search_string); - interpretations.clear(); - interpretations = query.get_all_multi_token_interpretations(lexer); - interpretations_generated = true; - } - // Transfrom log-surgeon interpretations into CLP sub-queries. + // TODO: Optimize such that interpretations are only generated once per schema. 
+ log_surgeon::wildcard_query_parser::Query const query{search_string}; + auto const interpretations{query.get_all_multi_token_interpretations(lexer)}; generate_schema_sub_queries( interpretations, logtype_dict, From 7a2f74ec226ac7ec34fbb4a979fd05ee690919f8 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 26 Sep 2025 11:30:09 -0400 Subject: [PATCH 010/164] Use uint64_t with bit shift operator. --- components/core/src/clp/GrepCore.hpp | 6 +++--- components/core/tests/test-GrepCore.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index aabb9010ac..423eec51b1 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -539,11 +539,11 @@ void GrepCore::generate_schema_sub_queries( if (wildcard_encodable_positions.size() > cMaxEncodableWildcardVariables) { throw std::runtime_error("Too many encodable variables."); } - size_t const num_combos{static_cast(1) << wildcard_encodable_positions.size()}; - for (size_t mask{0}; mask < num_combos; ++mask) { + uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; + for (uint64_t mask{0}; mask < num_combos; ++mask) { std::unordered_map wildcard_mask_map; for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { - wildcard_mask_map[wildcard_encodable_positions[i]] = mask >> i & 1; + wildcard_mask_map[wildcard_encodable_positions[i]] = mask >> i & 1ULL; } auto logtype_string{generate_logtype_string(interpretation, wildcard_mask_map)}; diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index c23b9fa076..6ee07acbb3 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -264,12 +264,12 @@ TEST_CASE("generate_logtype_string_for_empty_interpretation", "[dfa_search]") { auto const wildcard_encodable_positions{ clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) }; 
- size_t const num_combos{static_cast(1) << wildcard_encodable_positions.size()}; + uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; REQUIRE(1 == num_combos); - for (size_t mask{0}; mask < num_combos; ++mask) { + for (uint64_t mask{0}; mask < num_combos; ++mask) { std::unordered_map wildcard_mask_map; for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { - wildcard_mask_map[wildcard_encodable_positions[i]] = mask >> i & 1; + wildcard_mask_map[wildcard_encodable_positions[i]] = mask >> i & 1ULL; } auto logtype_string{ clp::GrepCoreTest::generate_logtype_string(interpretation, wildcard_mask_map) From 9848096e437f450b6de002d7e4033fc9cf47a6af Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 26 Sep 2025 11:41:23 -0400 Subject: [PATCH 011/164] Switch to default initialization for all false map. --- components/core/tests/test-GrepCore.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 6ee07acbb3..55ecca2fb6 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -291,7 +291,7 @@ TEST_CASE("generate_logtype_string_for_single_variable_interpretation", "[dfa_se size_t const num_combos{static_cast(1) << wildcard_encodable_positions.size()}; REQUIRE(1 == num_combos); - std::unordered_map const wildcard_mask_map{false}; + std::unordered_map const wildcard_mask_map{}; auto logtype_string{ clp::GrepCoreTest::generate_logtype_string(interpretation, wildcard_mask_map) }; From 3b4572bb0c9f9987ad61d2302d3fdbd357c23cb7 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 07:36:34 -0400 Subject: [PATCH 012/164] Fix bug in string_utils and add to test-string_utils; Refactor unit-testing code for DP algo; Add final unit-test for testing PR end-to-end; Still lots of edge cases and sub-steps that could be tested more rigorously. 
--- components/core/src/clp/GrepCore.hpp | 6 +- .../src/clp/string_utils/string_utils.cpp | 20 +- components/core/tests/test-GrepCore.cpp | 703 +++++++++--------- components/core/tests/test-string_utils.cpp | 30 + 4 files changed, 391 insertions(+), 368 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 423eec51b1..ce57c72120 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -151,9 +151,9 @@ class GrepCore { * Interpretation (one of many): "a (*1) (*2) b" * Possible logtypes (for the above interpretation): * mask 00 -> "a \d \d b" - * mask 01 ->"a \d \f b" - * mask 10 ->"a \i \d b" - * mask 11 ->"a \i \f b" + * mask 01 -> "a \d \f b" + * mask 10 -> "a \i \d b" + * mask 11 -> "a \i \f b" * * B. Each candidate combination becomes a useful subquery only if: * 1. The logtype exists in the logtype dictionary, and diff --git a/components/core/src/clp/string_utils/string_utils.cpp b/components/core/src/clp/string_utils/string_utils.cpp index adf903ab73..6b1a08d82d 100644 --- a/components/core/src/clp/string_utils/string_utils.cpp +++ b/components/core/src/clp/string_utils/string_utils.cpp @@ -14,11 +14,13 @@ namespace { * Helper for ``wildcard_match_unsafe_case_sensitive`` to advance the pointer in * tame to the next character which matches wild. This method should be inlined * for performance. + * + * This method assumes that `wild_current` has no duplicate greedy wildcards ('*'). 
+ * * @param tame_current * @param tame_bookmark * @param tame_end * @param wild_current - * @param wild_bookmark * @return true on success, false if wild cannot match tame */ inline bool advance_tame_to_next_match( @@ -258,10 +260,12 @@ bool wildcard_match_unsafe_case_sensitive(string_view tame, string_view wild) { while (true) { w = *wild_current; if ('*' == w) { - ++wild_current; - if (wild_end == wild_current) { - // Trailing '*' means everything remaining in tame will match - return true; + while ('*' == *wild_current) { + ++wild_current; + if (wild_end == wild_current) { + // Trailing '*' means everything remaining in tame will match + return true; + } } // Set wild and tame bookmarks @@ -309,8 +313,10 @@ bool wildcard_match_unsafe_case_sensitive(string_view tame, string_view wild) { // Handle reaching the end of tame or wild if (tame_end == tame_current) { - return (wild_end == wild_current - || ('*' == *wild_current && (wild_current + 1) == wild_end)); + while (wild_end != wild_current && '*' == *wild_current) { + ++wild_current; + } + return wild_end == wild_current; } else { if (wild_end == wild_current) { if (nullptr == wild_bookmark) { diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 55ecca2fb6..130507b616 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -4,14 +4,18 @@ #include #include #include +#include #include #include #include +#include #include #include #include #include +#include +#include #include #include "../src/clp/Defs.h" @@ -30,17 +34,20 @@ using clp::SubQuery; using clp::variable_dictionary_id_t; using clp::VariableDictionaryReaderReq; using log_surgeon::lexers::ByteLexer; +using log_surgeon::Schema; +using log_surgeon::SchemaVarAST; using log_surgeon::SymbolId::TokenFloat; using log_surgeon::SymbolId::TokenInt; using log_surgeon::wildcard_query_parser::QueryInterpretation; using log_surgeon::wildcard_query_parser::VariableQueryToken; 
-using std::make_unique; +using std::pair; using std::set; using std::string; using std::string_view; -using std::unique_ptr; +using std::tuple; using std::unordered_map; using std::unordered_set; +using std::variant; using std::vector; class clp::GrepCoreTest { @@ -99,7 +106,7 @@ class FakeVarEntry { public: explicit FakeVarEntry(variable_dictionary_id_t const id, string value) : m_id{id}, - m_value{value} {} + m_value{std::move(value)} {} [[nodiscard]] auto get_id() const -> variable_dictionary_id_t { return m_id; } @@ -120,14 +127,14 @@ class FakeVarDict { } [[nodiscard]] auto get_value(dictionary_id_t const id) const -> string const& { - static string const empty{}; + static string const cEmpty{}; if (m_storage.contains(id)) { return m_storage.at(id).get_value(); } - return empty; + return cEmpty; } - auto get_entry_matching_value(string_view const val, bool ignore_case) const + auto get_entry_matching_value(string_view const val, [[maybe_unused]] bool ignore_case) const -> vector { vector results; for (auto const& [id, entry] : m_storage) { @@ -140,7 +147,7 @@ class FakeVarDict { auto get_entries_matching_wildcard_string( string_view const val, - bool ignore_case, + [[maybe_unused]] bool ignore_case, unordered_set& results ) const -> void { for (auto const& [id, entry] : m_storage) { @@ -156,15 +163,20 @@ class FakeVarDict { class FakeLogTypeEntry { public: - FakeLogTypeEntry(string const value, clp::logtype_dictionary_id_t const id) - : m_value(value), + FakeLogTypeEntry(string value, clp::logtype_dictionary_id_t const id) + : m_value(std::move(value)), m_id(id) {} auto clear() -> void { m_value.clear(); } - auto reserve_constant_length(size_t length) -> void { m_value.reserve(length); } + auto reserve_constant_length(size_t const length) -> void { m_value.reserve(length); } - auto parse_next_var(string_view msg, size_t begin, size_t end, string_view& parsed) -> bool { + auto parse_next_var( + [[maybe_unused]] string_view msg, + [[maybe_unused]] size_t begin, 
+ [[maybe_unused]] size_t end, + [[maybe_unused]] string_view& parsed + ) -> bool { return false; } @@ -184,7 +196,10 @@ class FakeLogTypeEntry { [[nodiscard]] auto get_num_placeholders() const -> size_t { return 0; } - [[nodiscard]] auto get_placeholder_info(size_t idx, auto& ref) const -> size_t { + [[nodiscard]] auto get_placeholder_info( + [[maybe_unused]] size_t idx, + [[maybe_unused]] auto& ref + ) const -> size_t { return SIZE_MAX; } @@ -204,7 +219,7 @@ class FakeLogTypeDict { m_storage.emplace_back(value, id); } - auto get_entry_matching_value(string_view const logtype, bool ignore_case) const + auto get_entry_matching_value(string_view const logtype, [[maybe_unused]] bool ignore_case) const -> vector { vector results; for (auto const& entry : m_storage) { @@ -217,7 +232,7 @@ class FakeLogTypeDict { auto get_entries_matching_wildcard_string( string_view const logtype, - bool ignore_case, + [[maybe_unused]] bool ignore_case, unordered_set& results ) const -> void { for (auto const& entry : m_storage) { @@ -230,6 +245,151 @@ class FakeLogTypeDict { private: vector m_storage; }; + +auto make_var_dict(vector> const& entries) -> FakeVarDict; + +auto make_logtype_dict(vector>> const& entries) + -> FakeLogTypeDict; + +auto make_query_interpretation( + vector>> const& tokens +) -> QueryInterpretation; + +auto generate_expected_logtype_string(vector> const& tokens) -> string; + +auto check_sub_query( + size_t id, + vector const& sub_queries, + bool wildcard_match_required, + vector>> const& vars_info, + unordered_set const& logtype_ids +) -> void; + +/** + * Initializes a `ByteLexer` with space as a delimiter and the given `schema_rules`. + * + * @param schema_rules A vector of strings, each string representing a schema rule. + * @return The initialized `ByteLexer`. 
+ */ +auto make_test_lexer(vector const& schema_rules) -> ByteLexer; + +auto make_var_dict(vector> const& entries) -> FakeVarDict { + FakeVarDict dict; + for (auto const& [id, val] : entries) { + dict.add_entry(id, val); + } + return dict; +} + +auto make_logtype_dict(vector>> const& entries) + -> FakeLogTypeDict { + FakeLogTypeDict dict; + clp::logtype_dictionary_id_t id{0}; + for (auto const& entry : entries) { + dict.add_entry(generate_expected_logtype_string(entry), id++); + } + return dict; +} + +auto make_query_interpretation( + vector>> const& tokens +) -> QueryInterpretation { + QueryInterpretation interp; + for (auto const& token : tokens) { + if (holds_alternative(token)) { + interp.append_static_token(get(token)); + } else { + auto const& [symbol, value]{get>(token)}; + auto const contains_wildcard{value.find_first_of("*?") != string::npos}; + interp.append_variable_token(symbol, value, contains_wildcard); + } + } + return interp; +} + +auto generate_expected_logtype_string(vector> const& tokens) -> string { + string result; + for (auto const& token : tokens) { + if (holds_alternative(token)) { + result.append(get(token)); + } else { + switch (get(token)) { + case 'i': EncodedVariableInterpreter::add_int_var(result); break; + case 'f': EncodedVariableInterpreter::add_float_var(result); break; + case 'd': EncodedVariableInterpreter::add_dict_var(result); break; + default: break; + } + } + } + return result; +} + +auto check_sub_query( + size_t id, + vector const& sub_queries, + bool const wildcard_match_required, + vector>> const& vars_info, + unordered_set const& logtype_ids +) -> void { + CAPTURE(id); + auto const& sub_query{sub_queries[id]}; + + REQUIRE(wildcard_match_required == sub_query.wildcard_match_required()); + REQUIRE(vars_info.size() == sub_query.get_num_possible_vars()); + + for (size_t i{0}; i < vars_info.size(); ++i) { + auto const& [is_dict_var, is_precise_var, var_dict_ids]{vars_info[i]}; + auto const& var{sub_query.get_vars()[i]}; + 
REQUIRE(is_dict_var == var.is_dict_var()); + REQUIRE(is_precise_var == var.is_precise_var()); + if (is_dict_var) { + if (is_precise_var) { + REQUIRE(1 == var_dict_ids.size()); + REQUIRE(var_dict_ids.contains(var.get_var_dict_id())); + } else { + REQUIRE(var_dict_ids == var.get_possible_var_dict_ids()); + } + } + } + + REQUIRE(logtype_ids == sub_query.get_possible_logtypes()); +} + +auto make_test_lexer(vector const& schema_rules) -> ByteLexer { + constexpr uint32_t cIntId{static_cast(TokenInt)}; + constexpr uint32_t cFloatId{static_cast(TokenFloat)}; + constexpr uint32_t cHasNumId{111}; + + ByteLexer lexer; + lexer.m_symbol_id["int"] = cIntId; + lexer.m_symbol_id["float"] = cFloatId; + lexer.m_symbol_id["hasNumber"] = cHasNumId; + lexer.m_id_symbol[cIntId] = "int"; + lexer.m_id_symbol[cFloatId] = "float"; + lexer.m_id_symbol[cHasNumId] = "hasNumber"; + lexer.set_delimiters({' '}); + + Schema schema; + for (auto const& schema_rule : schema_rules) { + schema.add_variable(schema_rule, -1); + } + + auto const schema_ast = schema.release_schema_ast_ptr(); + REQUIRE(nullptr != schema_ast); + REQUIRE(schema_rules.size() == schema_ast->m_schema_vars.size()); + for (size_t i{0}; i < schema_ast->m_schema_vars.size(); ++i) { + REQUIRE(nullptr != schema_ast->m_schema_vars[i]); + auto* capture_rule_ast{dynamic_cast(schema_ast->m_schema_vars[i].get())}; + REQUIRE(nullptr != capture_rule_ast); + lexer.add_rule( + lexer.m_symbol_id[capture_rule_ast->m_name], + std::move(capture_rule_ast->m_regex_ptr) + ); + } + + lexer.generate(); + return lexer; +} } // namespace // Tests: `get_wildcard_encodable_positions` @@ -241,15 +401,18 @@ TEST_CASE("get_wildcard_encodable_positions_for_empty_interpretation", "[dfa_sea } TEST_CASE("get_wildcard_encodable_positions_for_multi_variable_interpretation", "[dfa_search]") { - constexpr uint32_t cHasNumberId{100}; + constexpr uint32_t cIntId{static_cast(TokenInt)}; + constexpr uint32_t cFloatId{static_cast(TokenFloat)}; + constexpr uint32_t 
cHasNumId{111}; - QueryInterpretation interpretation{}; - interpretation.append_static_token("static_text"); - interpretation.append_variable_token(static_cast(TokenInt), "100", false); - interpretation.append_variable_token(static_cast(TokenFloat), "32.2", false); - interpretation.append_variable_token(static_cast(TokenInt), "10?", true); - interpretation.append_variable_token(static_cast(TokenFloat), "3.14*", true); - interpretation.append_variable_token(cHasNumberId, "3.14*", true); + auto const interpretation{make_query_interpretation({ + "text", + pair{cIntId,"100"}, + pair{cFloatId,"32.2"}, + pair{cIntId,"10?"}, + pair{cFloatId,"3.14*"}, + pair{cHasNumId,"3.14*"} + })}; auto const positions{clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation)}; REQUIRE(2 == positions.size()); @@ -274,7 +437,7 @@ TEST_CASE("generate_logtype_string_for_empty_interpretation", "[dfa_search]") { auto logtype_string{ clp::GrepCoreTest::generate_logtype_string(interpretation, wildcard_mask_map) }; - REQUIRE("" == logtype_string); + REQUIRE(logtype_string.empty()); } } @@ -299,44 +462,25 @@ TEST_CASE("generate_logtype_string_for_single_variable_interpretation", "[dfa_se } TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_search]") { - constexpr uint32_t cHasNumberId{100}; - - vector expected_logtype_strings; - expected_logtype_strings.push_back("static_text"); - EncodedVariableInterpreter::add_int_var(expected_logtype_strings.back()); - EncodedVariableInterpreter::add_float_var(expected_logtype_strings.back()); - EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); - EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); - EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); - - expected_logtype_strings.push_back("static_text"); - EncodedVariableInterpreter::add_int_var(expected_logtype_strings.back()); - EncodedVariableInterpreter::add_float_var(expected_logtype_strings.back()); 
- EncodedVariableInterpreter::add_int_var(expected_logtype_strings.back()); - EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); - EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); - - expected_logtype_strings.push_back("static_text"); - EncodedVariableInterpreter::add_int_var(expected_logtype_strings.back()); - EncodedVariableInterpreter::add_float_var(expected_logtype_strings.back()); - EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); - EncodedVariableInterpreter::add_float_var(expected_logtype_strings.back()); - EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); - - expected_logtype_strings.push_back("static_text"); - EncodedVariableInterpreter::add_int_var(expected_logtype_strings.back()); - EncodedVariableInterpreter::add_float_var(expected_logtype_strings.back()); - EncodedVariableInterpreter::add_int_var(expected_logtype_strings.back()); - EncodedVariableInterpreter::add_float_var(expected_logtype_strings.back()); - EncodedVariableInterpreter::add_dict_var(expected_logtype_strings.back()); - - QueryInterpretation interpretation{}; - interpretation.append_static_token("static_text"); - interpretation.append_variable_token(static_cast(TokenInt), "100", false); - interpretation.append_variable_token(static_cast(TokenFloat), "32.2", false); - interpretation.append_variable_token(static_cast(TokenInt), "10?", true); - interpretation.append_variable_token(static_cast(TokenFloat), "3.14*", true); - interpretation.append_variable_token(cHasNumberId, "3.14*", true); + constexpr uint32_t cIntId{static_cast(TokenInt)}; + constexpr uint32_t cFloatId{static_cast(TokenFloat)}; + constexpr uint32_t cHasNumId{111}; + + unordered_set const expected_logtype_strings{{ + {generate_expected_logtype_string({"text", 'i', 'f', 'd', 'd', 'd'})}, + {generate_expected_logtype_string({"text", 'i', 'f', 'i', 'd', 'd'})}, + {generate_expected_logtype_string({"text", 'i', 'f', 'd', 'f', 
'd'})}, + {generate_expected_logtype_string({"text", 'i', 'f', 'i', 'f', 'd'})} + }}; + + auto const interpretation{make_query_interpretation({ + "text", + pair{cIntId,"100"}, + pair{cFloatId,"32.2"}, + pair{cIntId,"10?"}, + pair{cFloatId,"3.14*"}, + pair{cHasNumId,"3.14*"} + })}; auto const wildcard_encodable_positions{ clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) @@ -344,23 +488,22 @@ TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_sea size_t const num_combos{static_cast(1) << wildcard_encodable_positions.size()}; REQUIRE(num_combos == 4); + unordered_set logtype_strings; for (size_t mask{0}; mask < num_combos; ++mask) { unordered_map wildcard_mask_map; for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { wildcard_mask_map[wildcard_encodable_positions[i]] = mask >> i & 1; } - auto logtype_string{ + logtype_strings.insert( clp::GrepCoreTest::generate_logtype_string(interpretation, wildcard_mask_map) - }; - CAPTURE(mask); - REQUIRE(expected_logtype_strings[mask] == logtype_string); + ); } + REQUIRE(expected_logtype_strings == logtype_strings); } // Tests: `process_schema_var_token` TEST_CASE("process_schema_empty_token ", "[dfa_search]") { - FakeVarDict var_dict; - var_dict.add_entry(0, "100"); + FakeVarDict const var_dict{make_var_dict({pair{0, "100"}})}; SubQuery sub_query; VariableQueryToken const static_token{0, "", false}; @@ -370,8 +513,7 @@ TEST_CASE("process_schema_empty_token ", "[dfa_search]") { } TEST_CASE("process_schema_unmatched_token ", "[dfa_search]") { - FakeVarDict var_dict; - var_dict.add_entry(0, "100"); + FakeVarDict const var_dict{make_var_dict({pair{0, "100"}})}; SubQuery sub_query; VariableQueryToken const static_token{0, "200", false}; @@ -381,8 +523,7 @@ TEST_CASE("process_schema_unmatched_token ", "[dfa_search]") { } TEST_CASE("process_schema_int_token ", "[dfa_search]") { - FakeVarDict var_dict; - var_dict.add_entry(0, "100"); + FakeVarDict const 
var_dict{make_var_dict({pair{0, "100"}})}; SubQuery sub_query; VariableQueryToken const int_token{0, "100", false}; @@ -397,9 +538,7 @@ TEST_CASE("process_schema_int_token ", "[dfa_search]") { } TEST_CASE("process_schema_encoded_non_greedy_wildcard_token ", "[dfa_search]") { - FakeVarDict var_dict; - var_dict.add_entry(0, "10a0"); - var_dict.add_entry(1, "10b0"); + FakeVarDict const var_dict{make_var_dict({pair{0, "10a0"}, pair{1, "10b0"}})}; SECTION("interpret_as_int") { SubQuery sub_query; @@ -449,12 +588,14 @@ TEST_CASE("process_schema_encoded_non_greedy_wildcard_token ", "[dfa_search]") { // NOTE: CLP currently treats all non-encoded variables as the same, so the below test demonstrates // this. In the future if CLP is more sophisticated, the two sections behave differently. TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token ", "[dfa_search]") { - FakeVarDict var_dict; - var_dict.add_entry(0, "100000000000000000000000010"); - var_dict.add_entry(1, "100000000000000000000000020"); - var_dict.add_entry(2, "100000000000000000000000030"); - var_dict.add_entry(3, "1000000000000000000000000.0"); - var_dict.add_entry(4, "1000000000000000000000000a0"); + size_t id{0}; + FakeVarDict const var_dict{make_var_dict({ + pair{id++, "100000000000000000000000010"}, + pair{id++, "100000000000000000000000020"}, + pair{id++, "100000000000000000000000030"}, + pair{id++, "1000000000000000000000000.0"}, + pair{id++, "1000000000000000000000000a0"} + })}; SECTION("interpret_as_int") { SubQuery sub_query; @@ -503,14 +644,16 @@ TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token ", "[dfa_search] } TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { - FakeVarDict var_dict; - var_dict.add_entry(0, "10a0"); - var_dict.add_entry(1, "10b0"); - var_dict.add_entry(2, "100000000000000000000000010"); - var_dict.add_entry(3, "100000000000000000000000020"); - var_dict.add_entry(4, "100000000000000000000000030"); - var_dict.add_entry(5, 
"1000000000000000000000000.0"); - var_dict.add_entry(6, "1000000000000000000000000a0"); + size_t id{0}; + FakeVarDict const var_dict{make_var_dict({ + pair{id++, "10a0"}, + pair{id++, "10b0"}, + pair{id++, "100000000000000000000000010"}, + pair{id++, "100000000000000000000000020"}, + pair{id++, "100000000000000000000000030"}, + pair{id++, "1000000000000000000000000.0"}, + pair{id++, "1000000000000000000000000a0"} + })}; SECTION("interpret_as_non_encoded_int") { SubQuery sub_query; @@ -589,125 +732,36 @@ TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { // Tests: `generate_schema_sub_queries` TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { - constexpr uint32_t cFloatId{static_cast(TokenFloat)}; - constexpr uint32_t cHasNumberId{100}; constexpr uint32_t cIntId{static_cast(TokenInt)}; - - FakeVarDict var_dict; - var_dict.add_entry(0, "10a"); - var_dict.add_entry(1, "1a3"); - - FakeLogTypeDict logtype_dict; - - string logtype_string{"static_text "}; - EncodedVariableInterpreter::add_int_var(logtype_string); - logtype_string += " "; - EncodedVariableInterpreter::add_int_var(logtype_string); - logtype_string += " "; - EncodedVariableInterpreter::add_float_var(logtype_string); - logtype_dict.add_entry(logtype_string, 0); - - logtype_string = "static_text "; - EncodedVariableInterpreter::add_int_var(logtype_string); - logtype_string += " "; - EncodedVariableInterpreter::add_dict_var(logtype_string); - logtype_string += " "; - EncodedVariableInterpreter::add_float_var(logtype_string); - logtype_dict.add_entry(logtype_string, 1); - - logtype_string = "static_text "; - EncodedVariableInterpreter::add_int_var(logtype_string); - logtype_string += " "; - EncodedVariableInterpreter::add_dict_var(logtype_string); - logtype_string += " 3.14ab'"; - logtype_dict.add_entry(logtype_string, 2); - - logtype_string = "static_text "; - EncodedVariableInterpreter::add_int_var(logtype_string); - logtype_string += " "; - 
EncodedVariableInterpreter::add_dict_var(logtype_string); - logtype_string += " 3.15ab'"; - logtype_dict.add_entry(logtype_string, 3); - - logtype_string = "static_text "; - EncodedVariableInterpreter::add_int_var(logtype_string); - logtype_string += " 10' "; - EncodedVariableInterpreter::add_float_var(logtype_string); - logtype_dict.add_entry(logtype_string, 4); - + constexpr uint32_t cFloatId{static_cast(TokenFloat)}; + constexpr uint32_t cHasNumId{111}; + + FakeVarDict const var_dict{make_var_dict({pair{0, "10a"}, pair{1, "1a3"}})}; + FakeLogTypeDict const logtype_dict{make_logtype_dict({ + {"text ", 'i', " ", 'i', " ", 'f'}, + {"text ", 'i', " ", 'd', " ", 'f'}, + {"text ", 'i', " ", 'd', " 3.14ab$"}, + {"text ", 'i', " ", 'd', " 3.14abc$"}, + {"text ", 'i', " ", 'd', " 3.15ab$"}, + {"text ", 'i', " 10$ ", 'f'} + })}; + + using V = pair; + vector>> raw_interpretations{ + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cFloatId," 3.14*"}}, + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cHasNumId, "3.14*"}}, + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " 3.14*"}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cFloatId," 3.14*"}}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cHasNumId, "3.14*"}}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " 3.14*"}, + {"text ", V{cIntId, "100"}, " 10? ", V{cFloatId," 3.14*"}}, + {"text ", V{cIntId, "100"}, " 10? ", V{cHasNumId, "3.14*"}}, + {"text ", V{cIntId, "100"}, " 10? 
3.14*"} + }; set interpretations; - - QueryInterpretation interpretation1{}; - interpretation1.append_static_token("static_text "); - interpretation1.append_variable_token(cIntId, "100", false); - interpretation1.append_static_token(" "); - interpretation1.append_variable_token(cIntId, "10?", true); - interpretation1.append_static_token(" "); - interpretation1.append_variable_token(cFloatId, "3.14*", true); - interpretations.insert(interpretation1); - - QueryInterpretation interpretation2{}; - interpretation2.append_static_token("static_text "); - interpretation2.append_variable_token(cIntId, "100", false); - interpretation2.append_static_token(" "); - interpretation2.append_variable_token(cIntId, "10?", true); - interpretation2.append_static_token(" "); - interpretation2.append_variable_token(cHasNumberId, "3.14*", true); - interpretations.insert(interpretation2); - - QueryInterpretation interpretation3{}; - interpretation3.append_static_token("static_text "); - interpretation3.append_variable_token(cIntId, "100", false); - interpretation3.append_static_token(" "); - interpretation3.append_variable_token(cIntId, "10?", true); - interpretation3.append_static_token(" 3.14*"); - interpretations.insert(interpretation3); - - QueryInterpretation interpretation4{}; - interpretation4.append_static_token("static_text "); - interpretation4.append_variable_token(cIntId, "100", false); - interpretation4.append_static_token(" "); - interpretation4.append_variable_token(cHasNumberId, "10?", true); - interpretation4.append_static_token(" "); - interpretation4.append_variable_token(cFloatId, "3.14*", true); - interpretations.insert(interpretation4); - - QueryInterpretation interpretation5{}; - interpretation5.append_static_token("static_text "); - interpretation5.append_variable_token(cIntId, "100", false); - interpretation5.append_static_token(" "); - interpretation5.append_variable_token(cHasNumberId, "10?", true); - interpretation5.append_static_token(" "); - 
interpretation5.append_variable_token(cHasNumberId, "3.14*", true); - interpretations.insert(interpretation5); - - QueryInterpretation interpretation6{}; - interpretation6.append_static_token("static_text "); - interpretation6.append_variable_token(cIntId, "100", false); - interpretation6.append_static_token(" "); - interpretation6.append_variable_token(cHasNumberId, "10?", true); - interpretation6.append_static_token(" 3.14*"); - interpretations.insert(interpretation6); - - QueryInterpretation interpretation7{}; - interpretation7.append_static_token("static_text "); - interpretation7.append_variable_token(cIntId, "100", false); - interpretation7.append_static_token(" 10? "); - interpretation7.append_variable_token(cFloatId, "3.14*", true); - interpretations.insert(interpretation7); - - QueryInterpretation interpretation8{}; - interpretation8.append_static_token("static_text "); - interpretation8.append_variable_token(cIntId, "100", false); - interpretation8.append_static_token(" 10? "); - interpretation8.append_variable_token(cHasNumberId, "3.14*", true); - interpretations.insert(interpretation8); - - QueryInterpretation interpretation9{}; - interpretation9.append_static_token("static_text "); - interpretation9.append_variable_token(cIntId, "100", false); - interpretation9.append_static_token(" 10? 3.14*"); - interpretations.insert(interpretation9); + for (auto const& raw_interpretation : raw_interpretations) { + interpretations.insert(make_query_interpretation(raw_interpretation)); + } vector sub_queries; clp::GrepCoreTest::generate_schema_sub_queries( @@ -717,165 +771,89 @@ TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { sub_queries ); + using Info = tuple>; REQUIRE(6 == sub_queries.size()); + size_t i{0}; + // NOTE: sub queries 0 and 2 are a duplicate of 3 and 5 because we use a vector instead of a set + // when storing `m_sub_queries` in `Query`. 
+ check_sub_query(i++, sub_queries, true, {Info{false, true, {}}, Info{true, true, {0}}}, {1}); + check_sub_query(i++, sub_queries, true, {Info{false, true, {}}}, {0}); + check_sub_query(i++, sub_queries, false, {Info{false, true, {}}, Info{true, true, {0}}}, {2,3}); + check_sub_query(i++, sub_queries, true, {Info{false, true, {}}, Info{true, true, {0}}}, {1}); + check_sub_query(i++, sub_queries, false, {Info{false, true, {}}, Info{true, true, {0}}}, {2,3}); + check_sub_query(i++, sub_queries, true, {Info{false, true, {}}}, {5}); +} - REQUIRE(sub_queries[0].wildcard_match_required()); - REQUIRE(2 == sub_queries[0].get_num_possible_vars()); - { - auto const& var{sub_queries[0].get_vars()[0]}; - REQUIRE(false == var.is_dict_var()); - REQUIRE(var.is_precise_var()); - } - { - auto const& var{sub_queries[0].get_vars()[1]}; - REQUIRE(var.is_dict_var()); - REQUIRE(var.is_precise_var()); - REQUIRE(0 == var.get_var_dict_id()); - REQUIRE(var.get_possible_var_dict_ids().empty()); - } - { - auto logtype_ids{sub_queries[1].get_possible_logtypes()}; - REQUIRE(1 == logtype_ids.size()); - CAPTURE(logtype_ids); - REQUIRE(logtype_ids.contains(0)); - } - - REQUIRE(sub_queries[1].wildcard_match_required()); - REQUIRE(1 == sub_queries[1].get_num_possible_vars()); - { - auto const& var{sub_queries[1].get_vars()[0]}; - REQUIRE(false == var.is_dict_var()); - REQUIRE(var.is_precise_var()); - } - { - auto logtype_ids{sub_queries[1].get_possible_logtypes()}; - REQUIRE(1 == logtype_ids.size()); - CAPTURE(logtype_ids); - REQUIRE(logtype_ids.contains(0)); - } - - REQUIRE(false == sub_queries[2].wildcard_match_required()); - REQUIRE(2 == sub_queries[2].get_num_possible_vars()); - { - auto const& var{sub_queries[2].get_vars()[0]}; - REQUIRE(false == var.is_dict_var()); - REQUIRE(var.is_precise_var()); - } - { - auto const& var{sub_queries[2].get_vars()[1]}; - REQUIRE(var.is_dict_var()); - REQUIRE(var.is_precise_var()); - } - { - auto logtype_ids{sub_queries[2].get_possible_logtypes()}; - 
REQUIRE(1 == logtype_ids.size()); - CAPTURE(logtype_ids); - REQUIRE(logtype_ids.contains(2)); - } - - REQUIRE(sub_queries[3].wildcard_match_required()); - REQUIRE(2 == sub_queries[3].get_num_possible_vars()); - { - auto const& var{sub_queries[3].get_vars()[0]}; - REQUIRE(false == var.is_dict_var()); - REQUIRE(var.is_precise_var()); - } - { - auto const& var{sub_queries[3].get_vars()[1]}; - REQUIRE(var.is_dict_var()); - REQUIRE(var.is_precise_var()); - } - { - auto logtype_ids{sub_queries[3].get_possible_logtypes()}; - REQUIRE(1 == logtype_ids.size()); - CAPTURE(logtype_ids); - REQUIRE(logtype_ids.contains(1)); +TEST_CASE("generate_schema_sub_queries_with_wildcard_duplication", "[dfa_search]") { + constexpr uint32_t cIntId{static_cast(TokenInt)}; + constexpr uint32_t cFloatId{static_cast(TokenFloat)}; + constexpr uint32_t cHasNumId{111}; + + FakeVarDict const var_dict{make_var_dict({pair{0, "10a"}, pair{1, "1a3"}})}; + FakeLogTypeDict const logtype_dict{make_logtype_dict({ + {"text ", 'i', " ", 'i', " ", 'f'}, + {"text ", 'i', " ", 'd', " ", 'f'}, + {"text ", 'i', " ", 'd', " 3.14ab$"}, + {"text ", 'i', " ", 'd', " 3.14abc$"}, + {"text ", 'i', " ", 'd', " 3.15ab$"}, + {"text ", 'i', " 10$ ", 'f'} + })}; + + using V = pair; + vector>> raw_interpretations{ + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cFloatId," 3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cHasNumId, "3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " 3.14**"}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cFloatId," 3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cHasNumId, "3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " 3.14**"}, + {"text ", V{cIntId, "100"}, " 10? ", V{cFloatId," 3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " 10? ", V{cHasNumId, "3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " 10? 
3.14**"} + }; + set interpretations; + for (auto const& raw_interpretation : raw_interpretations) { + interpretations.insert(make_query_interpretation(raw_interpretation)); } - REQUIRE(false == sub_queries[4].wildcard_match_required()); - REQUIRE(2 == sub_queries[4].get_num_possible_vars()); - { - auto const& var{sub_queries[4].get_vars()[0]}; - REQUIRE(false == var.is_dict_var()); - REQUIRE(var.is_precise_var()); - } - { - auto const& var{sub_queries[4].get_vars()[1]}; - REQUIRE(var.is_dict_var()); - REQUIRE(var.is_precise_var()); - } - { - auto logtype_ids{sub_queries[4].get_possible_logtypes()}; - REQUIRE(1 == logtype_ids.size()); - CAPTURE(logtype_ids); - REQUIRE(logtype_ids.contains(2)); - } + vector sub_queries; + clp::GrepCoreTest::generate_schema_sub_queries( + interpretations, + logtype_dict, + var_dict, + sub_queries + ); - REQUIRE(sub_queries[5].wildcard_match_required()); - REQUIRE(1 == sub_queries[5].get_num_possible_vars()); - { - auto const& var{sub_queries[5].get_vars()[0]}; - REQUIRE(false == var.is_dict_var()); - REQUIRE(var.is_precise_var()); - } - { - auto logtype_ids{sub_queries[5].get_possible_logtypes()}; - REQUIRE(1 == logtype_ids.size()); - CAPTURE(logtype_ids); - REQUIRE(logtype_ids.contains(4)); - } + using Info = tuple>; + REQUIRE(6 == sub_queries.size()); + size_t i{0}; + check_sub_query(i++, sub_queries, true, {Info{false, true, {}}, Info{true, true, {0}}}, {1}); + check_sub_query(i++, sub_queries, true, {Info{false, true, {}}}, {0}); + check_sub_query(i++, sub_queries, false, {Info{false, true, {}}, Info{true, true, {0}}}, {2,3}); + check_sub_query(i++, sub_queries, true, {Info{false, true, {}}, Info{true, true, {0}}}, {1}); + check_sub_query(i++, sub_queries, false, {Info{false, true, {}}, Info{true, true, {0}}}, {2,3}); + check_sub_query(i++, sub_queries, true, {Info{false, true, {}}}, {5}); } -/* // Tests: `process_raw_query` TEST_CASE("process_raw_query", "[dfa_search]") { - constexpr uint32_t cFloatId{static_cast(TokenFloat)}; - 
constexpr uint32_t cHasNumberId{100}; - constexpr uint32_t cIntId{static_cast(TokenInt)}; - - FakeVarDict var_dict; - var_dict.add_entry(0, "10a"); - var_dict.add_entry(1, "1a3"); - - FakeLogTypeDict logtype_dict; - - string logtype_string{"static_text "}; - EncodedVariableInterpreter::add_int_var(logtype_string); - logtype_string += " "; - EncodedVariableInterpreter::add_int_var(logtype_string); - logtype_string += " "; - EncodedVariableInterpreter::add_float_var(logtype_string); - logtype_dict.add_entry(logtype_string, 0); - - logtype_string = "static_text "; - EncodedVariableInterpreter::add_int_var(logtype_string); - logtype_string += " "; - EncodedVariableInterpreter::add_dict_var(logtype_string); - logtype_string += " "; - EncodedVariableInterpreter::add_float_var(logtype_string); - logtype_dict.add_entry(logtype_string, 0); - - logtype_string = "static_text "; - EncodedVariableInterpreter::add_int_var(logtype_string); - logtype_string += " "; - EncodedVariableInterpreter::add_dict_var(logtype_string); - logtype_string += " 3.14ab'"; - logtype_dict.add_entry(logtype_string, 0); - - logtype_string = "static_text "; - EncodedVariableInterpreter::add_int_var(logtype_string); - logtype_string += " "; - EncodedVariableInterpreter::add_dict_var(logtype_string); - logtype_string += " 3.15ab'"; - logtype_dict.add_entry(logtype_string, 0); - - logtype_string = "static_text "; - EncodedVariableInterpreter::add_int_var(logtype_string); - logtype_string += " 10' "; - EncodedVariableInterpreter::add_float_var(logtype_string); - logtype_dict.add_entry(logtype_string, 0); - - string raw_query{"static_text 100 10? 
3.14*"}; + auto lexer{make_test_lexer({ + {R"(int:(\d+))"}, + {R"(float:(\d+\.\d+))"}, + {R"(hasNumber:[^ $]*\d+[^ $]*)"} + })}; + + FakeVarDict const var_dict{make_var_dict({pair{0, "10a"}, pair{1, "1a3"}})}; + FakeLogTypeDict const logtype_dict{make_logtype_dict({ + {"text ", 'i', " ", 'i', " ", 'f'}, + {"text ", 'i', " ", 'd', " ", 'f'}, + {"text ", 'i', " ", 'd', " 3.14ab$"}, + {"text ", 'i', " ", 'd', " 3.14abc$"}, + {"text ", 'i', " ", 'd', " 3.15ab$"}, + {"text ", 'i', " 10$ ", 'f'} + })}; + + string const raw_query{"text 100 10? 3.14*"}; auto const query{GrepCore::process_raw_query( logtype_dict, @@ -883,15 +861,24 @@ TEST_CASE("process_raw_query", "[dfa_search]") { raw_query, 0, 0, - false, + true, lexer, false )}; - auto const& sub_queries{query.get_sub_queries()}; + REQUIRE(query.has_value()); + + using Info = tuple>; + auto const& sub_queries{query.value().get_sub_queries()}; REQUIRE(6 == sub_queries.size()); + size_t i{0}; + check_sub_query(i++, sub_queries, true, {Info{false, true, {}}, Info{true, true, {0}}}, {1}); + check_sub_query(i++, sub_queries, true, {Info{false, true, {}}}, {0}); + check_sub_query(i++, sub_queries, false, {Info{false, true, {}}, Info{true, true, {0}}}, {2,3}); + check_sub_query(i++, sub_queries, true, {Info{false, true, {}}, Info{true, true, {0}}}, {1}); + check_sub_query(i++, sub_queries, false, {Info{false, true, {}}, Info{true, true, {0}}}, {2,3}); + check_sub_query(i++, sub_queries, true, {Info{false, true, {}}}, {5}); } -*/ // Tests: `get_bounds_of_next_potential_var` TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var]") { diff --git a/components/core/tests/test-string_utils.cpp b/components/core/tests/test-string_utils.cpp index e168a704c2..0219a84989 100644 --- a/components/core/tests/test-string_utils.cpp +++ b/components/core/tests/test-string_utils.cpp @@ -145,30 +145,60 @@ SCENARIO("Test case sensitive wild card match in all possible ways", "[wildcard] 
REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); } + GIVEN("Double wild with no suffix char") { + tameString = "abcd", wildString = "a**"; + REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); + } + GIVEN("Single wild with no prefix char") { tameString = "abcd", wildString = "*d"; REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); } + GIVEN("Double wild with no prefix char") { + tameString = "abcd", wildString = "**d"; + REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); + } + GIVEN("Single wild on both side & has 1st char as literal") { tameString = "abcd", wildString = "*a*"; REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); } + GIVEN("Double wild on both side & has 1st char as literal") { + tameString = "abcd", wildString = "**a**"; + REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); + } + GIVEN("Single wild on both side & has middle char as literal") { tameString = "abcd", wildString = "*b*"; REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); } + GIVEN("Double wild on both side & has middle char as literal") { + tameString = "abcd", wildString = "**b**"; + REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); + } + GIVEN("Single wild on both side & has last char as literal") { tameString = "abcd", wildString = "*d*"; REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); } + GIVEN("Double wild on both side & has last char as literal") { + tameString = "abcd", wildString = "**d**"; + REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); + } + GIVEN("Single wild only") { tameString = "abcd", wildString = "*"; REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); } + + GIVEN("Double wild only") { + tameString = "abcd", wildString = "**"; + 
REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); + } } WHEN("Match is expected if Wild card character is \"?\"") { From b6339802de6d0c01f49a5250e60aee02b97fe49f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 07:49:47 -0400 Subject: [PATCH 013/164] Format. --- .../src/clp/string_utils/string_utils.cpp | 4 +- components/core/tests/test-GrepCore.cpp | 235 +++++++++--------- components/core/tests/test-string_utils.cpp | 8 +- 3 files changed, 121 insertions(+), 126 deletions(-) diff --git a/components/core/src/clp/string_utils/string_utils.cpp b/components/core/src/clp/string_utils/string_utils.cpp index 6b1a08d82d..854312a587 100644 --- a/components/core/src/clp/string_utils/string_utils.cpp +++ b/components/core/src/clp/string_utils/string_utils.cpp @@ -263,8 +263,8 @@ bool wildcard_match_unsafe_case_sensitive(string_view tame, string_view wild) { while ('*' == *wild_current) { ++wild_current; if (wild_end == wild_current) { - // Trailing '*' means everything remaining in tame will match - return true; + // Trailing '*' means everything remaining in tame will match + return true; } } diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 130507b616..1ebcbb2ebd 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -196,10 +196,8 @@ class FakeLogTypeEntry { [[nodiscard]] auto get_num_placeholders() const -> size_t { return 0; } - [[nodiscard]] auto get_placeholder_info( - [[maybe_unused]] size_t idx, - [[maybe_unused]] auto& ref - ) const -> size_t { + [[nodiscard]] auto + get_placeholder_info([[maybe_unused]] size_t idx, [[maybe_unused]] auto& ref) const -> size_t { return SIZE_MAX; } @@ -219,7 +217,8 @@ class FakeLogTypeDict { m_storage.emplace_back(value, id); } - auto get_entry_matching_value(string_view const logtype, [[maybe_unused]] bool ignore_case) const + auto + get_entry_matching_value(string_view const logtype, 
[[maybe_unused]] bool ignore_case) const -> vector { vector results; for (auto const& entry : m_storage) { @@ -251,9 +250,8 @@ auto make_var_dict(vector> const& entries) -> FakeVarDict; auto make_logtype_dict(vector>> const& entries) -> FakeLogTypeDict; -auto make_query_interpretation( - vector>> const& tokens -) -> QueryInterpretation; +auto make_query_interpretation(vector>> const& tokens) + -> QueryInterpretation; auto generate_expected_logtype_string(vector> const& tokens) -> string; @@ -291,9 +289,8 @@ auto make_logtype_dict(vector>> const& entries return dict; } -auto make_query_interpretation( - vector>> const& tokens -) -> QueryInterpretation { +auto make_query_interpretation(vector>> const& tokens) + -> QueryInterpretation { QueryInterpretation interp; for (auto const& token : tokens) { if (holds_alternative(token)) { @@ -314,10 +311,17 @@ auto generate_expected_logtype_string(vector> const& result.append(get(token)); } else { switch (get(token)) { - case 'i': EncodedVariableInterpreter::add_int_var(result); break; - case 'f': EncodedVariableInterpreter::add_float_var(result); break; - case 'd': EncodedVariableInterpreter::add_dict_var(result); break; - default: break; + case 'i': + EncodedVariableInterpreter::add_int_var(result); + break; + case 'f': + EncodedVariableInterpreter::add_float_var(result); + break; + case 'd': + EncodedVariableInterpreter::add_dict_var(result); + break; + default: + break; } } } @@ -405,14 +409,14 @@ TEST_CASE("get_wildcard_encodable_positions_for_multi_variable_interpretation", constexpr uint32_t cFloatId{static_cast(TokenFloat)}; constexpr uint32_t cHasNumId{111}; - auto const interpretation{make_query_interpretation({ - "text", - pair{cIntId,"100"}, - pair{cFloatId,"32.2"}, - pair{cIntId,"10?"}, - pair{cFloatId,"3.14*"}, - pair{cHasNumId,"3.14*"} - })}; + auto const interpretation{make_query_interpretation( + {"text", + pair{cIntId, "100"}, + pair{cFloatId, "32.2"}, + pair{cIntId, "10?"}, + pair{cFloatId, "3.14*"}, + 
pair{cHasNumId, "3.14*"}} + )}; auto const positions{clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation)}; REQUIRE(2 == positions.size()); @@ -466,21 +470,21 @@ TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_sea constexpr uint32_t cFloatId{static_cast(TokenFloat)}; constexpr uint32_t cHasNumId{111}; - unordered_set const expected_logtype_strings{{ - {generate_expected_logtype_string({"text", 'i', 'f', 'd', 'd', 'd'})}, - {generate_expected_logtype_string({"text", 'i', 'f', 'i', 'd', 'd'})}, - {generate_expected_logtype_string({"text", 'i', 'f', 'd', 'f', 'd'})}, - {generate_expected_logtype_string({"text", 'i', 'f', 'i', 'f', 'd'})} - }}; - - auto const interpretation{make_query_interpretation({ - "text", - pair{cIntId,"100"}, - pair{cFloatId,"32.2"}, - pair{cIntId,"10?"}, - pair{cFloatId,"3.14*"}, - pair{cHasNumId,"3.14*"} - })}; + unordered_set const expected_logtype_strings{ + {{generate_expected_logtype_string({"text", 'i', 'f', 'd', 'd', 'd'})}, + {generate_expected_logtype_string({"text", 'i', 'f', 'i', 'd', 'd'})}, + {generate_expected_logtype_string({"text", 'i', 'f', 'd', 'f', 'd'})}, + {generate_expected_logtype_string({"text", 'i', 'f', 'i', 'f', 'd'})}} + }; + + auto const interpretation{make_query_interpretation( + {"text", + pair{cIntId, "100"}, + pair{cFloatId, "32.2"}, + pair{cIntId, "10?"}, + pair{cFloatId, "3.14*"}, + pair{cHasNumId, "3.14*"}} + )}; auto const wildcard_encodable_positions{ clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) @@ -589,13 +593,13 @@ TEST_CASE("process_schema_encoded_non_greedy_wildcard_token ", "[dfa_search]") { // this. In the future if CLP is more sophisticated, the two sections behave differently. 
TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token ", "[dfa_search]") { size_t id{0}; - FakeVarDict const var_dict{make_var_dict({ - pair{id++, "100000000000000000000000010"}, - pair{id++, "100000000000000000000000020"}, - pair{id++, "100000000000000000000000030"}, - pair{id++, "1000000000000000000000000.0"}, - pair{id++, "1000000000000000000000000a0"} - })}; + FakeVarDict const var_dict{make_var_dict( + {pair{id++, "100000000000000000000000010"}, + pair{id++, "100000000000000000000000020"}, + pair{id++, "100000000000000000000000030"}, + pair{id++, "1000000000000000000000000.0"}, + pair{id++, "1000000000000000000000000a0"}} + )}; SECTION("interpret_as_int") { SubQuery sub_query; @@ -645,15 +649,15 @@ TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token ", "[dfa_search] TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { size_t id{0}; - FakeVarDict const var_dict{make_var_dict({ - pair{id++, "10a0"}, - pair{id++, "10b0"}, - pair{id++, "100000000000000000000000010"}, - pair{id++, "100000000000000000000000020"}, - pair{id++, "100000000000000000000000030"}, - pair{id++, "1000000000000000000000000.0"}, - pair{id++, "1000000000000000000000000a0"} - })}; + FakeVarDict const var_dict{make_var_dict( + {pair{id++, "10a0"}, + pair{id++, "10b0"}, + pair{id++, "100000000000000000000000010"}, + pair{id++, "100000000000000000000000020"}, + pair{id++, "100000000000000000000000030"}, + pair{id++, "1000000000000000000000000.0"}, + pair{id++, "1000000000000000000000000a0"}} + )}; SECTION("interpret_as_non_encoded_int") { SubQuery sub_query; @@ -737,24 +741,24 @@ TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { constexpr uint32_t cHasNumId{111}; FakeVarDict const var_dict{make_var_dict({pair{0, "10a"}, pair{1, "1a3"}})}; - FakeLogTypeDict const logtype_dict{make_logtype_dict({ - {"text ", 'i', " ", 'i', " ", 'f'}, - {"text ", 'i', " ", 'd', " ", 'f'}, - {"text ", 'i', " ", 'd', " 3.14ab$"}, - {"text ", 'i', " ", 'd', " 3.14abc$"}, - 
{"text ", 'i', " ", 'd', " 3.15ab$"}, - {"text ", 'i', " 10$ ", 'f'} - })}; + FakeLogTypeDict const logtype_dict{make_logtype_dict( + {{"text ", 'i', " ", 'i', " ", 'f'}, + {"text ", 'i', " ", 'd', " ", 'f'}, + {"text ", 'i', " ", 'd', " 3.14ab$"}, + {"text ", 'i', " ", 'd', " 3.14abc$"}, + {"text ", 'i', " ", 'd', " 3.15ab$"}, + {"text ", 'i', " 10$ ", 'f'}} + )}; using V = pair; vector>> raw_interpretations{ - {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cFloatId," 3.14*"}}, + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cFloatId, " 3.14*"}}, {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cHasNumId, "3.14*"}}, {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " 3.14*"}, - {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cFloatId," 3.14*"}}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cFloatId, " 3.14*"}}, {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cHasNumId, "3.14*"}}, {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " 3.14*"}, - {"text ", V{cIntId, "100"}, " 10? ", V{cFloatId," 3.14*"}}, + {"text ", V{cIntId, "100"}, " 10? ", V{cFloatId, " 3.14*"}}, {"text ", V{cIntId, "100"}, " 10? ", V{cHasNumId, "3.14*"}}, {"text ", V{cIntId, "100"}, " 10? 3.14*"} }; @@ -771,17 +775,17 @@ TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { sub_queries ); - using Info = tuple>; + using Var = tuple>; REQUIRE(6 == sub_queries.size()); size_t i{0}; // NOTE: sub queries 0 and 2 are a duplicate of 3 and 5 because we use a vector instead of a set // when storing `m_sub_queries` in `Query`. 
- check_sub_query(i++, sub_queries, true, {Info{false, true, {}}, Info{true, true, {0}}}, {1}); - check_sub_query(i++, sub_queries, true, {Info{false, true, {}}}, {0}); - check_sub_query(i++, sub_queries, false, {Info{false, true, {}}, Info{true, true, {0}}}, {2,3}); - check_sub_query(i++, sub_queries, true, {Info{false, true, {}}, Info{true, true, {0}}}, {1}); - check_sub_query(i++, sub_queries, false, {Info{false, true, {}}, Info{true, true, {0}}}, {2,3}); - check_sub_query(i++, sub_queries, true, {Info{false, true, {}}}, {5}); + check_sub_query(i++, sub_queries, true, {Var{false, true, {}}, Var{true, true, {0}}}, {1}); + check_sub_query(i++, sub_queries, true, {Var{false, true, {}}}, {0}); + check_sub_query(i++, sub_queries, false, {Var{false, true, {}}, Var{true, true, {0}}}, {2, 3}); + check_sub_query(i++, sub_queries, true, {Var{false, true, {}}, Var{true, true, {0}}}, {1}); + check_sub_query(i++, sub_queries, false, {Var{false, true, {}}, Var{true, true, {0}}}, {2, 3}); + check_sub_query(i++, sub_queries, true, {Var{false, true, {}}}, {5}); } TEST_CASE("generate_schema_sub_queries_with_wildcard_duplication", "[dfa_search]") { @@ -790,24 +794,24 @@ TEST_CASE("generate_schema_sub_queries_with_wildcard_duplication", "[dfa_search] constexpr uint32_t cHasNumId{111}; FakeVarDict const var_dict{make_var_dict({pair{0, "10a"}, pair{1, "1a3"}})}; - FakeLogTypeDict const logtype_dict{make_logtype_dict({ - {"text ", 'i', " ", 'i', " ", 'f'}, - {"text ", 'i', " ", 'd', " ", 'f'}, - {"text ", 'i', " ", 'd', " 3.14ab$"}, - {"text ", 'i', " ", 'd', " 3.14abc$"}, - {"text ", 'i', " ", 'd', " 3.15ab$"}, - {"text ", 'i', " 10$ ", 'f'} - })}; + FakeLogTypeDict const logtype_dict{make_logtype_dict( + {{"text ", 'i', " ", 'i', " ", 'f'}, + {"text ", 'i', " ", 'd', " ", 'f'}, + {"text ", 'i', " ", 'd', " 3.14ab$"}, + {"text ", 'i', " ", 'd', " 3.14abc$"}, + {"text ", 'i', " ", 'd', " 3.15ab$"}, + {"text ", 'i', " 10$ ", 'f'}} + )}; using V = pair; vector>> raw_interpretations{ - 
{"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cFloatId," 3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cFloatId, " 3.14*"}, "*"}, {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cHasNumId, "3.14*"}, "*"}, {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " 3.14**"}, - {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cFloatId," 3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cFloatId, " 3.14*"}, "*"}, {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cHasNumId, "3.14*"}, "*"}, {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " 3.14**"}, - {"text ", V{cIntId, "100"}, " 10? ", V{cFloatId," 3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " 10? ", V{cFloatId, " 3.14*"}, "*"}, {"text ", V{cIntId, "100"}, " 10? ", V{cHasNumId, "3.14*"}, "*"}, {"text ", V{cIntId, "100"}, " 10? 3.14**"} }; @@ -824,60 +828,51 @@ TEST_CASE("generate_schema_sub_queries_with_wildcard_duplication", "[dfa_search] sub_queries ); - using Info = tuple>; + using Var = tuple>; REQUIRE(6 == sub_queries.size()); size_t i{0}; - check_sub_query(i++, sub_queries, true, {Info{false, true, {}}, Info{true, true, {0}}}, {1}); - check_sub_query(i++, sub_queries, true, {Info{false, true, {}}}, {0}); - check_sub_query(i++, sub_queries, false, {Info{false, true, {}}, Info{true, true, {0}}}, {2,3}); - check_sub_query(i++, sub_queries, true, {Info{false, true, {}}, Info{true, true, {0}}}, {1}); - check_sub_query(i++, sub_queries, false, {Info{false, true, {}}, Info{true, true, {0}}}, {2,3}); - check_sub_query(i++, sub_queries, true, {Info{false, true, {}}}, {5}); + check_sub_query(i++, sub_queries, true, {Var{false, true, {}}, Var{true, true, {0}}}, {1}); + check_sub_query(i++, sub_queries, true, {Var{false, true, {}}}, {0}); + check_sub_query(i++, sub_queries, false, {Var{false, true, {}}, Var{true, true, {0}}}, {2, 3}); + check_sub_query(i++, sub_queries, true, {Var{false, true, {}}, Var{true, true, {0}}}, 
{1}); + check_sub_query(i++, sub_queries, false, {Var{false, true, {}}, Var{true, true, {0}}}, {2, 3}); + check_sub_query(i++, sub_queries, true, {Var{false, true, {}}}, {5}); } // Tests: `process_raw_query` TEST_CASE("process_raw_query", "[dfa_search]") { - auto lexer{make_test_lexer({ - {R"(int:(\d+))"}, - {R"(float:(\d+\.\d+))"}, - {R"(hasNumber:[^ $]*\d+[^ $]*)"} - })}; + auto lexer{make_test_lexer( + {{R"(int:(\d+))"}, {R"(float:(\d+\.\d+))"}, {R"(hasNumber:[^ $]*\d+[^ $]*)"}} + )}; FakeVarDict const var_dict{make_var_dict({pair{0, "10a"}, pair{1, "1a3"}})}; - FakeLogTypeDict const logtype_dict{make_logtype_dict({ - {"text ", 'i', " ", 'i', " ", 'f'}, - {"text ", 'i', " ", 'd', " ", 'f'}, - {"text ", 'i', " ", 'd', " 3.14ab$"}, - {"text ", 'i', " ", 'd', " 3.14abc$"}, - {"text ", 'i', " ", 'd', " 3.15ab$"}, - {"text ", 'i', " 10$ ", 'f'} - })}; + FakeLogTypeDict const logtype_dict{make_logtype_dict( + {{"text ", 'i', " ", 'i', " ", 'f'}, + {"text ", 'i', " ", 'd', " ", 'f'}, + {"text ", 'i', " ", 'd', " 3.14ab$"}, + {"text ", 'i', " ", 'd', " 3.14abc$"}, + {"text ", 'i', " ", 'd', " 3.15ab$"}, + {"text ", 'i', " 10$ ", 'f'}} + )}; string const raw_query{"text 100 10? 
3.14*"}; - auto const query{GrepCore::process_raw_query( - logtype_dict, - var_dict, - raw_query, - 0, - 0, - true, - lexer, - false - )}; + auto const query{ + GrepCore::process_raw_query(logtype_dict, var_dict, raw_query, 0, 0, true, lexer, false) + }; REQUIRE(query.has_value()); - using Info = tuple>; + using Var = tuple>; auto const& sub_queries{query.value().get_sub_queries()}; REQUIRE(6 == sub_queries.size()); size_t i{0}; - check_sub_query(i++, sub_queries, true, {Info{false, true, {}}, Info{true, true, {0}}}, {1}); - check_sub_query(i++, sub_queries, true, {Info{false, true, {}}}, {0}); - check_sub_query(i++, sub_queries, false, {Info{false, true, {}}, Info{true, true, {0}}}, {2,3}); - check_sub_query(i++, sub_queries, true, {Info{false, true, {}}, Info{true, true, {0}}}, {1}); - check_sub_query(i++, sub_queries, false, {Info{false, true, {}}, Info{true, true, {0}}}, {2,3}); - check_sub_query(i++, sub_queries, true, {Info{false, true, {}}}, {5}); + check_sub_query(i++, sub_queries, true, {Var{false, true, {}}, Var{true, true, {0}}}, {1}); + check_sub_query(i++, sub_queries, true, {Var{false, true, {}}}, {0}); + check_sub_query(i++, sub_queries, false, {Var{false, true, {}}, Var{true, true, {0}}}, {2, 3}); + check_sub_query(i++, sub_queries, true, {Var{false, true, {}}, Var{true, true, {0}}}, {1}); + check_sub_query(i++, sub_queries, false, {Var{false, true, {}}, Var{true, true, {0}}}, {2, 3}); + check_sub_query(i++, sub_queries, true, {Var{false, true, {}}}, {5}); } // Tests: `get_bounds_of_next_potential_var` diff --git a/components/core/tests/test-string_utils.cpp b/components/core/tests/test-string_utils.cpp index 0219a84989..86a32a9397 100644 --- a/components/core/tests/test-string_utils.cpp +++ b/components/core/tests/test-string_utils.cpp @@ -146,8 +146,8 @@ SCENARIO("Test case sensitive wild card match in all possible ways", "[wildcard] } GIVEN("Double wild with no suffix char") { - tameString = "abcd", wildString = "a**"; - 
REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); + tameString = "abcd", wildString = "a**"; + REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); } GIVEN("Single wild with no prefix char") { @@ -156,8 +156,8 @@ SCENARIO("Test case sensitive wild card match in all possible ways", "[wildcard] } GIVEN("Double wild with no prefix char") { - tameString = "abcd", wildString = "**d"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); + tameString = "abcd", wildString = "**d"; + REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); } GIVEN("Single wild on both side & has 1st char as literal") { From 62f23b029ade7ceb0752d6895ef13bf4e2c2cb34 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 08:17:16 -0400 Subject: [PATCH 014/164] Add flags to hide test helper in production. --- components/core/CMakeLists.txt | 3 +++ components/core/src/clp/GrepCore.hpp | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index d75ccd7b6c..5121cde1f9 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -771,4 +771,7 @@ if(CLP_BUILD_TESTING) target_compile_features(unitTest PRIVATE cxx_std_20 ) + target_compile_definitions(unitTest + PRIVATE CLP_BUILD_TESTING + ) endif() diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index ce57c72120..b295e4e087 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -25,10 +25,14 @@ #include "VariableDictionaryReaderReq.hpp" namespace clp { +#ifdef CLP_BUILD_TESTING class GrepCoreTest; +#endif class GrepCore { +#ifdef CLP_BUILD_TESTING friend class GrepCoreTest; +#endif public: // Methods From 8b093a0734ce7f5bf49713c5ee0f73f367aa68ab Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 08:19:56 -0400 Subject: [PATCH 015/164] Cast to SymbolId 
instead of uint32_t. --- components/core/src/clp/GrepCore.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index b295e4e087..91f3849ebc 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -611,10 +611,9 @@ auto GrepCore::process_schema_var_token( ) -> bool { auto const& raw_string{variable_token.get_query_substring()}; auto const var_has_wildcard{variable_token.get_contains_wildcard()}; - auto const var_type{variable_token.get_variable_type()}; - - bool const is_int{static_cast(log_surgeon::SymbolId::TokenInt) == var_type}; - bool const is_float{static_cast(log_surgeon::SymbolId::TokenFloat) == var_type}; + auto const var_type{static_cast(variable_token.get_variable_type())}; + bool const is_int{log_surgeon::SymbolId::TokenInt == var_type}; + bool const is_float{log_surgeon::SymbolId::TokenFloat == var_type}; if (is_wildcard_mask_encoded) { sub_query.mark_wildcard_match_required(); From 49dc9257d1455ea642e76079787d62ead9774f5d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 08:21:27 -0400 Subject: [PATCH 016/164] Add missing include. --- components/core/src/clp/GrepCore.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index d2ad52ce3a..ddbf3a9292 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -10,6 +10,7 @@ #include #include +#include "EncodedVariableInterpreter.hpp" #include "ir/parsing.hpp" #include "StringReader.hpp" From 7b737541cc46997c090a8d1eb29e8821cfd611e4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 08:27:01 -0400 Subject: [PATCH 017/164] Avoid multiple get_logtype() calls; Reserve size. 
--- components/core/src/clp/GrepCore.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index ddbf3a9292..467034df4c 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -149,9 +149,12 @@ bool GrepCore::get_bounds_of_next_potential_var( auto GrepCore::get_wildcard_encodable_positions(QueryInterpretation const& interpretation) -> vector { + auto const logtype{interpretation.get_logtype()}; vector wildcard_encodable_positions; - for (size_t i{0}; i < interpretation.get_logtype().size(); ++i) { - auto const token{interpretation.get_logtype()[i]}; + wildcard_encodable_positions.reserve(logtype.size()); + + for (size_t i{0}; i < logtype.size(); ++i) { + auto const token{logtype[i]}; if (std::holds_alternative(token)) { auto const& var_token{std::get(token)}; auto const var_type{static_cast(var_token.get_variable_type())}; From c8f90460153429dcc76e50622eedce1fe1d29feb Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 08:35:09 -0400 Subject: [PATCH 018/164] Remove more get_logtype() calls and make tokens be references into the now stored logtype as it will no longer dangle. 
--- components/core/src/clp/GrepCore.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index 467034df4c..1fb5396907 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -154,7 +154,7 @@ auto GrepCore::get_wildcard_encodable_positions(QueryInterpretation const& inter wildcard_encodable_positions.reserve(logtype.size()); for (size_t i{0}; i < logtype.size(); ++i) { - auto const token{logtype[i]}; + auto const& token{logtype[i]}; if (std::holds_alternative(token)) { auto const& var_token{std::get(token)}; auto const var_type{static_cast(var_token.get_variable_type())}; @@ -176,7 +176,8 @@ auto GrepCore::generate_logtype_string( // Reserve size for `logtype_string`. size_t logtype_string_size{0}; - for (auto const token : interpretation.get_logtype()) { + auto const logtype{interpretation.get_logtype()}; + for (auto const& token : logtype) { if (std::holds_alternative(token)) { auto const& static_token{std::get(token)}; logtype_string_size += static_token.get_query_substring().size(); @@ -187,8 +188,8 @@ auto GrepCore::generate_logtype_string( logtype_string.reserve(logtype_string_size); // Generate `logtype_string`. - for (size_t i{0}; i < interpretation.get_logtype().size(); ++i) { - auto const token{interpretation.get_logtype()[i]}; + for (size_t i{0}; i < logtype.size(); ++i) { + auto const& token{logtype[i]}; if (std::holds_alternative(token)) { logtype_string += std::get(token).get_query_substring(); continue; From 9eba2d5d5283e6ef741e79fd2b5cf814023c4f7e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 08:39:15 -0400 Subject: [PATCH 019/164] Switch from contains to find to avoid double lookup. 
--- components/core/src/clp/GrepCore.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index 1fb5396907..530f7879bf 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -200,9 +200,10 @@ auto GrepCore::generate_logtype_string( auto const var_type{static_cast(var_token.get_variable_type())}; bool const is_int{TokenInt == var_type}; bool const is_float{TokenFloat == var_type}; - if (wildcard_mask_map.contains(i)) { - bool const use_encoded{wildcard_mask_map.at(i)}; - if (use_encoded) { + + auto const it{wildcard_mask_map.find(i)}; + if (wildcard_mask_map.end() != it) { + if (it->second) { if (is_int) { EncodedVariableInterpreter::add_int_var(logtype_string); } else { From 41157863e6a553e19966285e7e5f8befbedcd171 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 08:40:43 -0400 Subject: [PATCH 020/164] Fix typos. --- components/core/src/clp/GrepCore.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 91f3849ebc..ebc7a5cd3b 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -192,12 +192,12 @@ class GrepCore { /** * Scans the interpretation and returns the indices of all encodable wildcard variables. * - * An encodable variable is a variable token than: + * An encodable variable is a variable token that: * - Contains a wildcard (e.g. *1). * - Is of an encodable type (integer or float). * * @param interpretation The `QueryInterpretation` to scan. - * @return A vector of positions of encodabe wildcard variables. + * @return A vector of positions of encodable wildcard variables. 
*/ static auto get_wildcard_encodable_positions( log_surgeon::wildcard_query_parser::QueryInterpretation const& interpretation From 8aa356a1588e58b607fa4bc9774f32f42353722d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 08:45:49 -0400 Subject: [PATCH 021/164] Reduce mask size to 16 bits; Update documentation to match. --- components/core/src/clp/GrepCore.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index ebc7a5cd3b..7fefad9f35 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -147,8 +147,8 @@ class GrepCore { * - 0: treat as a dictionary variable (\d) * - 1: treat as an encoded variable (\i for integers, \f for floats) * - * If there are k encodable wildcard variables, then 2^k logtype strings are possible. Each - * bit in the mask corresponds to one variable. + * If there are k encodable wildcard variables, then 2^k logtype strings are possible. As a + * result we limit k <= 16. We represents these alternatives using a bit mask. * * Example: * Search query: "a *1 *2 b", @@ -537,7 +537,7 @@ void GrepCore::generate_schema_sub_queries( bool const ignore_case, std::vector& sub_queries ) { - constexpr size_t cMaxEncodableWildcardVariables{32}; + constexpr size_t cMaxEncodableWildcardVariables{16}; for (auto const& interpretation : interpretations) { auto wildcard_encodable_positions{get_wildcard_encodable_positions(interpretation)}; if (wildcard_encodable_positions.size() > cMaxEncodableWildcardVariables) { From e5afe04cabe83017a12eb3d67e8649e199eb55f0 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 08:47:42 -0400 Subject: [PATCH 022/164] Use proper type for bit mask.
--- components/core/tests/test-GrepCore.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 1ebcbb2ebd..a8bedc4c7d 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -455,7 +455,7 @@ TEST_CASE("generate_logtype_string_for_single_variable_interpretation", "[dfa_se auto const wildcard_encodable_positions{ clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) }; - size_t const num_combos{static_cast(1) << wildcard_encodable_positions.size()}; + uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; REQUIRE(1 == num_combos); std::unordered_map const wildcard_mask_map{}; @@ -490,7 +490,7 @@ TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_sea clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) }; - size_t const num_combos{static_cast(1) << wildcard_encodable_positions.size()}; + uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; REQUIRE(num_combos == 4); unordered_set logtype_strings; for (size_t mask{0}; mask < num_combos; ++mask) { From 7352fd22dcd490a846f251c3cb5bb9d75d33db79 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 08:57:51 -0400 Subject: [PATCH 023/164] Define cross-test constants in a single place; Make sure constants are used wherever possible. 
--- components/core/tests/test-GrepCore.cpp | 32 +++++++------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index a8bedc4c7d..120c200c50 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -50,6 +50,10 @@ using std::unordered_set; using std::variant; using std::vector; +constexpr uint32_t cIntId{static_cast(TokenInt)}; +constexpr uint32_t cFloatId{static_cast(TokenFloat)}; +constexpr uint32_t cHasNumId{111}; + class clp::GrepCoreTest { public: static auto get_wildcard_encodable_positions(QueryInterpretation const& interpretation) @@ -360,10 +364,6 @@ auto check_sub_query( } auto make_test_lexer(vector const& schema_rules) -> ByteLexer { - constexpr uint32_t cIntId{static_cast(TokenInt)}; - constexpr uint32_t cFloatId{static_cast(TokenFloat)}; - constexpr uint32_t cHasNumId{111}; - ByteLexer lexer; lexer.m_symbol_id["int"] = cIntId; lexer.m_symbol_id["float"] = cFloatId; @@ -405,10 +405,6 @@ TEST_CASE("get_wildcard_encodable_positions_for_empty_interpretation", "[dfa_sea } TEST_CASE("get_wildcard_encodable_positions_for_multi_variable_interpretation", "[dfa_search]") { - constexpr uint32_t cIntId{static_cast(TokenInt)}; - constexpr uint32_t cFloatId{static_cast(TokenFloat)}; - constexpr uint32_t cHasNumId{111}; - auto const interpretation{make_query_interpretation( {"text", pair{cIntId, "100"}, @@ -466,10 +462,6 @@ TEST_CASE("generate_logtype_string_for_single_variable_interpretation", "[dfa_se } TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_search]") { - constexpr uint32_t cIntId{static_cast(TokenInt)}; - constexpr uint32_t cFloatId{static_cast(TokenFloat)}; - constexpr uint32_t cHasNumId{111}; - unordered_set const expected_logtype_strings{ {{generate_expected_logtype_string({"text", 'i', 'f', 'd', 'd', 'd'})}, {generate_expected_logtype_string({"text", 'i', 'f', 'i', 'd', 
'd'})}, @@ -546,7 +538,7 @@ TEST_CASE("process_schema_encoded_non_greedy_wildcard_token ", "[dfa_search]") { SECTION("interpret_as_int") { SubQuery sub_query; - VariableQueryToken const int_token{0, "10?0", true}; + VariableQueryToken const int_token{cIntId, "10?0", true}; REQUIRE(clp::GrepCoreTest::process_encoded_token(int_token, var_dict, sub_query)); REQUIRE(sub_query.wildcard_match_required()); REQUIRE(0 == sub_query.get_num_possible_vars()); @@ -554,7 +546,7 @@ TEST_CASE("process_schema_encoded_non_greedy_wildcard_token ", "[dfa_search]") { SECTION("interpret_as_float") { SubQuery sub_query; - VariableQueryToken const float_token{1, "10?0", true}; + VariableQueryToken const float_token{cFloatId, "10?0", true}; REQUIRE(clp::GrepCoreTest::process_encoded_token(float_token, var_dict, sub_query)); REQUIRE(sub_query.wildcard_match_required()); REQUIRE(0 == sub_query.get_num_possible_vars()); @@ -562,7 +554,7 @@ TEST_CASE("process_schema_encoded_non_greedy_wildcard_token ", "[dfa_search]") { SECTION("interpret_as_precise_has_number") { SubQuery sub_query; - VariableQueryToken const has_number_token{2, "10a?", true}; + VariableQueryToken const has_number_token{cHasNumId, "10a?", true}; REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); REQUIRE(false == sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); @@ -575,7 +567,7 @@ TEST_CASE("process_schema_encoded_non_greedy_wildcard_token ", "[dfa_search]") { SECTION("interpret_as_imprecise_has_number") { SubQuery sub_query; - VariableQueryToken const has_number_token{2, "10?0", true}; + VariableQueryToken const has_number_token{cHasNumId, "10?0", true}; REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); REQUIRE(false == sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); @@ -736,10 +728,6 @@ TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { // Tests: 
`generate_schema_sub_queries` TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { - constexpr uint32_t cIntId{static_cast(TokenInt)}; - constexpr uint32_t cFloatId{static_cast(TokenFloat)}; - constexpr uint32_t cHasNumId{111}; - FakeVarDict const var_dict{make_var_dict({pair{0, "10a"}, pair{1, "1a3"}})}; FakeLogTypeDict const logtype_dict{make_logtype_dict( {{"text ", 'i', " ", 'i', " ", 'f'}, @@ -789,10 +777,6 @@ TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { } TEST_CASE("generate_schema_sub_queries_with_wildcard_duplication", "[dfa_search]") { - constexpr uint32_t cIntId{static_cast(TokenInt)}; - constexpr uint32_t cFloatId{static_cast(TokenFloat)}; - constexpr uint32_t cHasNumId{111}; - FakeVarDict const var_dict{make_var_dict({pair{0, "10a"}, pair{1, "1a3"}})}; FakeLogTypeDict const logtype_dict{make_logtype_dict( {{"text ", 'i', " ", 'i', " ", 'f'}, From 80354f37ba9fa59529ebbff39ad08c36a47311d4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 09:02:02 -0400 Subject: [PATCH 024/164] Reserve var dict size. --- components/core/src/clp/GrepCore.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 7fefad9f35..dc627b6f00 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -657,6 +657,7 @@ auto GrepCore::process_schema_var_token( std::unordered_set encoded_vars; std::unordered_set var_dict_ids; encoded_vars.reserve(entries.size()); + var_dict_ids.reserve(entries.size()); for (auto const* entry : entries) { encoded_vars.emplace(EncodedVariableInterpreter::encode_var_dict_id(entry->get_id())); var_dict_ids.emplace(entry->get_id()); From 1680f330d07c04007b5be80d40092f286955c936 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 09:04:18 -0400 Subject: [PATCH 025/164] Cache logtype and use references into it. 
--- components/core/src/clp/GrepCore.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index dc627b6f00..0d189a6f8f 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -564,8 +564,9 @@ void GrepCore::generate_schema_sub_queries( SubQuery sub_query; bool has_vars{true}; - for (size_t i{0}; i < interpretation.get_logtype().size(); ++i) { - auto const token{interpretation.get_logtype()[i]}; + auto const logtype{interpretation.get_logtype()}; + for (size_t i{0}; i < logtype.size(); ++i) { + auto const& token{logtype[i]}; if (std::holds_alternative( token )) From 5bc40ff3fd3be5ed576dd100d20c191ead4ab137 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 09:09:29 -0400 Subject: [PATCH 026/164] Replace contain with find to avoid double lookup. --- components/core/src/clp/GrepCore.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 0d189a6f8f..3cb2498e4a 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -572,8 +572,9 @@ void GrepCore::generate_schema_sub_queries( )) { bool is_wildcard_mask_encoded{false}; - if (wildcard_mask_map.contains(i)) { - is_wildcard_mask_encoded = wildcard_mask_map.at(i); + auto const it{wildcard_mask_map.find(i)}; + if (wildcard_mask_map.end() != it) { + is_wildcard_mask_encoded = it->second; } has_vars = process_schema_var_token( From decca7e027e4893a860df516453cec1fc9962e0e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 09:19:40 -0400 Subject: [PATCH 027/164] Reserve possible_logtype_ids. 
--- components/core/src/clp/GrepCore.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 3cb2498e4a..72b5a7c66f 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -594,6 +594,7 @@ void GrepCore::generate_schema_sub_queries( } std::unordered_set possible_logtype_ids; + possible_logtype_ids.reserve(logtype_entries.size()); for (auto const* entry : logtype_entries) { possible_logtype_ids.emplace(entry->get_id()); } From 26c8e49de046b0206a151cd76ef73024fefda12d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 09:21:17 -0400 Subject: [PATCH 028/164] Use correct types for bit mask. --- components/core/tests/test-GrepCore.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 120c200c50..b4e7e2315a 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -485,10 +485,10 @@ TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_sea uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; REQUIRE(num_combos == 4); unordered_set logtype_strings; - for (size_t mask{0}; mask < num_combos; ++mask) { + for (uint64_t mask{0}; mask < num_combos; ++mask) { unordered_map wildcard_mask_map; for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { - wildcard_mask_map[wildcard_encodable_positions[i]] = mask >> i & 1; + wildcard_mask_map[wildcard_encodable_positions[i]] = mask >> i & 1ULL; } logtype_strings.insert( clp::GrepCoreTest::generate_logtype_string(interpretation, wildcard_mask_map) From 0908106a12cd2f5dab50495ece8c0fac9c79a717 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 09:25:49 -0400 Subject: [PATCH 029/164] Use constants in place of magic numbers. 
--- components/core/tests/test-GrepCore.cpp | 28 ++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index b4e7e2315a..7eb0125706 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -502,8 +502,8 @@ TEST_CASE("process_schema_empty_token ", "[dfa_search]") { FakeVarDict const var_dict{make_var_dict({pair{0, "100"}})}; SubQuery sub_query; - VariableQueryToken const static_token{0, "", false}; - REQUIRE(false == clp::GrepCoreTest::process_token(static_token, var_dict, sub_query)); + VariableQueryToken const empty_int_token{cIntId, "", false}; + REQUIRE(false == clp::GrepCoreTest::process_token(empty_int_token, var_dict, sub_query)); REQUIRE(false == sub_query.wildcard_match_required()); REQUIRE(0 == sub_query.get_num_possible_vars()); } @@ -512,8 +512,8 @@ TEST_CASE("process_schema_unmatched_token ", "[dfa_search]") { FakeVarDict const var_dict{make_var_dict({pair{0, "100"}})}; SubQuery sub_query; - VariableQueryToken const static_token{0, "200", false}; - REQUIRE(false == clp::GrepCoreTest::process_token(static_token, var_dict, sub_query)); + VariableQueryToken const int_token{cIntId, "200", false}; + REQUIRE(false == clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); REQUIRE(false == sub_query.wildcard_match_required()); REQUIRE(0 == sub_query.get_num_possible_vars()); } @@ -522,7 +522,7 @@ TEST_CASE("process_schema_int_token ", "[dfa_search]") { FakeVarDict const var_dict{make_var_dict({pair{0, "100"}})}; SubQuery sub_query; - VariableQueryToken const int_token{0, "100", false}; + VariableQueryToken const int_token{cIntId, "100", false}; REQUIRE(clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); REQUIRE(false == sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); @@ -595,7 +595,7 @@ 
TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token ", "[dfa_search] SECTION("interpret_as_int") { SubQuery sub_query; - VariableQueryToken const int_token{0, "1000000000000000000000000?0", true}; + VariableQueryToken const int_token{cIntId, "1000000000000000000000000?0", true}; REQUIRE(clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); REQUIRE(false == sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); @@ -610,7 +610,7 @@ TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token ", "[dfa_search] SECTION("interpret_as_float") { SubQuery sub_query; - VariableQueryToken const float_token{1, "1000000000000000000000000?0", true}; + VariableQueryToken const float_token{cFloatId, "1000000000000000000000000?0", true}; REQUIRE(clp::GrepCoreTest::process_token(float_token, var_dict, sub_query)); REQUIRE(false == sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); @@ -625,7 +625,7 @@ TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token ", "[dfa_search] SECTION("interpret_as_has_number") { SubQuery sub_query; - VariableQueryToken const has_number_token{2, "1000000000000000000000000?0", true}; + VariableQueryToken const has_number_token{cHasNumId, "1000000000000000000000000?0", true}; REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); REQUIRE(false == sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); @@ -653,7 +653,7 @@ TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { SECTION("interpret_as_non_encoded_int") { SubQuery sub_query; - VariableQueryToken const int_token{0, "10*0", true}; + VariableQueryToken const int_token{cIntId, "10*0", true}; REQUIRE(clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); REQUIRE(false == sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); @@ -668,7 +668,7 @@ 
TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { SECTION("interpret_as_non_encoded_float") { SubQuery sub_query; - VariableQueryToken const float_token{0, "10*0", true}; + VariableQueryToken const float_token{cFloatId, "10*0", true}; REQUIRE(clp::GrepCoreTest::process_token(float_token, var_dict, sub_query)); REQUIRE(false == sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); @@ -683,7 +683,7 @@ TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { SECTION("interpret_as_non_encoded_imprecise_has_number") { SubQuery sub_query; - VariableQueryToken const has_number_token{0, "10*0", true}; + VariableQueryToken const has_number_token{cHasNumId, "10*0", true}; REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); REQUIRE(false == sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); @@ -698,7 +698,7 @@ TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { SECTION("interpret_as_non_encoded_precise_has_number") { SubQuery sub_query; - VariableQueryToken const has_number_token{0, "10b*", true}; + VariableQueryToken const has_number_token{cHasNumId, "10b*", true}; REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); REQUIRE(false == sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); @@ -711,7 +711,7 @@ TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { SECTION("interpret_as_encoded_int") { SubQuery sub_query; - VariableQueryToken const int_token{0, "10*0", true}; + VariableQueryToken const int_token{cIntId, "10*0", true}; REQUIRE(clp::GrepCoreTest::process_encoded_token(int_token, var_dict, sub_query)); REQUIRE(sub_query.wildcard_match_required()); REQUIRE(0 == sub_query.get_num_possible_vars()); @@ -719,7 +719,7 @@ TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { SECTION("interpret_as_encoded_float") { SubQuery sub_query; - 
VariableQueryToken const float_token{1, "10*0", true}; + VariableQueryToken const float_token{cFloatId, "10*0", true}; REQUIRE(clp::GrepCoreTest::process_encoded_token(float_token, var_dict, sub_query)); REQUIRE(sub_query.wildcard_match_required()); REQUIRE(0 == sub_query.get_num_possible_vars()); From 42d7a1caaa54bf4b5d28ecc70b7c1d783e771c3e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 09:28:04 -0400 Subject: [PATCH 030/164] Add parentheses for bit operation clarity. --- components/core/src/clp/GrepCore.hpp | 2 +- components/core/tests/test-GrepCore.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 72b5a7c66f..195515b4f2 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -547,7 +547,7 @@ void GrepCore::generate_schema_sub_queries( for (uint64_t mask{0}; mask < num_combos; ++mask) { std::unordered_map wildcard_mask_map; for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { - wildcard_mask_map[wildcard_encodable_positions[i]] = mask >> i & 1ULL; + wildcard_mask_map[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; } auto logtype_string{generate_logtype_string(interpretation, wildcard_mask_map)}; diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 7eb0125706..5d2ef5510a 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -432,7 +432,7 @@ TEST_CASE("generate_logtype_string_for_empty_interpretation", "[dfa_search]") { for (uint64_t mask{0}; mask < num_combos; ++mask) { std::unordered_map wildcard_mask_map; for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { - wildcard_mask_map[wildcard_encodable_positions[i]] = mask >> i & 1ULL; + wildcard_mask_map[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; } auto logtype_string{
clp::GrepCoreTest::generate_logtype_string(interpretation, wildcard_mask_map) @@ -488,7 +488,7 @@ TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_sea for (uint64_t mask{0}; mask < num_combos; ++mask) { unordered_map wildcard_mask_map; for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { - wildcard_mask_map[wildcard_encodable_positions[i]] = mask >> i & 1ULL; + wildcard_mask_map[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; } logtype_strings.insert( clp::GrepCoreTest::generate_logtype_string(interpretation, wildcard_mask_map) From fae59132a7891b26039a44559c6d95c474fd06b7 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 09:54:40 -0400 Subject: [PATCH 031/164] Add docstrings for unit-test helpers. --- components/core/tests/test-GrepCore.cpp | 87 +++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 5d2ef5510a..93e5a36caf 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -54,6 +54,18 @@ constexpr uint32_t cIntId{static_cast(TokenInt)}; constexpr uint32_t cFloatId{static_cast(TokenFloat)}; constexpr uint32_t cHasNumId{111}; +/** + * Helper to expose `GrepCore` functionality for unit-testing. + * + * This class provides static wrappers around `GrepCore` methods, allowing test + * code to access internal logic such as: + * - Finding wildcard encodable positions in a `QueryInterpretation`; + * - Generating logtype strings with wildcard masks; + * - Processing variable tokens with or without encoding; + * - Generating schema-based sub-queries. + * + * All methods forward directly to `GrepCore` and are intended for testing only. 
+ */ class clp::GrepCoreTest { public: static auto get_wildcard_encodable_positions(QueryInterpretation const& interpretation) @@ -106,6 +118,11 @@ class clp::GrepCoreTest { }; namespace { +/** + * Simple helper class representing a fake variable dictionary entry for unit tests. + * + * Adheres to `VariableDictionaryEntryReq`. + */ class FakeVarEntry { public: explicit FakeVarEntry(variable_dictionary_id_t const id, string value) @@ -121,6 +138,11 @@ class FakeVarEntry { string m_value; }; +/** + * Simple helper class representing a fake variable dictionary for unit tests. + * + * Provides a method for adding entries and adheres to `VariableDictionaryReaderReq`. + */ class FakeVarDict { public: using Entry = FakeVarEntry; @@ -165,6 +187,11 @@ class FakeVarDict { unordered_map m_storage; }; +/** + * Simple helper class representing a fake logtype dictionary entry for unit tests. + * + * Adheres to `LogtypeDictionaryEntryReq`. + */ class FakeLogTypeEntry { public: FakeLogTypeEntry(string value, clp::logtype_dictionary_id_t const id) @@ -212,6 +239,11 @@ class FakeLogTypeEntry { clp::logtype_dictionary_id_t m_id{0}; }; +/** + * Simple helper class representing a fake logtype dictionary for unit tests. + * + * Provides a method for adding entries and adheres to `LogtypeDictionaryReaderReq`. + */ class FakeLogTypeDict { public: using Entry = FakeLogTypeEntry; @@ -249,16 +281,71 @@ class FakeLogTypeDict { vector m_storage; }; +/** + * @param entries Vector of (id, value) pairs to populate the variable + * dictionary. + * @return A `FakeVarDict` initialized with the given entries. + */ auto make_var_dict(vector> const& entries) -> FakeVarDict; +/** + * @param entries Vector of logtypes, where each logtype is represented by a vector of tokens. Each + * token is either a literal substring (`string_view`) or a variable placeholder (`char`). + * @return A `FakeLogtypeDict` initialized with the given entries. 
+ */ auto make_logtype_dict(vector>> const& entries) -> FakeLogTypeDict; +/** + * Constructs a `QueryInterpretation` from a vector of tokens. + * + * Each token is either: + * - a `string` representing a static substring, or + * - a `pair`, representing a variable placeholder and its value. + * + * This method automatically detects whether a variable token contains a + * wildcard (`*` or `?`). + * + * @param tokens Vector of tokens to populate the `QueryInterpretation`. + * @return A `QueryInterpretation` populated with the given tokens. + */ auto make_query_interpretation(vector>> const& tokens) -> QueryInterpretation; +/** + * Generates a logtype string from a vector of tokens. + * + * Each token is either: + * - a literal substring (`string_view`) to append directly, or + * - a variable placeholder (`char`) indicating the type of variable: + * - `i` -> integer variable; + * - `f` -> float variable; + * - `d` -> dictionary variable. + * + * The function forwards variable tokens to `EncodedVariableInterpreter` to + * append their encoded representations to the resulting string. + * + * @param tokens Vector of tokens to convert into a logtype string. + * @return A `string` representing the expected encoded logtype. + */ auto generate_expected_logtype_string(vector> const& tokens) -> string; +/** + * Checks that a `SubQuery` at a given index matches the expected properties. + * + * This method verifies: + * - Whether wildcard matching is required; + * - The number and type of variables; + * - For dictionary variables, the precise or possible dictionary IDs; + * - The set of possible logtype IDs. + * + * @param id Index of the sub-query to check in `sub_queries`. + * @param sub_queries Vector of `SubQuery` objects. + * @param wildcard_match_required Expected wildcard match requirement. + * @param vars_info Vector of tuples describing expected variable properties: (`is_dict_var`, + * `is_precise_var`, `var_dict_ids`). 
+ * @param logtype_ids Expected set of possible logtype IDs. + */ auto check_sub_query( size_t id, vector const& sub_queries, From 2bdf381149254303aa1a68422f75139c48f52be0 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 10:05:43 -0400 Subject: [PATCH 032/164] Fix some unit-tests. --- components/core/tests/test-GrepCore.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 93e5a36caf..2a38292a57 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -600,9 +600,13 @@ TEST_CASE("process_schema_unmatched_token ", "[dfa_search]") { SubQuery sub_query; VariableQueryToken const int_token{cIntId, "200", false}; - REQUIRE(false == clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); + REQUIRE(clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); REQUIRE(false == sub_query.wildcard_match_required()); - REQUIRE(0 == sub_query.get_num_possible_vars()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(false == var.is_dict_var()); + REQUIRE(var.is_precise_var()); + REQUIRE(var.get_possible_var_dict_ids().empty()); } TEST_CASE("process_schema_int_token ", "[dfa_search]") { @@ -614,9 +618,8 @@ TEST_CASE("process_schema_int_token ", "[dfa_search]") { REQUIRE(false == sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); auto const& var{sub_query.get_vars()[0]}; - REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_dict_var()); REQUIRE(var.is_precise_var()); - REQUIRE(0 == var.get_var_dict_id()); REQUIRE(var.get_possible_var_dict_ids().empty()); } From 8996cfb9a5288310584ad802b9d02ac0cbe68d0d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 1 Oct 2025 19:41:56 -0400 Subject: [PATCH 033/164] Format. 
--- components/core/src/clp/GrepCore.hpp | 6 ++++-- components/core/tests/test-GrepCore.cpp | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 76757344c9..5df3679f55 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -181,7 +181,8 @@ class GrepCore { */ template < LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, - VariableDictionaryReaderReq VariableDictionaryReaderType> + VariableDictionaryReaderReq VariableDictionaryReaderType + > static void generate_schema_sub_queries( std::set const& interpretations, @@ -533,7 +534,8 @@ GrepCore::SubQueryMatchabilityResult GrepCore::generate_logtypes_and_vars_for_su template < LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, - VariableDictionaryReaderReq VariableDictionaryReaderType> + VariableDictionaryReaderReq VariableDictionaryReaderType +> void GrepCore::generate_schema_sub_queries( std::set const& interpretations, LogTypeDictionaryReaderType const& logtype_dict, diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 2a38292a57..390b081d42 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -100,7 +100,8 @@ class clp::GrepCoreTest { template < LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, - VariableDictionaryReaderReq VariableDictionaryReaderType> + VariableDictionaryReaderReq VariableDictionaryReaderType + > static void generate_schema_sub_queries( std::set const& interpretations, LogTypeDictionaryReaderType const& logtype_dict, From 7608cab0dacd18a9e17dce5626c8f0ec2e80edd5 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 3 Oct 2025 05:55:11 -0400 Subject: [PATCH 034/164] Remove consecutive wildcards from interpretation tokens; Undo changes to string_utils. 
--- components/core/src/clp/GrepCore.cpp | 44 +++++++++++++++-- components/core/src/clp/GrepCore.hpp | 20 ++++++-- .../src/clp/string_utils/string_utils.cpp | 20 +++----- components/core/tests/test-GrepCore.cpp | 48 +++++++++++-------- components/core/tests/test-string_utils.cpp | 30 ------------ 5 files changed, 93 insertions(+), 69 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index 530f7879bf..dbcedb5f8b 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -1,6 +1,7 @@ #include "GrepCore.hpp" #include +#include #include #include #include @@ -22,6 +23,8 @@ using log_surgeon::SymbolId::TokenInt; using log_surgeon::wildcard_query_parser::QueryInterpretation; using log_surgeon::wildcard_query_parser::StaticQueryToken; using log_surgeon::wildcard_query_parser::VariableQueryToken; +using std::holds_alternative; +using std::set; using std::string; using std::unordered_map; using std::vector; @@ -147,6 +150,41 @@ bool GrepCore::get_bounds_of_next_potential_var( return (value_length != begin_pos); } +auto GrepCore::normalize_interpretations(set const& interpretations) + -> set { + set normalized_interpretations; + for (auto const& interpretation : interpretations) { + QueryInterpretation normalized_interpretation; + for (auto const& token : interpretation.get_logtype()) { + auto const& src_string{ + holds_alternative(token) + ? 
std::get(token).get_query_substring() + : std::get(token).get_query_substring() + }; + string normalized_string; + normalized_string.reserve(src_string.size()); + for (auto const c : src_string) { + if (c != '*' || normalized_string.empty() || normalized_string.back() != '*') { + normalized_string += c; + } + } + + if (holds_alternative(token)) { + auto const& variable_token{std::get(token)}; + normalized_interpretation.append_variable_token( + variable_token.get_variable_type(), + normalized_string, + variable_token.get_contains_wildcard() + ); + } else { + normalized_interpretation.append_static_token(normalized_string); + } + } + normalized_interpretations.insert(normalized_interpretation); + } + return normalized_interpretations; +} + auto GrepCore::get_wildcard_encodable_positions(QueryInterpretation const& interpretation) -> vector { auto const logtype{interpretation.get_logtype()}; @@ -155,7 +193,7 @@ auto GrepCore::get_wildcard_encodable_positions(QueryInterpretation const& inter for (size_t i{0}; i < logtype.size(); ++i) { auto const& token{logtype[i]}; - if (std::holds_alternative(token)) { + if (holds_alternative(token)) { auto const& var_token{std::get(token)}; auto const var_type{static_cast(var_token.get_variable_type())}; bool const is_int{TokenInt == var_type}; @@ -178,7 +216,7 @@ auto GrepCore::generate_logtype_string( size_t logtype_string_size{0}; auto const logtype{interpretation.get_logtype()}; for (auto const& token : logtype) { - if (std::holds_alternative(token)) { + if (holds_alternative(token)) { auto const& static_token{std::get(token)}; logtype_string_size += static_token.get_query_substring().size(); } else { @@ -190,7 +228,7 @@ auto GrepCore::generate_logtype_string( // Generate `logtype_string`. 
for (size_t i{0}; i < logtype.size(); ++i) { auto const& token{logtype[i]}; - if (std::holds_alternative(token)) { + if (holds_alternative(token)) { logtype_string += std::get(token).get_query_substring(); continue; } diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 5df3679f55..83be771716 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -140,6 +140,19 @@ class GrepCore { SubQuery& sub_query ); + /** + * Normalizes a set of interpretations by collapsing consecutive greedy wildcards ('*') within + * each token. + * + * Consecutive wildcards that span across the boundary of tokens are preserved. + * + * @param interpretations The original set of `QueryInterpretation`s to normalize. + * @return The normalized set of `QueryInterpretation`s. + */ + static auto normalize_interpretations( + std::set const& interpretations + ) -> std::set; + /** * Compare all log-surgeon interpretations against the dictionaries to determine the sub queries * to search for within the archive. @@ -183,14 +196,14 @@ class GrepCore { LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, VariableDictionaryReaderReq VariableDictionaryReaderType > - static void generate_schema_sub_queries( + static auto generate_schema_sub_queries( std::set const& interpretations, LogTypeDictionaryReaderType const& logtype_dict, VariableDictionaryReaderType const& var_dict, bool ignore_case, std::vector& sub_queries - ); + ) -> void; /** * Scans the interpretation and returns the indices of all encodable wildcard variables. @@ -353,8 +366,9 @@ std::optional GrepCore::process_raw_query( // TODO: Optimize such that interpretations are only generated once per schema. 
log_surgeon::wildcard_query_parser::Query const query{search_string}; auto const interpretations{query.get_all_multi_token_interpretations(lexer)}; + auto const normalized_interpretations{normalize_interpretations(interpretations)}; generate_schema_sub_queries( - interpretations, + normalized_interpretations, logtype_dict, var_dict, ignore_case, diff --git a/components/core/src/clp/string_utils/string_utils.cpp b/components/core/src/clp/string_utils/string_utils.cpp index 854312a587..adf903ab73 100644 --- a/components/core/src/clp/string_utils/string_utils.cpp +++ b/components/core/src/clp/string_utils/string_utils.cpp @@ -14,13 +14,11 @@ namespace { * Helper for ``wildcard_match_unsafe_case_sensitive`` to advance the pointer in * tame to the next character which matches wild. This method should be inlined * for performance. - * - * This method assumes that `wild_current` has no duplicate greedy wildcards ('*'). - * * @param tame_current * @param tame_bookmark * @param tame_end * @param wild_current + * @param wild_bookmark * @return true on success, false if wild cannot match tame */ inline bool advance_tame_to_next_match( @@ -260,12 +258,10 @@ bool wildcard_match_unsafe_case_sensitive(string_view tame, string_view wild) { while (true) { w = *wild_current; if ('*' == w) { - while ('*' == *wild_current) { - ++wild_current; - if (wild_end == wild_current) { - // Trailing '*' means everything remaining in tame will match - return true; - } + ++wild_current; + if (wild_end == wild_current) { + // Trailing '*' means everything remaining in tame will match + return true; } // Set wild and tame bookmarks @@ -313,10 +309,8 @@ bool wildcard_match_unsafe_case_sensitive(string_view tame, string_view wild) { // Handle reaching the end of tame or wild if (tame_end == tame_current) { - while (wild_end != wild_current && '*' == *wild_current) { - ++wild_current; - } - return wild_end == wild_current; + return (wild_end == wild_current + || ('*' == *wild_current && (wild_current 
+ 1) == wild_end)); } else { if (wild_end == wild_current) { if (nullptr == wild_bookmark) { diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 390b081d42..a15da89b05 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -68,6 +68,30 @@ constexpr uint32_t cHasNumId{111}; */ class clp::GrepCoreTest { public: + static auto normalize_interpretations(set const& interpretations) + -> set { + return GrepCore::normalize_interpretations(interpretations); + } + + template < + LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, + VariableDictionaryReaderReq VariableDictionaryReaderType + > + static auto generate_schema_sub_queries( + std::set const& interpretations, + LogTypeDictionaryReaderType const& logtype_dict, + VariableDictionaryReaderType const& var_dict, + std::vector& sub_queries + ) -> void { + GrepCore::generate_schema_sub_queries( + interpretations, + logtype_dict, + var_dict, + false, + sub_queries + ); + } + static auto get_wildcard_encodable_positions(QueryInterpretation const& interpretation) -> vector { return GrepCore::get_wildcard_encodable_positions(interpretation); @@ -97,25 +121,6 @@ class clp::GrepCoreTest { ) -> bool { return GrepCore::process_schema_var_token(var_token, var_dict, false, true, sub_query); } - - template < - LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, - VariableDictionaryReaderReq VariableDictionaryReaderType - > - static void generate_schema_sub_queries( - std::set const& interpretations, - LogTypeDictionaryReaderType const& logtype_dict, - VariableDictionaryReaderType const& var_dict, - std::vector& sub_queries - ) { - GrepCore::generate_schema_sub_queries( - interpretations, - logtype_dict, - var_dict, - false, - sub_queries - ); - } }; namespace { @@ -894,10 +899,13 @@ TEST_CASE("generate_schema_sub_queries_with_wildcard_duplication", "[dfa_search] for (auto const& raw_interpretation : raw_interpretations) { 
interpretations.insert(make_query_interpretation(raw_interpretation)); } + auto const normalized_interpretations{ + clp::GrepCoreTest::normalize_interpretations(interpretations) + }; vector sub_queries; clp::GrepCoreTest::generate_schema_sub_queries( - interpretations, + normalized_interpretations, logtype_dict, var_dict, sub_queries diff --git a/components/core/tests/test-string_utils.cpp b/components/core/tests/test-string_utils.cpp index 86a32a9397..e168a704c2 100644 --- a/components/core/tests/test-string_utils.cpp +++ b/components/core/tests/test-string_utils.cpp @@ -145,60 +145,30 @@ SCENARIO("Test case sensitive wild card match in all possible ways", "[wildcard] REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); } - GIVEN("Double wild with no suffix char") { - tameString = "abcd", wildString = "a**"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - GIVEN("Single wild with no prefix char") { tameString = "abcd", wildString = "*d"; REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); } - GIVEN("Double wild with no prefix char") { - tameString = "abcd", wildString = "**d"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - GIVEN("Single wild on both side & has 1st char as literal") { tameString = "abcd", wildString = "*a*"; REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); } - GIVEN("Double wild on both side & has 1st char as literal") { - tameString = "abcd", wildString = "**a**"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - GIVEN("Single wild on both side & has middle char as literal") { tameString = "abcd", wildString = "*b*"; REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); } - GIVEN("Double wild on both side & has middle char as literal") { - tameString = "abcd", wildString = "**b**"; - 
REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - GIVEN("Single wild on both side & has last char as literal") { tameString = "abcd", wildString = "*d*"; REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); } - GIVEN("Double wild on both side & has last char as literal") { - tameString = "abcd", wildString = "**d**"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } - GIVEN("Single wild only") { tameString = "abcd", wildString = "*"; REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); } - - GIVEN("Double wild only") { - tameString = "abcd", wildString = "**"; - REQUIRE(wildcard_match_unsafe_case_sensitive(tameString, wildString) == true); - } } WHEN("Match is expected if Wild card character is \"?\"") { From a130b96a997779a2a87666810288d9f92a9b7a46 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 3 Oct 2025 05:59:15 -0400 Subject: [PATCH 035/164] Format. --- components/core/src/clp/GrepCore.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index dbcedb5f8b..d0ce438e40 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -158,8 +158,8 @@ auto GrepCore::normalize_interpretations(set const& interpr for (auto const& token : interpretation.get_logtype()) { auto const& src_string{ holds_alternative(token) - ? std::get(token).get_query_substring() - : std::get(token).get_query_substring() + ? std::get(token).get_query_substring() + : std::get(token).get_query_substring() }; string normalized_string; normalized_string.reserve(src_string.size()); From f2f01d6fb3bed3dde399707686e0661e473d145e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 3 Oct 2025 06:21:04 -0400 Subject: [PATCH 036/164] Add missing headers. 
--- components/core/src/clp/GrepCore.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 83be771716..a9845f6630 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -9,8 +9,10 @@ #include #include #include +#include #include +#include #include #include #include From 0eed0dbb7ee2d5d3de0c65c03f76380e7fbdc148 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 3 Oct 2025 06:22:39 -0400 Subject: [PATCH 037/164] Improve docstring. --- components/core/src/clp/GrepCore.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index a9845f6630..238f6f89cf 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -165,7 +165,7 @@ class GrepCore { * - 1: treat as an encoded variable (\i for integers, \f for floats) * * If there are k encodable wildcard variables, then 2^k logtype strings are possible. As a - * result we limit k <= 16. We represents these alternatives using a bit mask. + * result we limit k <= 16. We represent these alternatives using a bitmask. * * Example: * Search query: "a *1 *2 b", @@ -190,7 +190,7 @@ class GrepCore { * @param interpretations Log-surgeon's interpretations of the search query. * @param logtype_dict The logtype dictionary. * @param var_dict The variable dictionary. - * @param ignore_case Flag indicating if search is case sensitive. + * @param ignore_case If true, perform a case-insensitive search. * @param sub_queries Returns the subqueries to compare against CLP's archives. * @throw std::runtime_error If there are too many candidate combinations. */ From e2c55a7664ade7bef2c37d6de75c2b4bf9480df4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 3 Oct 2025 06:24:33 -0400 Subject: [PATCH 038/164] Use find over contains to avoid double lookup. 
--- components/core/tests/test-GrepCore.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index a15da89b05..3b2a778a9d 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -160,8 +160,9 @@ class FakeVarDict { [[nodiscard]] auto get_value(dictionary_id_t const id) const -> string const& { static string const cEmpty{}; - if (m_storage.contains(id)) { - return m_storage.at(id).get_value(); + auto const it{m_storage.find(id)}; + if (m_storage.end() != it) { + return it->second.get_value(); } return cEmpty; } From 1cbbefdee975e40109c1b979d7316ec1673cc84a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 3 Oct 2025 07:25:52 -0400 Subject: [PATCH 039/164] Fix type in test to use variable_dictionary_id_t. --- components/core/tests/test-GrepCore.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 3b2a778a9d..8ed396163b 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -28,6 +28,7 @@ using clp::EncodedVariableInterpreter; using clp::GrepCore; +using clp::logtype_dictionary_id_t; using clp::LogTypeDictionaryReaderReq; using clp::string_utils::wildcard_match_unsafe_case_sensitive; using clp::SubQuery; @@ -201,7 +202,7 @@ class FakeVarDict { */ class FakeLogTypeEntry { public: - FakeLogTypeEntry(string value, clp::logtype_dictionary_id_t const id) + FakeLogTypeEntry(string value, logtype_dictionary_id_t const id) : m_value(std::move(value)), m_id(id) {} @@ -239,11 +240,11 @@ class FakeLogTypeEntry { return SIZE_MAX; } - [[nodiscard]] auto get_id() const -> clp::logtype_dictionary_id_t { return m_id; } + [[nodiscard]] auto get_id() const -> logtype_dictionary_id_t { return m_id; } private: string m_value; - clp::logtype_dictionary_id_t m_id{0}; 
+ logtype_dictionary_id_t m_id{0}; }; /** @@ -254,7 +255,7 @@ class FakeLogTypeEntry { class FakeLogTypeDict { public: using Entry = FakeLogTypeEntry; - using dictionary_id_t = clp::logtype_dictionary_id_t; + using dictionary_id_t = logtype_dictionary_id_t; auto add_entry(string const& value, dictionary_id_t id) -> void { m_storage.emplace_back(value, id); @@ -357,8 +358,8 @@ auto check_sub_query( size_t id, vector const& sub_queries, bool wildcard_match_required, - vector>> const& vars_info, - unordered_set const& logtype_ids + vector>> const& vars_info, + unordered_set const& logtype_ids ) -> void; /** @@ -380,7 +381,7 @@ auto make_var_dict(vector> const& entries) -> FakeVarDict { auto make_logtype_dict(vector>> const& entries) -> FakeLogTypeDict { FakeLogTypeDict dict; - clp::logtype_dictionary_id_t id{0}; + logtype_dictionary_id_t id{0}; for (auto const& entry : entries) { dict.add_entry(generate_expected_logtype_string(entry), id++); } @@ -430,8 +431,8 @@ auto check_sub_query( size_t id, vector const& sub_queries, bool const wildcard_match_required, - vector>> const& vars_info, - unordered_set const& logtype_ids + vector>> const& vars_info, + unordered_set const& logtype_ids ) -> void { CAPTURE(id); auto const& sub_query{sub_queries[id]}; From f3ab0a5794fcd9f47ac917c24da842cd1bfddb41 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 3 Oct 2025 07:27:33 -0400 Subject: [PATCH 040/164] Update docstring for consistency. --- components/core/src/clp/GrepCore.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 238f6f89cf..7bd325cc6a 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -249,7 +249,7 @@ class GrepCore { * @tparam VariableDictionaryReaderType Variable dictionary reader type. * @param variable_token The variable token to process. * @param var_dict The variable dictionary. 
- * @param ignore_case If the search is case sensitive. + * @param ignore_case If true, perform a case-insensitive search. * @param is_wildcard_mask_encoded If the token is an encodable wildcard and is to be encoded. * @param sub_query Returns the updated sub query object. * @return True if the variable is encoded or is in the variable dictionary, false otherwise. From ddf3e232ee836f61a7753deefa99eee40a34a617 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 3 Oct 2025 11:00:32 -0400 Subject: [PATCH 041/164] Fix macos ULL error. --- components/core/tests/test-GrepCore.cpp | 53 +++++++++++++------------ 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 8ed396163b..e5a442ae2d 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -826,7 +826,7 @@ TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { // Tests: `generate_schema_sub_queries` TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { - FakeVarDict const var_dict{make_var_dict({pair{0, "10a"}, pair{1, "1a3"}})}; + FakeVarDict const var_dict{make_var_dict({pair{0, "1a3"}, pair{1, "10a"}, pair{2, "10b"}})}; FakeLogTypeDict const logtype_dict{make_logtype_dict( {{"text ", 'i', " ", 'i', " ", 'f'}, {"text ", 'i', " ", 'd', " ", 'f'}, @@ -861,21 +861,22 @@ TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { sub_queries ); - using Var = tuple>; REQUIRE(6 == sub_queries.size()); size_t i{0}; + tuple> const wild_int{false, true, {}}; + tuple> const wild_has_num{true, false, {1LL, 2LL}}; // NOTE: sub queries 0 and 2 are a duplicate of 3 and 5 because we use a vector instead of a set // when storing `m_sub_queries` in `Query`. 
- check_sub_query(i++, sub_queries, true, {Var{false, true, {}}, Var{true, true, {0}}}, {1}); - check_sub_query(i++, sub_queries, true, {Var{false, true, {}}}, {0}); - check_sub_query(i++, sub_queries, false, {Var{false, true, {}}, Var{true, true, {0}}}, {2, 3}); - check_sub_query(i++, sub_queries, true, {Var{false, true, {}}, Var{true, true, {0}}}, {1}); - check_sub_query(i++, sub_queries, false, {Var{false, true, {}}, Var{true, true, {0}}}, {2, 3}); - check_sub_query(i++, sub_queries, true, {Var{false, true, {}}}, {5}); + check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); + check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); + check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); + check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); } TEST_CASE("generate_schema_sub_queries_with_wildcard_duplication", "[dfa_search]") { - FakeVarDict const var_dict{make_var_dict({pair{0, "10a"}, pair{1, "1a3"}})}; + FakeVarDict const var_dict{make_var_dict({pair{0, "1a3"}, pair{1, "10a"}})}; FakeLogTypeDict const logtype_dict{make_logtype_dict( {{"text ", 'i', " ", 'i', " ", 'f'}, {"text ", 'i', " ", 'd', " ", 'f'}, @@ -913,15 +914,16 @@ TEST_CASE("generate_schema_sub_queries_with_wildcard_duplication", "[dfa_search] sub_queries ); - using Var = tuple>; + tuple> const wild_int{false, true, {}}; + tuple> const wild_has_num{true, true, {1LL}}; REQUIRE(6 == sub_queries.size()); size_t i{0}; - check_sub_query(i++, sub_queries, true, {Var{false, true, {}}, Var{true, true, {0}}}, {1}); - check_sub_query(i++, sub_queries, true, {Var{false, true, {}}}, {0}); - check_sub_query(i++, sub_queries, false, {Var{false, true, {}}, Var{true, true, {0}}}, {2, 3}); - check_sub_query(i++, sub_queries, true, {Var{false, true, {}}, Var{true, true, {0}}}, {1}); - check_sub_query(i++, 
sub_queries, false, {Var{false, true, {}}, Var{true, true, {0}}}, {2, 3}); - check_sub_query(i++, sub_queries, true, {Var{false, true, {}}}, {5}); + check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); + check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); + check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); + check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); } // Tests: `process_raw_query` @@ -930,7 +932,7 @@ TEST_CASE("process_raw_query", "[dfa_search]") { {{R"(int:(\d+))"}, {R"(float:(\d+\.\d+))"}, {R"(hasNumber:[^ $]*\d+[^ $]*)"}} )}; - FakeVarDict const var_dict{make_var_dict({pair{0, "10a"}, pair{1, "1a3"}})}; + FakeVarDict const var_dict{make_var_dict({pair{0, "1a3"}, pair{1, "10a"}})}; FakeLogTypeDict const logtype_dict{make_logtype_dict( {{"text ", 'i', " ", 'i', " ", 'f'}, {"text ", 'i', " ", 'd', " ", 'f'}, @@ -947,17 +949,18 @@ TEST_CASE("process_raw_query", "[dfa_search]") { }; REQUIRE(query.has_value()); - - using Var = tuple>; auto const& sub_queries{query.value().get_sub_queries()}; + + tuple> const wild_int{false, true, {}}; + tuple> const wild_has_num{true, true, {1LL}}; REQUIRE(6 == sub_queries.size()); size_t i{0}; - check_sub_query(i++, sub_queries, true, {Var{false, true, {}}, Var{true, true, {0}}}, {1}); - check_sub_query(i++, sub_queries, true, {Var{false, true, {}}}, {0}); - check_sub_query(i++, sub_queries, false, {Var{false, true, {}}, Var{true, true, {0}}}, {2, 3}); - check_sub_query(i++, sub_queries, true, {Var{false, true, {}}, Var{true, true, {0}}}, {1}); - check_sub_query(i++, sub_queries, false, {Var{false, true, {}}, Var{true, true, {0}}}, {2, 3}); - check_sub_query(i++, sub_queries, true, {Var{false, true, {}}}, {5}); + check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); + 
check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); + check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); + check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); + check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); } // Tests: `get_bounds_of_next_potential_var` From d629ae9bcb537cad28362ad93474b8cd60bf2b05 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 3 Oct 2025 12:08:52 -0400 Subject: [PATCH 042/164] Replace map with bool vector. --- components/core/src/clp/GrepCore.cpp | 16 +++--- components/core/src/clp/GrepCore.hpp | 39 +++++++++------ components/core/tests/test-GrepCore.cpp | 65 +++++++++++++------------ 3 files changed, 68 insertions(+), 52 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index d0ce438e40..1c56f19473 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -208,9 +208,10 @@ auto GrepCore::get_wildcard_encodable_positions(QueryInterpretation const& inter auto GrepCore::generate_logtype_string( QueryInterpretation const& interpretation, - unordered_map const& wildcard_mask_map -) -> std::string { - std::string logtype_string; + vector const& wildcard_encodable_positions, + vector const& mask_encoded_flags +) -> string { + string logtype_string; // Reserve size for `logtype_string`. 
size_t logtype_string_size{0}; @@ -239,9 +240,12 @@ auto GrepCore::generate_logtype_string( bool const is_int{TokenInt == var_type}; bool const is_float{TokenFloat == var_type}; - auto const it{wildcard_mask_map.find(i)}; - if (wildcard_mask_map.end() != it) { - if (it->second) { + if (wildcard_encodable_positions.end() != std::ranges::find( + wildcard_encodable_positions.begin(), + wildcard_encodable_positions.end(), + i + )) { + if (mask_encoded_flags[i]) { if (is_int) { EncodedVariableInterpreter::add_int_var(logtype_string); } else { diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 7bd325cc6a..c996b0554f 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -228,12 +228,14 @@ class GrepCore { * - 1: Treat as an encoded variable. * * @param interpretation The interpretation to convert to a logtype string. - * @param wildcard_mask_map A map indicating the state of encodable wildcard variables. + * @param wildcard_encodable_positions A vector of positions of encodable wildcard variables. + * @param mask_encoded_flags A vector indicating if a variables is mask encoded. * @return The logtype string corresponding to this combination of encoded variables. */ static auto generate_logtype_string( log_surgeon::wildcard_query_parser::QueryInterpretation const& interpretation, - std::unordered_map const& wildcard_mask_map + std::vector const& wildcard_encodable_positions, + std::vector const& mask_encoded_flags ) -> std::string; /** @@ -250,7 +252,7 @@ class GrepCore { * @param variable_token The variable token to process. * @param var_dict The variable dictionary. * @param ignore_case If true, perform a case-insensitive search. - * @param is_wildcard_mask_encoded If the token is an encodable wildcard and is to be encoded. + * @param is_mask_encoded If the token is an encodable wildcard and is to be encoded. * @param sub_query Returns the updated sub query object. 
* @return True if the variable is encoded or is in the variable dictionary, false otherwise. */ @@ -259,7 +261,7 @@ class GrepCore { log_surgeon::wildcard_query_parser::VariableQueryToken const& variable_token, VariableDictionaryReaderType const& var_dict, bool ignore_case, - bool is_wildcard_mask_encoded, + bool is_mask_encoded, SubQuery& sub_query ) -> bool; }; @@ -561,18 +563,23 @@ void GrepCore::generate_schema_sub_queries( ) { constexpr size_t cMaxEncodableWildcardVariables{16}; for (auto const& interpretation : interpretations) { + auto const logtype{interpretation.get_logtype()}; auto wildcard_encodable_positions{get_wildcard_encodable_positions(interpretation)}; if (wildcard_encodable_positions.size() > cMaxEncodableWildcardVariables) { throw std::runtime_error("Too many encodable variables."); } uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; for (uint64_t mask{0}; mask < num_combos; ++mask) { - std::unordered_map wildcard_mask_map; + std::vector mask_encoded_flags(logtype.size(), false); for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { - wildcard_mask_map[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; + mask_encoded_flags[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; } - auto logtype_string{generate_logtype_string(interpretation, wildcard_mask_map)}; + auto logtype_string{generate_logtype_string( + interpretation, + wildcard_encodable_positions, + mask_encoded_flags) + }; std::unordered_set logtype_entries; logtype_dict.get_entries_matching_wildcard_string( @@ -586,24 +593,26 @@ void GrepCore::generate_schema_sub_queries( SubQuery sub_query; bool has_vars{true}; - auto const logtype{interpretation.get_logtype()}; for (size_t i{0}; i < logtype.size(); ++i) { auto const& token{logtype[i]}; if (std::holds_alternative( token )) { - bool is_wildcard_mask_encoded{false}; - auto const it{wildcard_mask_map.find(i)}; - if (wildcard_mask_map.end() != it) { - is_wildcard_mask_encoded = it->second; + bool 
is_mask_encoded{false}; + if (wildcard_encodable_positions.end() != std::ranges::find( + wildcard_encodable_positions.begin(), + wildcard_encodable_positions.end(), + i + )) { + is_mask_encoded = mask_encoded_flags[i]; } has_vars = process_schema_var_token( std::get(token), var_dict, ignore_case, - is_wildcard_mask_encoded, + is_mask_encoded, sub_query ); } @@ -631,7 +640,7 @@ auto GrepCore::process_schema_var_token( log_surgeon::wildcard_query_parser::VariableQueryToken const& variable_token, VariableDictionaryReaderType const& var_dict, bool const ignore_case, - bool const is_wildcard_mask_encoded, + bool const is_mask_encoded, SubQuery& sub_query ) -> bool { auto const& raw_string{variable_token.get_query_substring()}; @@ -640,7 +649,7 @@ auto GrepCore::process_schema_var_token( bool const is_int{log_surgeon::SymbolId::TokenInt == var_type}; bool const is_float{log_surgeon::SymbolId::TokenFloat == var_type}; - if (is_wildcard_mask_encoded) { + if (is_mask_encoded) { sub_query.mark_wildcard_match_required(); return true; } diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index e5a442ae2d..d9cedf18b4 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -79,10 +79,10 @@ class clp::GrepCoreTest { VariableDictionaryReaderReq VariableDictionaryReaderType > static auto generate_schema_sub_queries( - std::set const& interpretations, + set const& interpretations, LogTypeDictionaryReaderType const& logtype_dict, VariableDictionaryReaderType const& var_dict, - std::vector& sub_queries + vector& sub_queries ) -> void { GrepCore::generate_schema_sub_queries( interpretations, @@ -100,9 +100,14 @@ class clp::GrepCoreTest { static auto generate_logtype_string( QueryInterpretation const& interpretation, - unordered_map const& wildcard_mask_map + vector const& wildcard_encodable_positions, + vector const& mask_encoded_flags ) -> string { - return 
GrepCore::generate_logtype_string(interpretation, wildcard_mask_map); + return GrepCore::generate_logtype_string( + interpretation, + wildcard_encodable_positions, + mask_encoded_flags + ); } template @@ -522,18 +527,14 @@ TEST_CASE("generate_logtype_string_for_empty_interpretation", "[dfa_search]") { auto const wildcard_encodable_positions{ clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) }; - uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; - REQUIRE(1 == num_combos); - for (uint64_t mask{0}; mask < num_combos; ++mask) { - std::unordered_map wildcard_mask_map; - for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { - wildcard_mask_map[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; - } - auto logtype_string{ - clp::GrepCoreTest::generate_logtype_string(interpretation, wildcard_mask_map) - }; - REQUIRE(logtype_string.empty()); - } + + REQUIRE(wildcard_encodable_positions.empty()); + auto const logtype_string{clp::GrepCoreTest::generate_logtype_string( + interpretation, + wildcard_encodable_positions, + {false} + )}; + REQUIRE(logtype_string.empty()); } TEST_CASE("generate_logtype_string_for_single_variable_interpretation", "[dfa_search]") { @@ -546,22 +547,22 @@ TEST_CASE("generate_logtype_string_for_single_variable_interpretation", "[dfa_se auto const wildcard_encodable_positions{ clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) }; - uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; - REQUIRE(1 == num_combos); - std::unordered_map const wildcard_mask_map{}; - auto logtype_string{ - clp::GrepCoreTest::generate_logtype_string(interpretation, wildcard_mask_map) - }; + REQUIRE(wildcard_encodable_positions.empty()); + auto const logtype_string{clp::GrepCoreTest::generate_logtype_string( + interpretation, + wildcard_encodable_positions, + {false} + )}; REQUIRE(expected_logtype_string == logtype_string); } 
TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_search]") { unordered_set const expected_logtype_strings{ - {{generate_expected_logtype_string({"text", 'i', 'f', 'd', 'd', 'd'})}, - {generate_expected_logtype_string({"text", 'i', 'f', 'i', 'd', 'd'})}, - {generate_expected_logtype_string({"text", 'i', 'f', 'd', 'f', 'd'})}, - {generate_expected_logtype_string({"text", 'i', 'f', 'i', 'f', 'd'})}} + generate_expected_logtype_string({"text", 'i', 'f', 'd', 'd', 'd'}), + generate_expected_logtype_string({"text", 'i', 'f', 'i', 'd', 'd'}), + generate_expected_logtype_string({"text", 'i', 'f', 'd', 'f', 'd'}), + generate_expected_logtype_string({"text", 'i', 'f', 'i', 'f', 'd'}) }; auto const interpretation{make_query_interpretation( @@ -581,13 +582,15 @@ TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_sea REQUIRE(num_combos == 4); unordered_set logtype_strings; for (uint64_t mask{0}; mask < num_combos; ++mask) { - unordered_map wildcard_mask_map; + vector mask_encoded_flags(interpretation.get_logtype().size(), false); for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { - wildcard_mask_map[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; + mask_encoded_flags[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; } - logtype_strings.insert( - clp::GrepCoreTest::generate_logtype_string(interpretation, wildcard_mask_map) - ); + logtype_strings.insert(clp::GrepCoreTest::generate_logtype_string( + interpretation, + wildcard_encodable_positions, + mask_encoded_flags + )); } REQUIRE(expected_logtype_strings == logtype_strings); } From 9e3e9d9af7ab53db5dab00ed7c0b4a05821f4b0a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 3 Oct 2025 12:20:43 -0400 Subject: [PATCH 043/164] Fix type mismatch. 
--- components/core/tests/test-GrepCore.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index d9cedf18b4..15bcaa6ad5 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -51,6 +51,8 @@ using std::unordered_set; using std::variant; using std::vector; +using VarInfo = tuple>; + constexpr uint32_t cIntId{static_cast(TokenInt)}; constexpr uint32_t cFloatId{static_cast(TokenFloat)}; constexpr uint32_t cHasNumId{111}; @@ -363,7 +365,7 @@ auto check_sub_query( size_t id, vector const& sub_queries, bool wildcard_match_required, - vector>> const& vars_info, + vector const& vars_info, unordered_set const& logtype_ids ) -> void; @@ -436,7 +438,7 @@ auto check_sub_query( size_t id, vector const& sub_queries, bool const wildcard_match_required, - vector>> const& vars_info, + vector const& vars_info, unordered_set const& logtype_ids ) -> void { CAPTURE(id); @@ -866,8 +868,8 @@ TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { REQUIRE(6 == sub_queries.size()); size_t i{0}; - tuple> const wild_int{false, true, {}}; - tuple> const wild_has_num{true, false, {1LL, 2LL}}; + VarInfo const wild_int{false, true, {}}; + VarInfo const wild_has_num{true, false, {1LL, 2LL}}; // NOTE: sub queries 0 and 2 are a duplicate of 3 and 5 because we use a vector instead of a set // when storing `m_sub_queries` in `Query`. 
check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); @@ -917,8 +919,8 @@ TEST_CASE("generate_schema_sub_queries_with_wildcard_duplication", "[dfa_search] sub_queries ); - tuple> const wild_int{false, true, {}}; - tuple> const wild_has_num{true, true, {1LL}}; + VarInfo const wild_int{false, true, {}}; + VarInfo const wild_has_num{true, true, {1LL}}; REQUIRE(6 == sub_queries.size()); size_t i{0}; check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); @@ -954,8 +956,8 @@ TEST_CASE("process_raw_query", "[dfa_search]") { REQUIRE(query.has_value()); auto const& sub_queries{query.value().get_sub_queries()}; - tuple> const wild_int{false, true, {}}; - tuple> const wild_has_num{true, true, {1LL}}; + VarInfo const wild_int{false, true, {}}; + VarInfo const wild_has_num{true, true, {1LL}}; REQUIRE(6 == sub_queries.size()); size_t i{0}; check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); From 06ab6a6dff4cdbeba35f7cc541a5a64e9261accf Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 3 Oct 2025 12:22:11 -0400 Subject: [PATCH 044/164] Add missing header; Remove unused header. --- components/core/src/clp/GrepCore.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index c996b0554f..c696a3416a 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -1,13 +1,13 @@ #ifndef CLP_GREPCORE_HPP #define CLP_GREPCORE_HPP +#include #include #include #include #include #include #include -#include #include #include #include From ad951ae2295171a82ec27815bfed9b0f9bda1b1c Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 3 Oct 2025 12:28:01 -0400 Subject: [PATCH 045/164] Add missing header. 
--- components/core/src/clp/GrepCore.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index 1c56f19473..b0392cff2e 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -1,5 +1,6 @@ #include "GrepCore.hpp" +#include #include #include #include From f3d27b7c309242fc26dd54d45a1f9808ededeb21 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 3 Oct 2025 12:29:32 -0400 Subject: [PATCH 046/164] Format. --- components/core/src/clp/GrepCore.cpp | 12 +++++++----- components/core/src/clp/GrepCore.hpp | 16 +++++++++------- components/core/tests/test-GrepCore.cpp | 20 +++++++++++--------- 3 files changed, 27 insertions(+), 21 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index b0392cff2e..8c4e225955 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -241,11 +241,13 @@ auto GrepCore::generate_logtype_string( bool const is_int{TokenInt == var_type}; bool const is_float{TokenFloat == var_type}; - if (wildcard_encodable_positions.end() != std::ranges::find( - wildcard_encodable_positions.begin(), - wildcard_encodable_positions.end(), - i - )) { + if (wildcard_encodable_positions.end() + != std::ranges::find( + wildcard_encodable_positions.begin(), + wildcard_encodable_positions.end(), + i + )) + { if (mask_encoded_flags[i]) { if (is_int) { EncodedVariableInterpreter::add_int_var(logtype_string); diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index c696a3416a..d398478868 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -578,8 +578,8 @@ void GrepCore::generate_schema_sub_queries( auto logtype_string{generate_logtype_string( interpretation, wildcard_encodable_positions, - mask_encoded_flags) - }; + mask_encoded_flags + )}; std::unordered_set logtype_entries; 
logtype_dict.get_entries_matching_wildcard_string( @@ -600,11 +600,13 @@ void GrepCore::generate_schema_sub_queries( )) { bool is_mask_encoded{false}; - if (wildcard_encodable_positions.end() != std::ranges::find( - wildcard_encodable_positions.begin(), - wildcard_encodable_positions.end(), - i - )) { + if (wildcard_encodable_positions.end() + != std::ranges::find( + wildcard_encodable_positions.begin(), + wildcard_encodable_positions.end(), + i + )) + { is_mask_encoded = mask_encoded_flags[i]; } diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 15bcaa6ad5..0d262570a1 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -561,10 +561,10 @@ TEST_CASE("generate_logtype_string_for_single_variable_interpretation", "[dfa_se TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_search]") { unordered_set const expected_logtype_strings{ - generate_expected_logtype_string({"text", 'i', 'f', 'd', 'd', 'd'}), - generate_expected_logtype_string({"text", 'i', 'f', 'i', 'd', 'd'}), - generate_expected_logtype_string({"text", 'i', 'f', 'd', 'f', 'd'}), - generate_expected_logtype_string({"text", 'i', 'f', 'i', 'f', 'd'}) + generate_expected_logtype_string({"text", 'i', 'f', 'd', 'd', 'd'}), + generate_expected_logtype_string({"text", 'i', 'f', 'i', 'd', 'd'}), + generate_expected_logtype_string({"text", 'i', 'f', 'd', 'f', 'd'}), + generate_expected_logtype_string({"text", 'i', 'f', 'i', 'f', 'd'}) }; auto const interpretation{make_query_interpretation( @@ -588,11 +588,13 @@ TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_sea for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { mask_encoded_flags[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; } - logtype_strings.insert(clp::GrepCoreTest::generate_logtype_string( - interpretation, - wildcard_encodable_positions, - mask_encoded_flags - )); + 
logtype_strings.insert( + clp::GrepCoreTest::generate_logtype_string( + interpretation, + wildcard_encodable_positions, + mask_encoded_flags + ) + ); } REQUIRE(expected_logtype_strings == logtype_strings); } From 9920bc95817d3fe56c872462dbb4ee6bb6e78aca Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 4 Oct 2025 10:16:29 -0400 Subject: [PATCH 047/164] Remove duplicate sub queries. --- components/core/src/clp/GrepCore.hpp | 4 +++- components/core/src/clp/Query.hpp | 4 ++++ components/core/tests/test-GrepCore.cpp | 16 ++++------------ 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index d398478868..b85b172e88 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -632,7 +632,9 @@ void GrepCore::generate_schema_sub_queries( possible_logtype_ids.emplace(entry->get_id()); } sub_query.set_possible_logtypes(possible_logtype_ids); - sub_queries.push_back(std::move(sub_query)); + if (sub_queries.end() == std::ranges::find(sub_queries, sub_query)) { + sub_queries.push_back(std::move(sub_query)); + } } } } diff --git a/components/core/src/clp/Query.hpp b/components/core/src/clp/Query.hpp index 021868e2ec..5531b60078 100644 --- a/components/core/src/clp/Query.hpp +++ b/components/core/src/clp/Query.hpp @@ -25,6 +25,8 @@ class QueryVar { ); // Methods + auto operator==(const QueryVar& lhs) const -> bool = default; + /** * Checks if the given encoded variable matches this QueryVar * @param var @@ -75,6 +77,8 @@ class QueryVar { class SubQuery { public: // Methods + auto operator==(const SubQuery& lhs) const -> bool = default; + /** * Adds a precise non-dictionary variable to the subquery * @param precise_non_dict_var diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 0d262570a1..3b4733cb69 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ 
-868,17 +868,13 @@ TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { sub_queries ); - REQUIRE(6 == sub_queries.size()); - size_t i{0}; VarInfo const wild_int{false, true, {}}; VarInfo const wild_has_num{true, false, {1LL, 2LL}}; - // NOTE: sub queries 0 and 2 are a duplicate of 3 and 5 because we use a vector instead of a set - // when storing `m_sub_queries` in `Query`. + REQUIRE(4 == sub_queries.size()); + size_t i{0}; check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); - check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); - check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); } @@ -923,13 +919,11 @@ TEST_CASE("generate_schema_sub_queries_with_wildcard_duplication", "[dfa_search] VarInfo const wild_int{false, true, {}}; VarInfo const wild_has_num{true, true, {1LL}}; - REQUIRE(6 == sub_queries.size()); + REQUIRE(4 == sub_queries.size()); size_t i{0}; check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); - check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); - check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); } @@ -960,13 +954,11 @@ TEST_CASE("process_raw_query", "[dfa_search]") { VarInfo const wild_int{false, true, {}}; VarInfo const wild_has_num{true, true, {1LL}}; - REQUIRE(6 == sub_queries.size()); + REQUIRE(4 == sub_queries.size()); size_t i{0}; check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); check_sub_query(i++, sub_queries, false, 
{wild_int, wild_has_num}, {2LL, 3LL}); - check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); - check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); } From e7dcc43a51fb986e04dd6606945fbe0a7407783e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 4 Oct 2025 10:53:47 -0400 Subject: [PATCH 048/164] Format. --- components/core/src/clp/Query.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/Query.hpp b/components/core/src/clp/Query.hpp index 5531b60078..9e902fba33 100644 --- a/components/core/src/clp/Query.hpp +++ b/components/core/src/clp/Query.hpp @@ -25,7 +25,7 @@ class QueryVar { ); // Methods - auto operator==(const QueryVar& lhs) const -> bool = default; + auto operator==(QueryVar const& lhs) const -> bool = default; /** * Checks if the given encoded variable matches this QueryVar @@ -77,7 +77,7 @@ class QueryVar { class SubQuery { public: // Methods - auto operator==(const SubQuery& lhs) const -> bool = default; + auto operator==(SubQuery const& lhs) const -> bool = default; /** * Adds a precise non-dictionary variable to the subquery From ed138869b846fc0720c9521eba169238b2e7400b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 4 Oct 2025 11:14:56 -0400 Subject: [PATCH 049/164] Rename to rhs. 
--- components/core/src/clp/Query.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/Query.hpp b/components/core/src/clp/Query.hpp index 9e902fba33..51457b0508 100644 --- a/components/core/src/clp/Query.hpp +++ b/components/core/src/clp/Query.hpp @@ -25,7 +25,7 @@ class QueryVar { ); // Methods - auto operator==(QueryVar const& lhs) const -> bool = default; + auto operator==(QueryVar const& rhs) const -> bool = default; /** * Checks if the given encoded variable matches this QueryVar @@ -77,7 +77,7 @@ class QueryVar { class SubQuery { public: // Methods - auto operator==(SubQuery const& lhs) const -> bool = default; + auto operator==(SubQuery const& rhs) const -> bool = default; /** * Adds a precise non-dictionary variable to the subquery From e0ecdd0018eee8775cccd571933e4d53a44d586e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 8 Dec 2025 20:46:23 -0500 Subject: [PATCH 050/164] Refactor schema search classes and methods into new files; Do the same for code to test it. 
--- components/core/CMakeLists.txt | 7 + components/core/src/clp/GrepCore.cpp | 148 --- components/core/src/clp/GrepCore.hpp | 302 +----- components/core/src/clp/SchemaSearcher.cpp | 161 +++ components/core/src/clp/SchemaSearcher.hpp | 337 +++++++ components/core/src/clp/clg/CMakeLists.txt | 2 + components/core/src/clp/clo/CMakeLists.txt | 2 + components/core/src/clp_s/CMakeLists.txt | 2 + .../core/tests/MockLogTypeDictionary.hpp | 116 +++ .../core/tests/MockVariableDictionary.hpp | 91 ++ components/core/tests/search_test_utils.cpp | 105 ++ components/core/tests/search_test_utils.hpp | 90 ++ components/core/tests/test-GrepCore.cpp | 933 +----------------- components/core/tests/test-SchemaSearcher.cpp | 578 +++++++++++ 14 files changed, 1533 insertions(+), 1341 deletions(-) create mode 100644 components/core/src/clp/SchemaSearcher.cpp create mode 100644 components/core/src/clp/SchemaSearcher.hpp create mode 100644 components/core/tests/MockLogTypeDictionary.hpp create mode 100644 components/core/tests/MockVariableDictionary.hpp create mode 100644 components/core/tests/search_test_utils.cpp create mode 100644 components/core/tests/search_test_utils.hpp create mode 100644 components/core/tests/test-SchemaSearcher.cpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index b4bd27b25a..188d88edee 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -560,6 +560,8 @@ set(SOURCE_FILES_unitTest src/clp/GrepCore.hpp src/clp/hash_utils.cpp src/clp/hash_utils.hpp + src/clp/SchemaSearcher.cpp + src/clp/SchemaSearcher.hpp src/clp/ir/constants.hpp src/clp/ir/EncodedTextAst.cpp src/clp/ir/EncodedTextAst.hpp @@ -689,6 +691,10 @@ set(SOURCE_FILES_unitTest tests/clp_s_test_utils.cpp tests/clp_s_test_utils.hpp tests/LogSuppressor.hpp + tests/MockLogTypeDictionary.hpp + tests/MockVariableDictionary.hpp + tests/search_test_utils.cpp + tests/search_test_utils.hpp tests/TestOutputCleaner.hpp tests/test-BoundedReader.cpp 
tests/test-BufferedReader.cpp @@ -716,6 +722,7 @@ set(SOURCE_FILES_unitTest tests/test-ParserWithUserSchema.cpp tests/test-query_methods.cpp tests/test-regex_utils.cpp + tests/test-SchemaSearcher.cpp tests/test-Segment.cpp tests/test-SQLiteDB.cpp tests/test-Stopwatch.cpp diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index 8c4e225955..394b2db5b8 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -1,34 +1,17 @@ #include "GrepCore.hpp" -#include #include -#include #include #include -#include -#include -#include -#include #include -#include "EncodedVariableInterpreter.hpp" #include "ir/parsing.hpp" -#include "StringReader.hpp" using clp::ir::is_delim; using clp::string_utils::is_alphabet; using clp::string_utils::is_wildcard; -using log_surgeon::SymbolId::TokenFloat; -using log_surgeon::SymbolId::TokenInt; -using log_surgeon::wildcard_query_parser::QueryInterpretation; -using log_surgeon::wildcard_query_parser::StaticQueryToken; -using log_surgeon::wildcard_query_parser::VariableQueryToken; -using std::holds_alternative; -using std::set; using std::string; -using std::unordered_map; -using std::vector; namespace clp { bool GrepCore::get_bounds_of_next_potential_var( @@ -150,135 +133,4 @@ bool GrepCore::get_bounds_of_next_potential_var( return (value_length != begin_pos); } - -auto GrepCore::normalize_interpretations(set const& interpretations) - -> set { - set normalized_interpretations; - for (auto const& interpretation : interpretations) { - QueryInterpretation normalized_interpretation; - for (auto const& token : interpretation.get_logtype()) { - auto const& src_string{ - holds_alternative(token) - ? 
std::get(token).get_query_substring() - : std::get(token).get_query_substring() - }; - string normalized_string; - normalized_string.reserve(src_string.size()); - for (auto const c : src_string) { - if (c != '*' || normalized_string.empty() || normalized_string.back() != '*') { - normalized_string += c; - } - } - - if (holds_alternative(token)) { - auto const& variable_token{std::get(token)}; - normalized_interpretation.append_variable_token( - variable_token.get_variable_type(), - normalized_string, - variable_token.get_contains_wildcard() - ); - } else { - normalized_interpretation.append_static_token(normalized_string); - } - } - normalized_interpretations.insert(normalized_interpretation); - } - return normalized_interpretations; -} - -auto GrepCore::get_wildcard_encodable_positions(QueryInterpretation const& interpretation) - -> vector { - auto const logtype{interpretation.get_logtype()}; - vector wildcard_encodable_positions; - wildcard_encodable_positions.reserve(logtype.size()); - - for (size_t i{0}; i < logtype.size(); ++i) { - auto const& token{logtype[i]}; - if (holds_alternative(token)) { - auto const& var_token{std::get(token)}; - auto const var_type{static_cast(var_token.get_variable_type())}; - bool const is_int{TokenInt == var_type}; - bool const is_float{TokenFloat == var_type}; - if (var_token.get_contains_wildcard() && (is_int || is_float)) { - wildcard_encodable_positions.push_back(i); - } - } - } - return wildcard_encodable_positions; -} - -auto GrepCore::generate_logtype_string( - QueryInterpretation const& interpretation, - vector const& wildcard_encodable_positions, - vector const& mask_encoded_flags -) -> string { - string logtype_string; - - // Reserve size for `logtype_string`. 
- size_t logtype_string_size{0}; - auto const logtype{interpretation.get_logtype()}; - for (auto const& token : logtype) { - if (holds_alternative(token)) { - auto const& static_token{std::get(token)}; - logtype_string_size += static_token.get_query_substring().size(); - } else { - logtype_string_size++; - } - } - logtype_string.reserve(logtype_string_size); - - // Generate `logtype_string`. - for (size_t i{0}; i < logtype.size(); ++i) { - auto const& token{logtype[i]}; - if (holds_alternative(token)) { - logtype_string += std::get(token).get_query_substring(); - continue; - } - - auto const& var_token{std::get(token)}; - auto const& raw_string{var_token.get_query_substring()}; - auto const var_type{static_cast(var_token.get_variable_type())}; - bool const is_int{TokenInt == var_type}; - bool const is_float{TokenFloat == var_type}; - - if (wildcard_encodable_positions.end() - != std::ranges::find( - wildcard_encodable_positions.begin(), - wildcard_encodable_positions.end(), - i - )) - { - if (mask_encoded_flags[i]) { - if (is_int) { - EncodedVariableInterpreter::add_int_var(logtype_string); - } else { - EncodedVariableInterpreter::add_float_var(logtype_string); - } - } else { - EncodedVariableInterpreter::add_dict_var(logtype_string); - } - continue; - } - - encoded_variable_t encoded_var{0}; - if (is_int - && EncodedVariableInterpreter::convert_string_to_representable_integer_var( - raw_string, - encoded_var - )) - { - EncodedVariableInterpreter::add_int_var(logtype_string); - } else if (is_float - && EncodedVariableInterpreter::convert_string_to_representable_float_var( - raw_string, - encoded_var - )) - { - EncodedVariableInterpreter::add_float_var(logtype_string); - } else { - EncodedVariableInterpreter::add_dict_var(logtype_string); - } - } - return logtype_string; -} } // namespace clp diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index b85b172e88..249c276e04 100644 --- a/components/core/src/clp/GrepCore.hpp +++ 
b/components/core/src/clp/GrepCore.hpp @@ -1,19 +1,15 @@ #ifndef CLP_GREPCORE_HPP #define CLP_GREPCORE_HPP -#include #include #include #include -#include #include #include #include -#include #include #include -#include #include #include @@ -24,18 +20,11 @@ #include "LogTypeDictionaryReaderReq.hpp" #include "Query.hpp" #include "QueryToken.hpp" +#include "SchemaSearcher.hpp" #include "VariableDictionaryReaderReq.hpp" namespace clp { -#ifdef CLP_BUILD_TESTING -class GrepCoreTest; -#endif - class GrepCore { -#ifdef CLP_BUILD_TESTING - friend class GrepCoreTest; -#endif - public: // Methods /** @@ -141,129 +130,6 @@ class GrepCore { bool ignore_case, SubQuery& sub_query ); - - /** - * Normalizes a set of interpretations by collapsing consecutive greedy wildcards ('*') within - * each token. - * - * Consecutive wildcards that span across the boundary of tokens are preserved. - * - * @param interpretations The original set of `QueryInterpretation`s to normalize. - * @return The normalized set of `QueryInterpretation`s. - */ - static auto normalize_interpretations( - std::set const& interpretations - ) -> std::set; - - /** - * Compare all log-surgeon interpretations against the dictionaries to determine the sub queries - * to search for within the archive. - * - * A. For each interpretation we must consider encodable wildcard variables (e.g. (*1)). - * Each such variable introduces a binary choice: - * - 0: treat as a dictionary variable (\d) - * - 1: treat as an encoded variable (\i for integers, \f for floats) - * - * If there are k encodable wildcard variables, then 2^k logtype strings are possible. As a - * result we limit k <= 16. We represent these alternatives using a bitmask. - * - * Example: - * Search query: "a *1 *2 b", - * Interpretation (one of many): "a (*1) (*2) b" - * Possible logtypes (for the above interpretation): - * mask 00 -> "a \d \d b" - * mask 01 -> "a \d \f b" - * mask 10 -> "a \i \d b" - * mask 11 -> "a \i \f b" - * - * B. 
Each candidate combination becomes a useful subquery only if: - * 1. The logtype exists in the logtype dictionary, and - * 2. Each variable is either: - * a) resolvable in the variable dictionary (for dictionary vars), or - * b) encoded (always assumed valid). - * - * Note: Encoded variables are always assumed to exist in the segment. This is a performance - * trade-off: checking the archive would be slower than decompressing. - * - * @tparam LogTypeDictionaryReaderType Logtype dictionary reader type. - * @tparam VariableDictionaryReaderType Variable dictionary reader type. - * @param interpretations Log-surgeon's interpretations of the search query. - * @param logtype_dict The logtype dictionary. - * @param var_dict The variable dictionary. - * @param ignore_case If true, perform a case-insensitive search. - * @param sub_queries Returns the subqueries to compare against CLP's archives. - * @throw std::runtime_error If there are too many candidate combinations. - */ - template < - LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, - VariableDictionaryReaderReq VariableDictionaryReaderType - > - static auto generate_schema_sub_queries( - std::set const& - interpretations, - LogTypeDictionaryReaderType const& logtype_dict, - VariableDictionaryReaderType const& var_dict, - bool ignore_case, - std::vector& sub_queries - ) -> void; - - /** - * Scans the interpretation and returns the indices of all encodable wildcard variables. - * - * An encodable variable is a variable token that: - * - Contains a wildcard (e.g. *1). - * - Is of an encodable type (integer or float). - * - * @param interpretation The `QueryInterpretation` to scan. - * @return A vector of positions of encodable wildcard variables. 
- */ - static auto get_wildcard_encodable_positions( - log_surgeon::wildcard_query_parser::QueryInterpretation const& interpretation - ) -> std::vector; - - /** - * Generates a logtype string from an interpretation, applying a mask to determine which - * encodable wildcard positions are treated as encoded vs dictionary variables. - * - 0: Treat as dictionary variable. - * - 1: Treat as an encoded variable. - * - * @param interpretation The interpretation to convert to a logtype string. - * @param wildcard_encodable_positions A vector of positions of encodable wildcard variables. - * @param mask_encoded_flags A vector indicating if a variables is mask encoded. - * @return The logtype string corresponding to this combination of encoded variables. - */ - static auto generate_logtype_string( - log_surgeon::wildcard_query_parser::QueryInterpretation const& interpretation, - std::vector const& wildcard_encodable_positions, - std::vector const& mask_encoded_flags - ) -> std::string; - - /** - * Process a single variable token for schema subquery generation. - * - * Determines if the variable can be treated as: - * - an encoded variable, - * - a dictionary variable, - * - or requires wildcard dictionary search. - * - * Updates `sub_query` with the appropriate variable encodings. - * - * @tparam VariableDictionaryReaderType Variable dictionary reader type. - * @param variable_token The variable token to process. - * @param var_dict The variable dictionary. - * @param ignore_case If true, perform a case-insensitive search. - * @param is_mask_encoded If the token is an encodable wildcard and is to be encoded. - * @param sub_query Returns the updated sub query object. - * @return True if the variable is encoded or is in the variable dictionary, false otherwise. 
- */ - template - static auto process_schema_var_token( - log_surgeon::wildcard_query_parser::VariableQueryToken const& variable_token, - VariableDictionaryReaderType const& var_dict, - bool ignore_case, - bool is_mask_encoded, - SubQuery& sub_query - ) -> bool; }; template < @@ -367,16 +233,12 @@ std::optional GrepCore::process_raw_query( } } } else { - // TODO: Optimize such that interpretations are only generated once per schema. - log_surgeon::wildcard_query_parser::Query const query{search_string}; - auto const interpretations{query.get_all_multi_token_interpretations(lexer)}; - auto const normalized_interpretations{normalize_interpretations(interpretations)}; - generate_schema_sub_queries( - normalized_interpretations, + sub_queries = SchemaSearcher::search( + search_string, + lexer, logtype_dict, var_dict, - ignore_case, - sub_queries + ignore_case ); } @@ -549,160 +411,6 @@ GrepCore::SubQueryMatchabilityResult GrepCore::generate_logtypes_and_vars_for_su return SubQueryMatchabilityResult::MayMatch; } - -template < - LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, - VariableDictionaryReaderReq VariableDictionaryReaderType -> -void GrepCore::generate_schema_sub_queries( - std::set const& interpretations, - LogTypeDictionaryReaderType const& logtype_dict, - VariableDictionaryReaderType const& var_dict, - bool const ignore_case, - std::vector& sub_queries -) { - constexpr size_t cMaxEncodableWildcardVariables{16}; - for (auto const& interpretation : interpretations) { - auto const logtype{interpretation.get_logtype()}; - auto wildcard_encodable_positions{get_wildcard_encodable_positions(interpretation)}; - if (wildcard_encodable_positions.size() > cMaxEncodableWildcardVariables) { - throw std::runtime_error("Too many encodable variables."); - } - uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; - for (uint64_t mask{0}; mask < num_combos; ++mask) { - std::vector mask_encoded_flags(logtype.size(), false); - for (size_t i{0}; i < 
wildcard_encodable_positions.size(); ++i) { - mask_encoded_flags[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; - } - - auto logtype_string{generate_logtype_string( - interpretation, - wildcard_encodable_positions, - mask_encoded_flags - )}; - - std::unordered_set logtype_entries; - logtype_dict.get_entries_matching_wildcard_string( - logtype_string, - ignore_case, - logtype_entries - ); - if (logtype_entries.empty()) { - continue; - } - - SubQuery sub_query; - bool has_vars{true}; - for (size_t i{0}; i < logtype.size(); ++i) { - auto const& token{logtype[i]}; - if (std::holds_alternative( - token - )) - { - bool is_mask_encoded{false}; - if (wildcard_encodable_positions.end() - != std::ranges::find( - wildcard_encodable_positions.begin(), - wildcard_encodable_positions.end(), - i - )) - { - is_mask_encoded = mask_encoded_flags[i]; - } - - has_vars = process_schema_var_token( - std::get(token), - var_dict, - ignore_case, - is_mask_encoded, - sub_query - ); - } - if (false == has_vars) { - break; - } - } - if (false == has_vars) { - continue; - } - - std::unordered_set possible_logtype_ids; - possible_logtype_ids.reserve(logtype_entries.size()); - for (auto const* entry : logtype_entries) { - possible_logtype_ids.emplace(entry->get_id()); - } - sub_query.set_possible_logtypes(possible_logtype_ids); - if (sub_queries.end() == std::ranges::find(sub_queries, sub_query)) { - sub_queries.push_back(std::move(sub_query)); - } - } - } -} - -template -auto GrepCore::process_schema_var_token( - log_surgeon::wildcard_query_parser::VariableQueryToken const& variable_token, - VariableDictionaryReaderType const& var_dict, - bool const ignore_case, - bool const is_mask_encoded, - SubQuery& sub_query -) -> bool { - auto const& raw_string{variable_token.get_query_substring()}; - auto const var_has_wildcard{variable_token.get_contains_wildcard()}; - auto const var_type{static_cast(variable_token.get_variable_type())}; - bool const is_int{log_surgeon::SymbolId::TokenInt == 
var_type}; - bool const is_float{log_surgeon::SymbolId::TokenFloat == var_type}; - - if (is_mask_encoded) { - sub_query.mark_wildcard_match_required(); - return true; - } - - if (var_has_wildcard) { - return EncodedVariableInterpreter::wildcard_search_dictionary_and_get_encoded_matches( - raw_string, - var_dict, - ignore_case, - sub_query - ); - } - - encoded_variable_t encoded_var{}; - if ((is_int - && EncodedVariableInterpreter::convert_string_to_representable_integer_var( - raw_string, - encoded_var - )) - || (is_float - && EncodedVariableInterpreter::convert_string_to_representable_float_var( - raw_string, - encoded_var - ))) - { - sub_query.add_non_dict_var(encoded_var); - return true; - } - - auto entries = var_dict.get_entry_matching_value(raw_string, ignore_case); - if (entries.empty()) { - return false; - } - if (1 == entries.size()) { - auto const entry_id{entries[0]->get_id()}; - sub_query.add_dict_var(EncodedVariableInterpreter::encode_var_dict_id(entry_id), entry_id); - return true; - } - std::unordered_set encoded_vars; - std::unordered_set var_dict_ids; - encoded_vars.reserve(entries.size()); - var_dict_ids.reserve(entries.size()); - for (auto const* entry : entries) { - encoded_vars.emplace(EncodedVariableInterpreter::encode_var_dict_id(entry->get_id())); - var_dict_ids.emplace(entry->get_id()); - } - sub_query.add_imprecise_dict_var(encoded_vars, var_dict_ids); - return true; -} } // namespace clp #endif // CLP_GREPCORE_HPP diff --git a/components/core/src/clp/SchemaSearcher.cpp b/components/core/src/clp/SchemaSearcher.cpp new file mode 100644 index 0000000000..d644b9f76b --- /dev/null +++ b/components/core/src/clp/SchemaSearcher.cpp @@ -0,0 +1,161 @@ +#include "SchemaSearcher.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "EncodedVariableInterpreter.hpp" + +using log_surgeon::SymbolId::TokenFloat; +using log_surgeon::SymbolId::TokenInt; +using 
log_surgeon::wildcard_query_parser::QueryInterpretation; +using log_surgeon::wildcard_query_parser::StaticQueryToken; +using log_surgeon::wildcard_query_parser::VariableQueryToken; +using std::holds_alternative; +using std::set; +using std::string; +using std::unordered_map; +using std::vector; + +namespace clp { +auto SchemaSearcher::normalize_interpretations(set const& interpretations) + -> set { + set normalized_interpretations; + for (auto const& interpretation : interpretations) { + QueryInterpretation normalized_interpretation; + for (auto const& token : interpretation.get_logtype()) { + auto const& src_string{std::visit( + [](auto const& token) -> std::string const& { + return token.get_query_substring(); + }, + token + )}; + string normalized_string; + normalized_string.reserve(src_string.size()); + for (auto const c : src_string) { + if (c != '*' || normalized_string.empty() || normalized_string.back() != '*') { + normalized_string += c; + } + } + + std::visit( + overloaded{ + [&](VariableQueryToken const& variable_token) -> void { + normalized_interpretation.append_variable_token( + variable_token.get_variable_type(), + normalized_string, + variable_token.get_contains_wildcard() + ); + }, + [&]([[maybe_unused]] StaticQueryToken const& static_token) -> void { + normalized_interpretation.append_static_token(normalized_string); + } + }, + token + ); + } + normalized_interpretations.insert(normalized_interpretation); + } + return normalized_interpretations; +} + +auto SchemaSearcher::get_wildcard_encodable_positions(QueryInterpretation const& interpretation) + -> vector { + auto const logtype{interpretation.get_logtype()}; + vector wildcard_encodable_positions; + wildcard_encodable_positions.reserve(logtype.size()); + + for (size_t i{0}; i < logtype.size(); ++i) { + auto const& token{logtype[i]}; + if (holds_alternative(token)) { + auto const& var_token{std::get(token)}; + auto const var_type{static_cast(var_token.get_variable_type())}; + bool const 
is_int{TokenInt == var_type}; + bool const is_float{TokenFloat == var_type}; + if (var_token.get_contains_wildcard() && (is_int || is_float)) { + wildcard_encodable_positions.push_back(i); + } + } + } + return wildcard_encodable_positions; +} + +auto SchemaSearcher::generate_logtype_string( + QueryInterpretation const& interpretation, + vector const& wildcard_encodable_positions, + vector const& mask_encoded_flags +) -> string { + string logtype_string; + + size_t logtype_string_size{0}; + auto const logtype{interpretation.get_logtype()}; + for (auto const& token : logtype) { + if (holds_alternative(token)) { + auto const& static_token{std::get(token)}; + logtype_string_size += static_token.get_query_substring().size(); + } else { + logtype_string_size++; + } + } + logtype_string.reserve(logtype_string_size); + + for (size_t i{0}; i < logtype.size(); ++i) { + auto const& token{logtype[i]}; + if (holds_alternative(token)) { + logtype_string += std::get(token).get_query_substring(); + continue; + } + + auto const& var_token{std::get(token)}; + auto const& raw_string{var_token.get_query_substring()}; + auto const var_type{static_cast(var_token.get_variable_type())}; + bool const is_int{TokenInt == var_type}; + bool const is_float{TokenFloat == var_type}; + + if (wildcard_encodable_positions.end() + != std::ranges::find( + wildcard_encodable_positions.begin(), + wildcard_encodable_positions.end(), + i + )) + { + if (mask_encoded_flags[i]) { + if (is_int) { + EncodedVariableInterpreter::add_int_var(logtype_string); + } else { + EncodedVariableInterpreter::add_float_var(logtype_string); + } + } else { + EncodedVariableInterpreter::add_dict_var(logtype_string); + } + continue; + } + + encoded_variable_t encoded_var{0}; + if (is_int + && EncodedVariableInterpreter::convert_string_to_representable_integer_var( + raw_string, + encoded_var + )) + { + EncodedVariableInterpreter::add_int_var(logtype_string); + } else if (is_float + && 
EncodedVariableInterpreter::convert_string_to_representable_float_var( + raw_string, + encoded_var + )) + { + EncodedVariableInterpreter::add_float_var(logtype_string); + } else { + EncodedVariableInterpreter::add_dict_var(logtype_string); + } + } + return logtype_string; +} +} // namespace clp \ No newline at end of file diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp new file mode 100644 index 0000000000..b2710e3396 --- /dev/null +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -0,0 +1,337 @@ +#ifndef CLP_SCHEMASEARCHER_HPP +#define CLP_SCHEMASEARCHER_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "Defs.h" +#include "EncodedVariableInterpreter.hpp" +#include "LogTypeDictionaryReaderReq.hpp" +#include "Query.hpp" +#include "VariableDictionaryReaderReq.hpp" + +namespace clp { +#ifdef CLP_BUILD_TESTING +class SchemaSearcherTest; +#endif + +class SchemaSearcher { +#ifdef CLP_BUILD_TESTING + friend class SchemaSearcherTest; +#endif + +public: + template < + LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, + VariableDictionaryReaderReq VariableDictionaryReaderType + > + static auto search( + std::string const& search_string, + log_surgeon::lexers::ByteLexer& lexer, + LogTypeDictionaryReaderType const& logtype_dict, + VariableDictionaryReaderType const& var_dict, + bool ignore_case + ) -> std::vector { + // TODO: Optimize such that interpretations are only generated once per schema. 
+ log_surgeon::wildcard_query_parser::Query const query{search_string}; + auto const interpretations{query.get_all_multi_token_interpretations(lexer)}; + auto const normalized_interpretations{normalize_interpretations(interpretations)}; + return generate_schema_sub_queries( + normalized_interpretations, + logtype_dict, + var_dict, + ignore_case + ); + } + +private: + /** + * Normalizes a set of interpretations by collapsing consecutive greedy wildcards ('*') within + * each token. + * + * Consecutive wildcards that span across the boundary of tokens are preserved. + * + * @param interpretations The original set of `QueryInterpretation`s to normalize. + * @return The normalized set of `QueryInterpretation`s. + */ + static auto normalize_interpretations( + std::set const& interpretations + ) -> std::set; + + /** + * Compare all log-surgeon interpretations against the dictionaries to determine the sub queries + * to search for within the archive. + * + * A. For each interpretation we must consider encodable wildcard variables (e.g. (*1)). + * Each such variable introduces a binary choice: + * - 0: treat as a dictionary variable (\d) + * - 1: treat as an encoded variable (\i for integers, \f for floats) + * + * If there are k encodable wildcard variables, then 2^k logtype strings are possible. As a + * result we limit k <= 16. We represent these alternatives using a bitmask. + * + * Example: + * Search query: "a *1 *2 b", + * Interpretation (one of many): "a (*1) (*2) b" + * Possible logtypes (for the above interpretation): + * mask 00 -> "a \d \d b" + * mask 01 -> "a \d \f b" + * mask 10 -> "a \i \d b" + * mask 11 -> "a \i \f b" + * + * B. Each candidate combination becomes a useful subquery only if: + * 1. The logtype exists in the logtype dictionary, and + * 2. Each variable is either: + * a) resolvable in the variable dictionary (for dictionary vars), or + * b) encoded (always assumed valid). 
+ * + * Note: Encoded variables are always assumed to exist in the segment. This is a performance + * trade-off: checking the archive would be slower than decompressing. + * + * @tparam LogTypeDictionaryReaderType Logtype dictionary reader type. + * @tparam VariableDictionaryReaderType Variable dictionary reader type. + * @param interpretations Log-surgeon's interpretations of the search query. + * @param logtype_dict The logtype dictionary. + * @param var_dict The variable dictionary. + * @param ignore_case If true, perform a case-insensitive search. + * @return The vector of subqueries to compare against CLP's archives. + * @throw std::runtime_error If there are too many candidate combinations. + */ + template < + LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, + VariableDictionaryReaderReq VariableDictionaryReaderType + > + static auto generate_schema_sub_queries( + std::set const& + interpretations, + LogTypeDictionaryReaderType const& logtype_dict, + VariableDictionaryReaderType const& var_dict, + bool ignore_case + ) -> std::vector; + + /** + * Scans the interpretation and returns the indices of all encodable wildcard variables. + * + * An encodable variable is a variable token that: + * - Contains a wildcard (e.g. *1). + * - Is of an encodable type (integer or float). + * + * @param interpretation The `QueryInterpretation` to scan. + * @return A vector of positions of encodable wildcard variables. + */ + static auto get_wildcard_encodable_positions( + log_surgeon::wildcard_query_parser::QueryInterpretation const& interpretation + ) -> std::vector; + + /** + * Generates a logtype string from an interpretation, applying a mask to determine which + * encodable wildcard positions are treated as encoded vs dictionary variables. + * - 0: Treat as dictionary variable. + * - 1: Treat as an encoded variable. + * + * @param interpretation The interpretation to convert to a logtype string. 
+ * @param wildcard_encodable_positions A vector of positions of encodable wildcard variables. + * @param mask_encoded_flags A vector indicating if a variables is mask encoded. + * @return The logtype string corresponding to this combination of encoded variables. + */ + static auto generate_logtype_string( + log_surgeon::wildcard_query_parser::QueryInterpretation const& interpretation, + std::vector const& wildcard_encodable_positions, + std::vector const& mask_encoded_flags + ) -> std::string; + + /** + * Process a single variable token for schema subquery generation. + * + * Determines if the variable can be treated as: + * - an encoded variable, + * - a dictionary variable, + * - or requires wildcard dictionary search. + * + * Updates `sub_query` with the appropriate variable encodings. + * + * @tparam VariableDictionaryReaderType Variable dictionary reader type. + * @param variable_token The variable token to process. + * @param var_dict The variable dictionary. + * @param ignore_case If true, perform a case-insensitive search. + * @param is_mask_encoded If the token is an encodable wildcard and is to be encoded. + * @param sub_query Returns the updated sub query object. + * @return True if the variable is encoded or is in the variable dictionary, false otherwise. 
+ */ + template + static auto process_schema_var_token( + log_surgeon::wildcard_query_parser::VariableQueryToken const& variable_token, + VariableDictionaryReaderType const& var_dict, + bool ignore_case, + bool is_mask_encoded, + SubQuery& sub_query + ) -> bool; +}; + +template < + LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, + VariableDictionaryReaderReq VariableDictionaryReaderType +> +auto SchemaSearcher::generate_schema_sub_queries( + std::set const& interpretations, + LogTypeDictionaryReaderType const& logtype_dict, + VariableDictionaryReaderType const& var_dict, + bool const ignore_case +) -> std::vector { + std::vector sub_queries; + constexpr size_t cMaxEncodableWildcardVariables{16}; + for (auto const& interpretation : interpretations) { + auto const logtype{interpretation.get_logtype()}; + auto wildcard_encodable_positions{get_wildcard_encodable_positions(interpretation)}; + if (wildcard_encodable_positions.size() > cMaxEncodableWildcardVariables) { + throw std::runtime_error("Too many encodable variables."); + } + uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; + for (uint64_t mask{0}; mask < num_combos; ++mask) { + std::vector mask_encoded_flags(logtype.size(), false); + for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { + mask_encoded_flags[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; + } + + auto logtype_string{generate_logtype_string( + interpretation, + wildcard_encodable_positions, + mask_encoded_flags + )}; + + std::unordered_set logtype_entries; + logtype_dict.get_entries_matching_wildcard_string( + logtype_string, + ignore_case, + logtype_entries + ); + if (logtype_entries.empty()) { + continue; + } + + SubQuery sub_query; + bool has_vars{true}; + for (size_t i{0}; i < logtype.size(); ++i) { + auto const& token{logtype[i]}; + if (std::holds_alternative( + token + )) + { + bool is_mask_encoded{false}; + if (wildcard_encodable_positions.end() + != std::ranges::find( + 
wildcard_encodable_positions.begin(), + wildcard_encodable_positions.end(), + i + )) + { + is_mask_encoded = mask_encoded_flags[i]; + } + + has_vars = process_schema_var_token( + std::get(token), + var_dict, + ignore_case, + is_mask_encoded, + sub_query + ); + } + if (false == has_vars) { + break; + } + } + if (false == has_vars) { + continue; + } + + std::unordered_set possible_logtype_ids; + possible_logtype_ids.reserve(logtype_entries.size()); + for (auto const* entry : logtype_entries) { + possible_logtype_ids.emplace(entry->get_id()); + } + sub_query.set_possible_logtypes(possible_logtype_ids); + if (sub_queries.end() == std::ranges::find(sub_queries, sub_query)) { + sub_queries.push_back(std::move(sub_query)); + } + } + } + return sub_queries; +} + +template +auto SchemaSearcher::process_schema_var_token( + log_surgeon::wildcard_query_parser::VariableQueryToken const& variable_token, + VariableDictionaryReaderType const& var_dict, + bool const ignore_case, + bool const is_mask_encoded, + SubQuery& sub_query +) -> bool { + auto const& raw_string{variable_token.get_query_substring()}; + auto const var_has_wildcard{variable_token.get_contains_wildcard()}; + auto const var_type{static_cast(variable_token.get_variable_type())}; + bool const is_int{log_surgeon::SymbolId::TokenInt == var_type}; + bool const is_float{log_surgeon::SymbolId::TokenFloat == var_type}; + + if (is_mask_encoded) { + sub_query.mark_wildcard_match_required(); + return true; + } + + if (var_has_wildcard) { + return EncodedVariableInterpreter::wildcard_search_dictionary_and_get_encoded_matches( + raw_string, + var_dict, + ignore_case, + sub_query + ); + } + + encoded_variable_t encoded_var{}; + if ((is_int + && EncodedVariableInterpreter::convert_string_to_representable_integer_var( + raw_string, + encoded_var + )) + || (is_float + && EncodedVariableInterpreter::convert_string_to_representable_float_var( + raw_string, + encoded_var + ))) + { + sub_query.add_non_dict_var(encoded_var); + return 
true; + } + + auto entries = var_dict.get_entry_matching_value(raw_string, ignore_case); + if (entries.empty()) { + return false; + } + if (1 == entries.size()) { + auto const entry_id{entries[0]->get_id()}; + sub_query.add_dict_var(EncodedVariableInterpreter::encode_var_dict_id(entry_id), entry_id); + return true; + } + std::unordered_set encoded_vars; + std::unordered_set var_dict_ids; + encoded_vars.reserve(entries.size()); + var_dict_ids.reserve(entries.size()); + for (auto const* entry : entries) { + encoded_vars.emplace(EncodedVariableInterpreter::encode_var_dict_id(entry->get_id())); + var_dict_ids.emplace(entry->get_id()); + } + sub_query.add_imprecise_dict_var(encoded_vars, var_dict_ids); + return true; +} +} // namespace clp + +#endif //CLP_SCHEMASEARCHER_HPP diff --git a/components/core/src/clp/clg/CMakeLists.txt b/components/core/src/clp/clg/CMakeLists.txt index 55c6b058b7..0da4a71915 100644 --- a/components/core/src/clp/clg/CMakeLists.txt +++ b/components/core/src/clp/clg/CMakeLists.txt @@ -37,6 +37,8 @@ set( ../Grep.hpp ../GrepCore.cpp ../GrepCore.hpp + ../SchemaSearcher.cpp + ../SchemaSearcher.hpp ../ir/EncodedTextAst.cpp ../ir/EncodedTextAst.hpp ../ir/LogEvent.hpp diff --git a/components/core/src/clp/clo/CMakeLists.txt b/components/core/src/clp/clo/CMakeLists.txt index 28c8da4279..aa98623e2f 100644 --- a/components/core/src/clp/clo/CMakeLists.txt +++ b/components/core/src/clp/clo/CMakeLists.txt @@ -36,6 +36,8 @@ set( ../Grep.hpp ../GrepCore.cpp ../GrepCore.hpp + ../SchemaSearcher.cpp + ../SchemaSearcher.hpp ../ir/EncodedTextAst.cpp ../ir/EncodedTextAst.hpp ../ir/LogEvent.hpp diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt index b768488565..25cca28006 100644 --- a/components/core/src/clp_s/CMakeLists.txt +++ b/components/core/src/clp_s/CMakeLists.txt @@ -64,6 +64,8 @@ set( ../clp/GrepCore.hpp ../clp/hash_utils.cpp ../clp/hash_utils.hpp + ../clp/SchemaSearcher.cpp + ../clp/SchemaSearcher.hpp 
../clp/ir/constants.hpp ../clp/ir/EncodedTextAst.cpp ../clp/ir/EncodedTextAst.hpp diff --git a/components/core/tests/MockLogTypeDictionary.hpp b/components/core/tests/MockLogTypeDictionary.hpp new file mode 100644 index 0000000000..5e340c58b5 --- /dev/null +++ b/components/core/tests/MockLogTypeDictionary.hpp @@ -0,0 +1,116 @@ +#ifndef MOCK_LOGTYPE_DICTIONARY_HPP +#define MOCK_LOGTYPE_DICTIONARY_HPP + +#include +#include +#include +#include +#include + +#include "../src/clp/Defs.h" +#include "../src/clp/EncodedVariableInterpreter.hpp" +#include "../src/clp/string_utils/string_utils.hpp" + +using clp::EncodedVariableInterpreter; +using clp::logtype_dictionary_id_t; +using clp::string_utils::wildcard_match_unsafe_case_sensitive; +using std::string; +using std::string_view; +using std::unordered_set; +using std::vector; + +/** + * Simple helper class representing a mock logtype dictionary entry for unit tests. + * + * Adheres to `LogtypeDictionaryEntryReq`. + */ +class MockLogTypeEntry { +public: + MockLogTypeEntry(string value, logtype_dictionary_id_t const id) + : m_value(std::move(value)), + m_id(id) {} + + auto clear() -> void { m_value.clear(); } + + auto reserve_constant_length(size_t const length) -> void { m_value.reserve(length); } + + auto parse_next_var( + [[maybe_unused]] string_view msg, + [[maybe_unused]] size_t begin, + [[maybe_unused]] size_t end, + [[maybe_unused]] string_view& parsed + ) -> bool { + return false; + } + + auto add_constant(string_view const msg, size_t const begin_pos, size_t const length) -> void { + m_value.append(msg.substr(begin_pos, length)); + } + + auto add_int_var() -> void { EncodedVariableInterpreter::add_int_var(m_value); } + + auto add_float_var() -> void { EncodedVariableInterpreter::add_float_var(m_value); } + + auto add_dictionary_var() -> void { EncodedVariableInterpreter::add_dict_var(m_value); } + + [[nodiscard]] auto get_value() const -> string const& { return m_value; } + + [[nodiscard]] auto get_num_variables() 
const -> size_t { return 0; } + + [[nodiscard]] auto get_num_placeholders() const -> size_t { return 0; } + + [[nodiscard]] auto + get_placeholder_info([[maybe_unused]] size_t idx, [[maybe_unused]] auto& ref) const -> size_t { + return SIZE_MAX; + } + + [[nodiscard]] auto get_id() const -> logtype_dictionary_id_t { return m_id; } + +private: + string m_value; + logtype_dictionary_id_t m_id{0}; +}; + +/** + * Simple helper class representing a mock logtype dictionary for unit tests. + * + * Provides a method for adding entries and adheres to `LogtypeDictionaryReaderReq`. + */ +class MockLogTypeDictionary { +public: + using Entry = MockLogTypeEntry; + using dictionary_id_t = logtype_dictionary_id_t; + + auto add_entry(string const& value, dictionary_id_t id) -> void { + m_storage.emplace_back(value, id); + } + + auto + get_entry_matching_value(string_view const logtype, [[maybe_unused]] bool ignore_case) const + -> vector { + vector results; + for (auto const& entry : m_storage) { + if (logtype == entry.get_value()) { + results.push_back(&entry); + } + } + return results; + } + + auto get_entries_matching_wildcard_string( + string_view const logtype, + [[maybe_unused]] bool ignore_case, + unordered_set& results + ) const -> void { + for (auto const& entry : m_storage) { + if (wildcard_match_unsafe_case_sensitive(entry.get_value(), logtype)) { + results.insert(&entry); + } + } + } + +private: + vector m_storage; +}; + +#endif // MOCK_LOGTYPE_DICTIONARY_HPP diff --git a/components/core/tests/MockVariableDictionary.hpp b/components/core/tests/MockVariableDictionary.hpp new file mode 100644 index 0000000000..bf29bd0c7e --- /dev/null +++ b/components/core/tests/MockVariableDictionary.hpp @@ -0,0 +1,91 @@ +#ifndef MOCK_VARIABLE_DICTIONARY_HPP +#define MOCK_VARIABLE_DICTIONARY_HPP + +#include +#include +#include +#include +#include + +#include "../src/clp/Defs.h" +#include "../src/clp/string_utils/string_utils.hpp" + +using 
clp::string_utils::wildcard_match_unsafe_case_sensitive; +using std::string; +using std::string_view; +using std::unordered_map; +using std::unordered_set; +using std::vector; +using clp::variable_dictionary_id_t; + +/** + * Simple helper class representing a mock variable dictionary entry for unit tests. + * + * Adheres to `VariableDictionaryEntryReq`. + */ +class MockVarEntry { +public: + explicit MockVarEntry(variable_dictionary_id_t const id, string value) + : m_id{id}, + m_value{std::move(value)} {} + + [[nodiscard]] auto get_id() const -> variable_dictionary_id_t { return m_id; } + + [[nodiscard]] auto get_value() const -> string const& { return m_value; } + +private: + variable_dictionary_id_t m_id; + string m_value; +}; + +/** + * Simple helper class representing a mock variable dictionary for unit tests. + * + * Provides a method for adding entries and adheres to `VariableDictionaryReaderReq`. + */ +class MockVarDictionary { +public: + using Entry = MockVarEntry; + using dictionary_id_t = variable_dictionary_id_t; + + auto add_entry(dictionary_id_t const id, string value) -> void { + m_storage.emplace(id, Entry{id, std::move(value)}); + } + + [[nodiscard]] auto get_value(dictionary_id_t const id) const -> string const& { + static string const cEmpty{}; + auto const it{m_storage.find(id)}; + if (m_storage.end() != it) { + return it->second.get_value(); + } + return cEmpty; + } + + auto get_entry_matching_value(string_view const val, [[maybe_unused]] bool ignore_case) const + -> vector { + vector results; + for (auto const& [id, entry] : m_storage) { + if (val == entry.get_value()) { + results.push_back(&entry); + } + } + return results; + } + + auto get_entries_matching_wildcard_string( + string_view const val, + [[maybe_unused]] bool ignore_case, + unordered_set& results + ) const -> void { + for (auto const& [id, entry] : m_storage) { + if (wildcard_match_unsafe_case_sensitive(entry.get_value(), val)) { + results.insert(&entry); + } + } + } + +private: + 
unordered_map m_storage; +}; + +#endif // MOCK_VARIABLE_DICTIONARY_HPP diff --git a/components/core/tests/search_test_utils.cpp b/components/core/tests/search_test_utils.cpp new file mode 100644 index 0000000000..39d3e358ce --- /dev/null +++ b/components/core/tests/search_test_utils.cpp @@ -0,0 +1,105 @@ +#include "search_test_utils.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../src/clp/Defs.h" +#include "../src/clp/EncodedVariableInterpreter.hpp" +#include "../src/clp/Query.hpp" +#include "MockLogTypeDictionary.hpp" +#include "MockVariableDictionary.hpp" + +using clp::EncodedVariableInterpreter; +using clp::logtype_dictionary_id_t; +using std::pair; +using std::string; +using std::string_view; +using clp::SubQuery; +using std::tuple; +using std::unordered_set; +using std::variant; +using std::vector; +using clp::variable_dictionary_id_t; + +using VarInfo = tuple>; + +auto make_var_dict(vector> const& entries) -> MockVarDictionary { + MockVarDictionary dict; + for (auto const& [id, val] : entries) { + dict.add_entry(id, val); + } + return dict; +} + +auto make_logtype_dict(vector>> const& entries) + -> MockLogTypeDictionary { + MockLogTypeDictionary dict; + logtype_dictionary_id_t id{0}; + for (auto const& entry : entries) { + dict.add_entry(generate_expected_logtype_string(entry), id++); + } + return dict; +} + +auto generate_expected_logtype_string(vector> const& tokens) -> string { + string result; + for (auto const& token : tokens) { + if (holds_alternative(token)) { + result.append(get(token)); + } else { + switch (get(token)) { + case 'i': + EncodedVariableInterpreter::add_int_var(result); + break; + case 'f': + EncodedVariableInterpreter::add_float_var(result); + break; + case 'd': + EncodedVariableInterpreter::add_dict_var(result); + break; + default: + break; + } + } + } + return result; +} + +auto check_sub_query( + size_t id, + vector const& sub_queries, + bool const 
wildcard_match_required, + vector const& vars_info, + unordered_set const& logtype_ids +) -> void { + CAPTURE(id); + auto const& sub_query{sub_queries[id]}; + + REQUIRE(wildcard_match_required == sub_query.wildcard_match_required()); + REQUIRE(vars_info.size() == sub_query.get_num_possible_vars()); + + for (size_t i{0}; i < vars_info.size(); ++i) { + auto const& [is_dict_var, is_precise_var, var_dict_ids]{vars_info[i]}; + auto const& var{sub_query.get_vars()[i]}; + REQUIRE(is_dict_var == var.is_dict_var()); + REQUIRE(is_precise_var == var.is_precise_var()); + if (is_dict_var) { + if (is_precise_var) { + REQUIRE(1 == var_dict_ids.size()); + REQUIRE(var_dict_ids.contains(var.get_var_dict_id())); + } else { + REQUIRE(var_dict_ids == var.get_possible_var_dict_ids()); + } + } + } + + REQUIRE(logtype_ids == sub_query.get_possible_logtypes()); +} \ No newline at end of file diff --git a/components/core/tests/search_test_utils.hpp b/components/core/tests/search_test_utils.hpp new file mode 100644 index 0000000000..dce4fcf877 --- /dev/null +++ b/components/core/tests/search_test_utils.hpp @@ -0,0 +1,90 @@ +#ifndef SEARCH_TEST_UTILS_HPP +#define SEARCH_TEST_UTILS_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../src/clp/Defs.h" +#include "../src/clp/EncodedVariableInterpreter.hpp" +#include "../src/clp/Query.hpp" +#include "MockLogTypeDictionary.hpp" +#include "MockVariableDictionary.hpp" + +using clp::EncodedVariableInterpreter; +using clp::logtype_dictionary_id_t; +using std::pair; +using std::string; +using std::string_view; +using clp::SubQuery; +using std::tuple; +using std::unordered_set; +using std::variant; +using std::vector; +using clp::variable_dictionary_id_t; + +using VarInfo = tuple>; + +/** + * @param entries Vector of (id, value) pairs to populate the variable + * dictionary. + * @return A `MockVarDictionary` initialized with the given entries. 
+ */ +auto make_var_dict(vector> const& entries) -> MockVarDictionary; + +/** + * @param entries Vector of logtypes, where each logtype is represented by a vector of tokens. Each + * token is either a literal substring (`string_view`) or a variable placeholder (`char`). + * @return A `MockLogTypeDictionary` initialized with the given entries. + */ +auto make_logtype_dict(vector>> const& entries) + -> MockLogTypeDictionary; + +/** + * Generates a logtype string from a vector of tokens. + * + * Each token is either: + * - a literal substring (`string_view`) to append directly, or + * - a variable placeholder (`char`) indicating the type of variable: + * - `i` -> integer variable; + * - `f` -> float variable; + * - `d` -> dictionary variable. + * + * The function forwards variable tokens to `EncodedVariableInterpreter` to + * append their encoded representations to the resulting string. + * + * @param tokens Vector of tokens to convert into a logtype string. + * @return A `string` representing the expected encoded logtype. + */ +auto generate_expected_logtype_string(vector> const& tokens) -> string; + +/** + * Checks that a `SubQuery` at a given index matches the expected properties. + * + * This method verifies: + * - Whether wildcard matching is required; + * - The number and type of variables; + * - For dictionary variables, the precise or possible dictionary IDs; + * - The set of possible logtype IDs. + * + * @param id Index of the sub-query to check in `sub_queries`. + * @param sub_queries Vector of `SubQuery` objects. + * @param wildcard_match_required Expected wildcard match requirement. + * @param vars_info Vector of tuples describing expected variable properties: (`is_dict_var`, + * `is_precise_var`, `var_dict_ids`). + * @param logtype_ids Expected set of possible logtype IDs. 
+ */ +auto check_sub_query( + size_t id, + vector const& sub_queries, + bool wildcard_match_required, + vector const& vars_info, + unordered_set const& logtype_ids +) -> void; + +#endif // SEARCH_TEST_UTILS_HPP diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 3b4733cb69..f6be717c0a 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -1,14 +1,7 @@ #include -#include #include -#include #include -#include -#include -#include -#include #include -#include #include #include @@ -16,359 +9,25 @@ #include #include #include -#include -#include "../src/clp/Defs.h" -#include "../src/clp/EncodedVariableInterpreter.hpp" #include "../src/clp/GrepCore.hpp" -#include "../src/clp/LogTypeDictionaryReaderReq.hpp" -#include "../src/clp/Query.hpp" -#include "../src/clp/string_utils/string_utils.hpp" -#include "../src/clp/VariableDictionaryReaderReq.hpp" +#include "search_test_utils.hpp" -using clp::EncodedVariableInterpreter; using clp::GrepCore; -using clp::logtype_dictionary_id_t; -using clp::LogTypeDictionaryReaderReq; -using clp::string_utils::wildcard_match_unsafe_case_sensitive; -using clp::SubQuery; -using clp::variable_dictionary_id_t; -using clp::VariableDictionaryReaderReq; using log_surgeon::lexers::ByteLexer; using log_surgeon::Schema; using log_surgeon::SchemaVarAST; using log_surgeon::SymbolId::TokenFloat; using log_surgeon::SymbolId::TokenInt; -using log_surgeon::wildcard_query_parser::QueryInterpretation; -using log_surgeon::wildcard_query_parser::VariableQueryToken; -using std::pair; -using std::set; using std::string; -using std::string_view; -using std::tuple; -using std::unordered_map; -using std::unordered_set; -using std::variant; +using std::pair; using std::vector; -using VarInfo = tuple>; - constexpr uint32_t cIntId{static_cast(TokenInt)}; constexpr uint32_t cFloatId{static_cast(TokenFloat)}; constexpr uint32_t cHasNumId{111}; -/** - * Helper to expose 
`GrepCore` functionality for unit-testing. - * - * This class provides static wrappers around `GrepCore` methods, allowing test - * code to access internal logic such as: - * - Finding wildcard encodable positions in a `QueryInterpretation`; - * - Generating logtype strings with wildcard masks; - * - Processing variable tokens with or without encoding; - * - Generating schema-based sub-queries. - * - * All methods forward directly to `GrepCore` and are intended for testing only. - */ -class clp::GrepCoreTest { -public: - static auto normalize_interpretations(set const& interpretations) - -> set { - return GrepCore::normalize_interpretations(interpretations); - } - - template < - LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, - VariableDictionaryReaderReq VariableDictionaryReaderType - > - static auto generate_schema_sub_queries( - set const& interpretations, - LogTypeDictionaryReaderType const& logtype_dict, - VariableDictionaryReaderType const& var_dict, - vector& sub_queries - ) -> void { - GrepCore::generate_schema_sub_queries( - interpretations, - logtype_dict, - var_dict, - false, - sub_queries - ); - } - - static auto get_wildcard_encodable_positions(QueryInterpretation const& interpretation) - -> vector { - return GrepCore::get_wildcard_encodable_positions(interpretation); - } - - static auto generate_logtype_string( - QueryInterpretation const& interpretation, - vector const& wildcard_encodable_positions, - vector const& mask_encoded_flags - ) -> string { - return GrepCore::generate_logtype_string( - interpretation, - wildcard_encodable_positions, - mask_encoded_flags - ); - } - - template - static auto process_token( - VariableQueryToken const& var_token, - VariableDictionaryReaderType const& var_dict, - SubQuery& sub_query - ) -> bool { - return GrepCore::process_schema_var_token(var_token, var_dict, false, false, sub_query); - } - - template - static auto process_encoded_token( - VariableQueryToken const& var_token, - 
VariableDictionaryReaderType const& var_dict, - SubQuery& sub_query - ) -> bool { - return GrepCore::process_schema_var_token(var_token, var_dict, false, true, sub_query); - } -}; - namespace { -/** - * Simple helper class representing a fake variable dictionary entry for unit tests. - * - * Adheres to `VariableDictionaryEntryReq`. - */ -class FakeVarEntry { -public: - explicit FakeVarEntry(variable_dictionary_id_t const id, string value) - : m_id{id}, - m_value{std::move(value)} {} - - [[nodiscard]] auto get_id() const -> variable_dictionary_id_t { return m_id; } - - [[nodiscard]] auto get_value() const -> string const& { return m_value; } - -private: - variable_dictionary_id_t m_id; - string m_value; -}; - -/** - * Simple helper class representing a fake variable dictionary for unit tests. - * - * Provides a method for adding entries and adheres to `VariableDictionaryReaderReq`. - */ -class FakeVarDict { -public: - using Entry = FakeVarEntry; - using dictionary_id_t = variable_dictionary_id_t; - - auto add_entry(dictionary_id_t const id, string value) -> void { - m_storage.emplace(id, Entry{id, std::move(value)}); - } - - [[nodiscard]] auto get_value(dictionary_id_t const id) const -> string const& { - static string const cEmpty{}; - auto const it{m_storage.find(id)}; - if (m_storage.end() != it) { - return it->second.get_value(); - } - return cEmpty; - } - - auto get_entry_matching_value(string_view const val, [[maybe_unused]] bool ignore_case) const - -> vector { - vector results; - for (auto const& [id, entry] : m_storage) { - if (val == entry.get_value()) { - results.push_back(&entry); - } - } - return results; - } - - auto get_entries_matching_wildcard_string( - string_view const val, - [[maybe_unused]] bool ignore_case, - unordered_set& results - ) const -> void { - for (auto const& [id, entry] : m_storage) { - if (wildcard_match_unsafe_case_sensitive(entry.get_value(), val)) { - results.insert(&entry); - } - } - } - -private: - unordered_map m_storage; -}; 
- -/** - * Simple helper class representing a fake logtype dictionary entry for unit tests. - * - * Adheres to `LogtypeDictionaryEntryReq`. - */ -class FakeLogTypeEntry { -public: - FakeLogTypeEntry(string value, logtype_dictionary_id_t const id) - : m_value(std::move(value)), - m_id(id) {} - - auto clear() -> void { m_value.clear(); } - - auto reserve_constant_length(size_t const length) -> void { m_value.reserve(length); } - - auto parse_next_var( - [[maybe_unused]] string_view msg, - [[maybe_unused]] size_t begin, - [[maybe_unused]] size_t end, - [[maybe_unused]] string_view& parsed - ) -> bool { - return false; - } - - auto add_constant(string_view const msg, size_t const begin_pos, size_t const length) -> void { - m_value.append(msg.substr(begin_pos, length)); - } - - auto add_int_var() -> void { EncodedVariableInterpreter::add_int_var(m_value); } - - auto add_float_var() -> void { EncodedVariableInterpreter::add_float_var(m_value); } - - auto add_dictionary_var() -> void { EncodedVariableInterpreter::add_dict_var(m_value); } - - [[nodiscard]] auto get_value() const -> string const& { return m_value; } - - [[nodiscard]] auto get_num_variables() const -> size_t { return 0; } - - [[nodiscard]] auto get_num_placeholders() const -> size_t { return 0; } - - [[nodiscard]] auto - get_placeholder_info([[maybe_unused]] size_t idx, [[maybe_unused]] auto& ref) const -> size_t { - return SIZE_MAX; - } - - [[nodiscard]] auto get_id() const -> logtype_dictionary_id_t { return m_id; } - -private: - string m_value; - logtype_dictionary_id_t m_id{0}; -}; - -/** - * Simple helper class representing a fake logtype dictionary for unit tests. - * - * Provides a method for adding entries and adheres to `LogtypeDictionaryReaderReq`. 
- */ -class FakeLogTypeDict { -public: - using Entry = FakeLogTypeEntry; - using dictionary_id_t = logtype_dictionary_id_t; - - auto add_entry(string const& value, dictionary_id_t id) -> void { - m_storage.emplace_back(value, id); - } - - auto - get_entry_matching_value(string_view const logtype, [[maybe_unused]] bool ignore_case) const - -> vector { - vector results; - for (auto const& entry : m_storage) { - if (logtype == entry.get_value()) { - results.push_back(&entry); - } - } - return results; - } - - auto get_entries_matching_wildcard_string( - string_view const logtype, - [[maybe_unused]] bool ignore_case, - unordered_set& results - ) const -> void { - for (auto const& entry : m_storage) { - if (wildcard_match_unsafe_case_sensitive(entry.get_value(), logtype)) { - results.insert(&entry); - } - } - } - -private: - vector m_storage; -}; - -/** - * @param entries Vector of (id, value) pairs to populate the variable - * dictionary. - * @return A `FakeVarDict` initialized with the given entries. - */ -auto make_var_dict(vector> const& entries) -> FakeVarDict; - -/** - * @param entries Vector of logtypes, where each logtype is represented by a vector of tokens. Each - * token is either a literal substring (`string_view`) or a variable placeholder (`char`). - * @return A `FakeLogtypeDict` initialized with the given entries. - */ -auto make_logtype_dict(vector>> const& entries) - -> FakeLogTypeDict; - -/** - * Constructs a `QueryInterpretation` from a vector of tokens. - * - * Each token is either: - * - a `string` representing a static substring, or - * - a `pair`, representing a variable placeholder and its value. - * - * This method automatically detects whether a variable token contains a - * wildcard (`*` or `?`). - * - * @param tokens Vector of tokens to populate the `QueryInterpretation`. - * @return A `QueryInterpretation` populated with the given tokens. 
- */ -auto make_query_interpretation(vector>> const& tokens) - -> QueryInterpretation; - -/** - * Generates a logtype string from a vector of tokens. - * - * Each token is either: - * - a literal substring (`string_view`) to append directly, or - * - a variable placeholder (`char`) indicating the type of variable: - * - `i` -> integer variable; - * - `f` -> float variable; - * - `d` -> dictionary variable. - * - * The function forwards variable tokens to `EncodedVariableInterpreter` to - * append their encoded representations to the resulting string. - * - * @param tokens Vector of tokens to convert into a logtype string. - * @return A `string` representing the expected encoded logtype. - */ -auto generate_expected_logtype_string(vector> const& tokens) -> string; - -/** - * Checks that a `SubQuery` at a given index matches the expected properties. - * - * This method verifies: - * - Whether wildcard matching is required; - * - The number and type of variables; - * - For dictionary variables, the precise or possible dictionary IDs; - * - The set of possible logtype IDs. - * - * @param id Index of the sub-query to check in `sub_queries`. - * @param sub_queries Vector of `SubQuery` objects. - * @param wildcard_match_required Expected wildcard match requirement. - * @param vars_info Vector of tuples describing expected variable properties: (`is_dict_var`, - * `is_precise_var`, `var_dict_ids`). - * @param logtype_ids Expected set of possible logtype IDs. - */ -auto check_sub_query( - size_t id, - vector const& sub_queries, - bool wildcard_match_required, - vector const& vars_info, - unordered_set const& logtype_ids -) -> void; - /** * Initializes a `ByteLexer` with space as a delimiter and the given `schema_rules`. 
* @@ -377,94 +36,6 @@ auto check_sub_query( */ auto make_test_lexer(vector const& schema_rules) -> ByteLexer; -auto make_var_dict(vector> const& entries) -> FakeVarDict { - FakeVarDict dict; - for (auto const& [id, val] : entries) { - dict.add_entry(id, val); - } - return dict; -} - -auto make_logtype_dict(vector>> const& entries) - -> FakeLogTypeDict { - FakeLogTypeDict dict; - logtype_dictionary_id_t id{0}; - for (auto const& entry : entries) { - dict.add_entry(generate_expected_logtype_string(entry), id++); - } - return dict; -} - -auto make_query_interpretation(vector>> const& tokens) - -> QueryInterpretation { - QueryInterpretation interp; - for (auto const& token : tokens) { - if (holds_alternative(token)) { - interp.append_static_token(get(token)); - } else { - auto const& [symbol, value]{get>(token)}; - auto const contains_wildcard{value.find_first_of("*?") != string::npos}; - interp.append_variable_token(symbol, value, contains_wildcard); - } - } - return interp; -} - -auto generate_expected_logtype_string(vector> const& tokens) -> string { - string result; - for (auto const& token : tokens) { - if (holds_alternative(token)) { - result.append(get(token)); - } else { - switch (get(token)) { - case 'i': - EncodedVariableInterpreter::add_int_var(result); - break; - case 'f': - EncodedVariableInterpreter::add_float_var(result); - break; - case 'd': - EncodedVariableInterpreter::add_dict_var(result); - break; - default: - break; - } - } - } - return result; -} - -auto check_sub_query( - size_t id, - vector const& sub_queries, - bool const wildcard_match_required, - vector const& vars_info, - unordered_set const& logtype_ids -) -> void { - CAPTURE(id); - auto const& sub_query{sub_queries[id]}; - - REQUIRE(wildcard_match_required == sub_query.wildcard_match_required()); - REQUIRE(vars_info.size() == sub_query.get_num_possible_vars()); - - for (size_t i{0}; i < vars_info.size(); ++i) { - auto const& [is_dict_var, is_precise_var, var_dict_ids]{vars_info[i]}; - auto 
const& var{sub_query.get_vars()[i]}; - REQUIRE(is_dict_var == var.is_dict_var()); - REQUIRE(is_precise_var == var.is_precise_var()); - if (is_dict_var) { - if (is_precise_var) { - REQUIRE(1 == var_dict_ids.size()); - REQUIRE(var_dict_ids.contains(var.get_var_dict_id())); - } else { - REQUIRE(var_dict_ids == var.get_possible_var_dict_ids()); - } - } - } - - REQUIRE(logtype_ids == sub_query.get_possible_logtypes()); -} - auto make_test_lexer(vector const& schema_rules) -> ByteLexer { ByteLexer lexer; lexer.m_symbol_id["int"] = cIntId; @@ -496,471 +67,7 @@ auto make_test_lexer(vector const& schema_rules) -> ByteLexer { lexer.generate(); return lexer; } -} // namespace - -// Tests: `get_wildcard_encodable_positions` -TEST_CASE("get_wildcard_encodable_positions_for_empty_interpretation", "[dfa_search]") { - QueryInterpretation const interpretation{}; - - auto const positions{clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation)}; - REQUIRE(positions.empty()); -} - -TEST_CASE("get_wildcard_encodable_positions_for_multi_variable_interpretation", "[dfa_search]") { - auto const interpretation{make_query_interpretation( - {"text", - pair{cIntId, "100"}, - pair{cFloatId, "32.2"}, - pair{cIntId, "10?"}, - pair{cFloatId, "3.14*"}, - pair{cHasNumId, "3.14*"}} - )}; - - auto const positions{clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation)}; - REQUIRE(2 == positions.size()); - REQUIRE(3 == positions[0]); - REQUIRE(4 == positions[1]); -} - -// Tests: `generate_logtype_string` -TEST_CASE("generate_logtype_string_for_empty_interpretation", "[dfa_search]") { - QueryInterpretation const interpretation{}; - - auto const wildcard_encodable_positions{ - clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) - }; - - REQUIRE(wildcard_encodable_positions.empty()); - auto const logtype_string{clp::GrepCoreTest::generate_logtype_string( - interpretation, - wildcard_encodable_positions, - {false} - )}; - REQUIRE(logtype_string.empty()); -} - 
-TEST_CASE("generate_logtype_string_for_single_variable_interpretation", "[dfa_search]") { - string expected_logtype_string; - EncodedVariableInterpreter::add_int_var(expected_logtype_string); - - QueryInterpretation interpretation{}; - interpretation.append_variable_token(static_cast(TokenInt), "100", false); - - auto const wildcard_encodable_positions{ - clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) - }; - - REQUIRE(wildcard_encodable_positions.empty()); - auto const logtype_string{clp::GrepCoreTest::generate_logtype_string( - interpretation, - wildcard_encodable_positions, - {false} - )}; - REQUIRE(expected_logtype_string == logtype_string); -} - -TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_search]") { - unordered_set const expected_logtype_strings{ - generate_expected_logtype_string({"text", 'i', 'f', 'd', 'd', 'd'}), - generate_expected_logtype_string({"text", 'i', 'f', 'i', 'd', 'd'}), - generate_expected_logtype_string({"text", 'i', 'f', 'd', 'f', 'd'}), - generate_expected_logtype_string({"text", 'i', 'f', 'i', 'f', 'd'}) - }; - - auto const interpretation{make_query_interpretation( - {"text", - pair{cIntId, "100"}, - pair{cFloatId, "32.2"}, - pair{cIntId, "10?"}, - pair{cFloatId, "3.14*"}, - pair{cHasNumId, "3.14*"}} - )}; - - auto const wildcard_encodable_positions{ - clp::GrepCoreTest::get_wildcard_encodable_positions(interpretation) - }; - - uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; - REQUIRE(num_combos == 4); - unordered_set logtype_strings; - for (uint64_t mask{0}; mask < num_combos; ++mask) { - vector mask_encoded_flags(interpretation.get_logtype().size(), false); - for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { - mask_encoded_flags[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; - } - logtype_strings.insert( - clp::GrepCoreTest::generate_logtype_string( - interpretation, - wildcard_encodable_positions, - mask_encoded_flags - ) - ); - } - 
REQUIRE(expected_logtype_strings == logtype_strings); -} - -// Tests: `process_schema_var_token` -TEST_CASE("process_schema_empty_token ", "[dfa_search]") { - FakeVarDict const var_dict{make_var_dict({pair{0, "100"}})}; - - SubQuery sub_query; - VariableQueryToken const empty_int_token{cIntId, "", false}; - REQUIRE(false == clp::GrepCoreTest::process_token(empty_int_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); - REQUIRE(0 == sub_query.get_num_possible_vars()); -} - -TEST_CASE("process_schema_unmatched_token ", "[dfa_search]") { - FakeVarDict const var_dict{make_var_dict({pair{0, "100"}})}; - - SubQuery sub_query; - VariableQueryToken const int_token{cIntId, "200", false}; - REQUIRE(clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); - REQUIRE(1 == sub_query.get_num_possible_vars()); - auto const& var{sub_query.get_vars()[0]}; - REQUIRE(false == var.is_dict_var()); - REQUIRE(var.is_precise_var()); - REQUIRE(var.get_possible_var_dict_ids().empty()); -} - -TEST_CASE("process_schema_int_token ", "[dfa_search]") { - FakeVarDict const var_dict{make_var_dict({pair{0, "100"}})}; - - SubQuery sub_query; - VariableQueryToken const int_token{cIntId, "100", false}; - REQUIRE(clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); - REQUIRE(1 == sub_query.get_num_possible_vars()); - auto const& var{sub_query.get_vars()[0]}; - REQUIRE(false == var.is_dict_var()); - REQUIRE(var.is_precise_var()); - REQUIRE(var.get_possible_var_dict_ids().empty()); -} - -TEST_CASE("process_schema_encoded_non_greedy_wildcard_token ", "[dfa_search]") { - FakeVarDict const var_dict{make_var_dict({pair{0, "10a0"}, pair{1, "10b0"}})}; - - SECTION("interpret_as_int") { - SubQuery sub_query; - VariableQueryToken const int_token{cIntId, "10?0", true}; - REQUIRE(clp::GrepCoreTest::process_encoded_token(int_token, var_dict, 
sub_query)); - REQUIRE(sub_query.wildcard_match_required()); - REQUIRE(0 == sub_query.get_num_possible_vars()); - } - - SECTION("interpret_as_float") { - SubQuery sub_query; - VariableQueryToken const float_token{cFloatId, "10?0", true}; - REQUIRE(clp::GrepCoreTest::process_encoded_token(float_token, var_dict, sub_query)); - REQUIRE(sub_query.wildcard_match_required()); - REQUIRE(0 == sub_query.get_num_possible_vars()); - } - - SECTION("interpret_as_precise_has_number") { - SubQuery sub_query; - VariableQueryToken const has_number_token{cHasNumId, "10a?", true}; - REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); - REQUIRE(1 == sub_query.get_num_possible_vars()); - auto const& var{sub_query.get_vars()[0]}; - REQUIRE(var.is_dict_var()); - REQUIRE(var.is_precise_var()); - REQUIRE(0 == var.get_var_dict_id()); - REQUIRE(var.get_possible_var_dict_ids().empty()); - } - - SECTION("interpret_as_imprecise_has_number") { - SubQuery sub_query; - VariableQueryToken const has_number_token{cHasNumId, "10?0", true}; - REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); - REQUIRE(1 == sub_query.get_num_possible_vars()); - auto const& var{sub_query.get_vars()[0]}; - REQUIRE(var.is_dict_var()); - REQUIRE(false == var.is_precise_var()); - REQUIRE(2 == var.get_possible_var_dict_ids().size()); - for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { - REQUIRE(var.get_possible_var_dict_ids().contains(i)); - } - } -} - -// NOTE: CLP currently treats all non-encoded variables as the same, so the below test demonstrates -// this. In the future if CLP is more sophisticated, the two sections behave differently. 
-TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token ", "[dfa_search]") { - size_t id{0}; - FakeVarDict const var_dict{make_var_dict( - {pair{id++, "100000000000000000000000010"}, - pair{id++, "100000000000000000000000020"}, - pair{id++, "100000000000000000000000030"}, - pair{id++, "1000000000000000000000000.0"}, - pair{id++, "1000000000000000000000000a0"}} - )}; - - SECTION("interpret_as_int") { - SubQuery sub_query; - VariableQueryToken const int_token{cIntId, "1000000000000000000000000?0", true}; - REQUIRE(clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); - REQUIRE(1 == sub_query.get_num_possible_vars()); - auto const& var{sub_query.get_vars()[0]}; - REQUIRE(var.is_dict_var()); - REQUIRE(false == var.is_precise_var()); - REQUIRE(5 == var.get_possible_var_dict_ids().size()); - for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { - REQUIRE(var.get_possible_var_dict_ids().contains(i)); - } - } - - SECTION("interpret_as_float") { - SubQuery sub_query; - VariableQueryToken const float_token{cFloatId, "1000000000000000000000000?0", true}; - REQUIRE(clp::GrepCoreTest::process_token(float_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); - REQUIRE(1 == sub_query.get_num_possible_vars()); - auto const& var{sub_query.get_vars()[0]}; - REQUIRE(var.is_dict_var()); - REQUIRE(false == var.is_precise_var()); - REQUIRE(5 == var.get_possible_var_dict_ids().size()); - for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { - REQUIRE(var.get_possible_var_dict_ids().contains(i)); - } - } - - SECTION("interpret_as_has_number") { - SubQuery sub_query; - VariableQueryToken const has_number_token{cHasNumId, "1000000000000000000000000?0", true}; - REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); - REQUIRE(1 == sub_query.get_num_possible_vars()); - 
auto const& var{sub_query.get_vars()[0]}; - REQUIRE(var.is_dict_var()); - REQUIRE(false == var.is_precise_var()); - REQUIRE(5 == var.get_possible_var_dict_ids().size()); - for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { - REQUIRE(var.get_possible_var_dict_ids().contains(i)); - } - } -} - -TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { - size_t id{0}; - FakeVarDict const var_dict{make_var_dict( - {pair{id++, "10a0"}, - pair{id++, "10b0"}, - pair{id++, "100000000000000000000000010"}, - pair{id++, "100000000000000000000000020"}, - pair{id++, "100000000000000000000000030"}, - pair{id++, "1000000000000000000000000.0"}, - pair{id++, "1000000000000000000000000a0"}} - )}; - - SECTION("interpret_as_non_encoded_int") { - SubQuery sub_query; - VariableQueryToken const int_token{cIntId, "10*0", true}; - REQUIRE(clp::GrepCoreTest::process_token(int_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); - REQUIRE(1 == sub_query.get_num_possible_vars()); - auto const& var{sub_query.get_vars()[0]}; - REQUIRE(var.is_dict_var()); - REQUIRE(false == var.is_precise_var()); - REQUIRE(7 == var.get_possible_var_dict_ids().size()); - for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { - REQUIRE(var.get_possible_var_dict_ids().contains(i)); - } - } - - SECTION("interpret_as_non_encoded_float") { - SubQuery sub_query; - VariableQueryToken const float_token{cFloatId, "10*0", true}; - REQUIRE(clp::GrepCoreTest::process_token(float_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); - REQUIRE(1 == sub_query.get_num_possible_vars()); - auto const& var{sub_query.get_vars()[0]}; - REQUIRE(var.is_dict_var()); - REQUIRE(false == var.is_precise_var()); - REQUIRE(7 == var.get_possible_var_dict_ids().size()); - for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { - REQUIRE(var.get_possible_var_dict_ids().contains(i)); - } - } - - 
SECTION("interpret_as_non_encoded_imprecise_has_number") { - SubQuery sub_query; - VariableQueryToken const has_number_token{cHasNumId, "10*0", true}; - REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); - REQUIRE(1 == sub_query.get_num_possible_vars()); - auto const& var{sub_query.get_vars()[0]}; - REQUIRE(var.is_dict_var()); - REQUIRE(false == var.is_precise_var()); - REQUIRE(7 == var.get_possible_var_dict_ids().size()); - for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { - REQUIRE(var.get_possible_var_dict_ids().contains(i)); - } - } - - SECTION("interpret_as_non_encoded_precise_has_number") { - SubQuery sub_query; - VariableQueryToken const has_number_token{cHasNumId, "10b*", true}; - REQUIRE(clp::GrepCoreTest::process_token(has_number_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); - REQUIRE(1 == sub_query.get_num_possible_vars()); - auto const& var{sub_query.get_vars()[0]}; - REQUIRE(var.is_dict_var()); - REQUIRE(var.is_precise_var()); - REQUIRE(1 == var.get_var_dict_id()); - REQUIRE(var.get_possible_var_dict_ids().empty()); - } - - SECTION("interpret_as_encoded_int") { - SubQuery sub_query; - VariableQueryToken const int_token{cIntId, "10*0", true}; - REQUIRE(clp::GrepCoreTest::process_encoded_token(int_token, var_dict, sub_query)); - REQUIRE(sub_query.wildcard_match_required()); - REQUIRE(0 == sub_query.get_num_possible_vars()); - } - - SECTION("interpret_as_encoded_float") { - SubQuery sub_query; - VariableQueryToken const float_token{cFloatId, "10*0", true}; - REQUIRE(clp::GrepCoreTest::process_encoded_token(float_token, var_dict, sub_query)); - REQUIRE(sub_query.wildcard_match_required()); - REQUIRE(0 == sub_query.get_num_possible_vars()); - } -} - -// Tests: `generate_schema_sub_queries` -TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { - FakeVarDict const var_dict{make_var_dict({pair{0, "1a3"}, 
pair{1, "10a"}, pair{2, "10b"}})}; - FakeLogTypeDict const logtype_dict{make_logtype_dict( - {{"text ", 'i', " ", 'i', " ", 'f'}, - {"text ", 'i', " ", 'd', " ", 'f'}, - {"text ", 'i', " ", 'd', " 3.14ab$"}, - {"text ", 'i', " ", 'd', " 3.14abc$"}, - {"text ", 'i', " ", 'd', " 3.15ab$"}, - {"text ", 'i', " 10$ ", 'f'}} - )}; - - using V = pair; - vector>> raw_interpretations{ - {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cFloatId, " 3.14*"}}, - {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cHasNumId, "3.14*"}}, - {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " 3.14*"}, - {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cFloatId, " 3.14*"}}, - {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cHasNumId, "3.14*"}}, - {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " 3.14*"}, - {"text ", V{cIntId, "100"}, " 10? ", V{cFloatId, " 3.14*"}}, - {"text ", V{cIntId, "100"}, " 10? ", V{cHasNumId, "3.14*"}}, - {"text ", V{cIntId, "100"}, " 10? 
3.14*"} - }; - set interpretations; - for (auto const& raw_interpretation : raw_interpretations) { - interpretations.insert(make_query_interpretation(raw_interpretation)); - } - - vector sub_queries; - clp::GrepCoreTest::generate_schema_sub_queries( - interpretations, - logtype_dict, - var_dict, - sub_queries - ); - - VarInfo const wild_int{false, true, {}}; - VarInfo const wild_has_num{true, false, {1LL, 2LL}}; - REQUIRE(4 == sub_queries.size()); - size_t i{0}; - check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); - check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); - check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); - check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); -} - -TEST_CASE("generate_schema_sub_queries_with_wildcard_duplication", "[dfa_search]") { - FakeVarDict const var_dict{make_var_dict({pair{0, "1a3"}, pair{1, "10a"}})}; - FakeLogTypeDict const logtype_dict{make_logtype_dict( - {{"text ", 'i', " ", 'i', " ", 'f'}, - {"text ", 'i', " ", 'd', " ", 'f'}, - {"text ", 'i', " ", 'd', " 3.14ab$"}, - {"text ", 'i', " ", 'd', " 3.14abc$"}, - {"text ", 'i', " ", 'd', " 3.15ab$"}, - {"text ", 'i', " 10$ ", 'f'}} - )}; - - using V = pair; - vector>> raw_interpretations{ - {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cFloatId, " 3.14*"}, "*"}, - {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cHasNumId, "3.14*"}, "*"}, - {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " 3.14**"}, - {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cFloatId, " 3.14*"}, "*"}, - {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cHasNumId, "3.14*"}, "*"}, - {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " 3.14**"}, - {"text ", V{cIntId, "100"}, " 10? ", V{cFloatId, " 3.14*"}, "*"}, - {"text ", V{cIntId, "100"}, " 10? ", V{cHasNumId, "3.14*"}, "*"}, - {"text ", V{cIntId, "100"}, " 10? 
3.14**"} - }; - set interpretations; - for (auto const& raw_interpretation : raw_interpretations) { - interpretations.insert(make_query_interpretation(raw_interpretation)); - } - auto const normalized_interpretations{ - clp::GrepCoreTest::normalize_interpretations(interpretations) - }; - - vector sub_queries; - clp::GrepCoreTest::generate_schema_sub_queries( - normalized_interpretations, - logtype_dict, - var_dict, - sub_queries - ); - - VarInfo const wild_int{false, true, {}}; - VarInfo const wild_has_num{true, true, {1LL}}; - REQUIRE(4 == sub_queries.size()); - size_t i{0}; - check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); - check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); - check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); - check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); -} - -// Tests: `process_raw_query` -TEST_CASE("process_raw_query", "[dfa_search]") { - auto lexer{make_test_lexer( - {{R"(int:(\d+))"}, {R"(float:(\d+\.\d+))"}, {R"(hasNumber:[^ $]*\d+[^ $]*)"}} - )}; - - FakeVarDict const var_dict{make_var_dict({pair{0, "1a3"}, pair{1, "10a"}})}; - FakeLogTypeDict const logtype_dict{make_logtype_dict( - {{"text ", 'i', " ", 'i', " ", 'f'}, - {"text ", 'i', " ", 'd', " ", 'f'}, - {"text ", 'i', " ", 'd', " 3.14ab$"}, - {"text ", 'i', " ", 'd', " 3.14abc$"}, - {"text ", 'i', " ", 'd', " 3.15ab$"}, - {"text ", 'i', " 10$ ", 'f'}} - )}; - - string const raw_query{"text 100 10? 
3.14*"}; - - auto const query{ - GrepCore::process_raw_query(logtype_dict, var_dict, raw_query, 0, 0, true, lexer, false) - }; - - REQUIRE(query.has_value()); - auto const& sub_queries{query.value().get_sub_queries()}; - - VarInfo const wild_int{false, true, {}}; - VarInfo const wild_has_num{true, true, {1LL}}; - REQUIRE(4 == sub_queries.size()); - size_t i{0}; - check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); - check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); - check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); - check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); -} +} // namespace // Tests: `get_bounds_of_next_potential_var` TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var]") { @@ -1054,3 +161,37 @@ TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var REQUIRE(GrepCore::get_bounds_of_next_potential_var(str, begin_pos, end_pos, is_var) == false); } + +TEST_CASE("process_raw_query", "[dfa_search]") { + auto lexer{make_test_lexer( + {{R"(int:(\d+))"}, {R"(float:(\d+\.\d+))"}, {R"(hasNumber:[^ $]*\d+[^ $]*)"}} + )}; + + MockVarDictionary const var_dict{make_var_dict({pair{0, "1a3"}, pair{1, "10a"}})}; + MockLogTypeDictionary const logtype_dict{make_logtype_dict( + {{"text ", 'i', " ", 'i', " ", 'f'}, + {"text ", 'i', " ", 'd', " ", 'f'}, + {"text ", 'i', " ", 'd', " 3.14ab$"}, + {"text ", 'i', " ", 'd', " 3.14abc$"}, + {"text ", 'i', " ", 'd', " 3.15ab$"}, + {"text ", 'i', " 10$ ", 'f'}} + )}; + + string const raw_query{"text 100 10? 
3.14*"}; + + auto const query{ + GrepCore::process_raw_query(logtype_dict, var_dict, raw_query, 0, 0, true, lexer, false) + }; + + REQUIRE(query.has_value()); + auto const& sub_queries{query.value().get_sub_queries()}; + + VarInfo const wild_int{false, true, {}}; + VarInfo const wild_has_num{true, true, {1LL}}; + REQUIRE(4 == sub_queries.size()); + size_t i{0}; + check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); + check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); +} diff --git a/components/core/tests/test-SchemaSearcher.cpp b/components/core/tests/test-SchemaSearcher.cpp new file mode 100644 index 0000000000..d4096da568 --- /dev/null +++ b/components/core/tests/test-SchemaSearcher.cpp @@ -0,0 +1,578 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "../src/clp/LogTypeDictionaryReaderReq.hpp" +#include "../src/clp/Query.hpp" +#include "../src/clp/SchemaSearcher.hpp" +#include "../src/clp/VariableDictionaryReaderReq.hpp" +#include "search_test_utils.hpp" + +using clp::LogTypeDictionaryReaderReq; +using clp::SubQuery; +using clp::VariableDictionaryReaderReq; +using log_surgeon::SymbolId::TokenFloat; +using log_surgeon::SymbolId::TokenInt; +using log_surgeon::wildcard_query_parser::QueryInterpretation; +using log_surgeon::wildcard_query_parser::VariableQueryToken; +using std::pair; +using std::set; +using std::string; +using std::string_view; +using std::unordered_set; +using std::variant; +using std::vector; + +constexpr uint32_t cIntId{static_cast(TokenInt)}; +constexpr uint32_t cFloatId{static_cast(TokenFloat)}; +constexpr uint32_t cHasNumId{111}; + +/** + * Helper to expose `SchemaSearcher` functionality for unit-testing. 
+ * + * This class provides static wrappers around `SchemaSearcher` methods, allowing test code to access + * internal logic such as: + * - Finding wildcard encodable positions in a `QueryInterpretation`; + * - Generating logtype strings with wildcard masks; + * - Processing variable tokens with or without encoding; + * - Generating schema-based sub-queries. + * + * All methods forward directly to `SchemaSearcher` and are intended for testing only. + */ +class clp::SchemaSearcherTest { +public: + static auto normalize_interpretations(set const& interpretations) + -> set { + return SchemaSearcher::normalize_interpretations(interpretations); + } + + template < + LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, + VariableDictionaryReaderReq VariableDictionaryReaderType + > + static auto generate_schema_sub_queries( + set const& interpretations, + LogTypeDictionaryReaderType const& logtype_dict, + VariableDictionaryReaderType const& var_dict + ) -> vector { + return SchemaSearcher::generate_schema_sub_queries( + interpretations, + logtype_dict, + var_dict, + false + ); + } + + static auto get_wildcard_encodable_positions(QueryInterpretation const& interpretation) + -> vector { + return SchemaSearcher::get_wildcard_encodable_positions(interpretation); + } + + static auto generate_logtype_string( + QueryInterpretation const& interpretation, + vector const& wildcard_encodable_positions, + vector const& mask_encoded_flags + ) -> string { + return SchemaSearcher::generate_logtype_string( + interpretation, + wildcard_encodable_positions, + mask_encoded_flags + ); + } + + template + static auto process_token( + VariableQueryToken const& var_token, + VariableDictionaryReaderType const& var_dict, + SubQuery& sub_query + ) -> bool { + return SchemaSearcher::process_schema_var_token( + var_token, + var_dict, + false, + false, + sub_query + ); + } + + template + static auto process_encoded_token( + VariableQueryToken const& var_token, + VariableDictionaryReaderType const& 
var_dict, + SubQuery& sub_query + ) -> bool { + return SchemaSearcher::process_schema_var_token( + var_token, + var_dict, + false, + true, + sub_query + ); + } +}; + +namespace { +/** + * Constructs a `QueryInterpretation` from a vector of tokens. + * + * Each token is either: + * - a `string` representing a static substring, or + * - a `pair`, representing a variable placeholder and its value. + * + * This method automatically detects whether a variable token contains a + * wildcard (`*` or `?`). + * + * @param tokens Vector of tokens to populate the `QueryInterpretation`. + * @return A `QueryInterpretation` populated with the given tokens. + */ +auto make_query_interpretation(vector>> const& tokens) + -> QueryInterpretation; + +auto make_query_interpretation(vector>> const& tokens) + -> QueryInterpretation { + QueryInterpretation interp; + for (auto const& token : tokens) { + if (holds_alternative(token)) { + interp.append_static_token(get(token)); + } else { + auto const& [symbol, value]{get>(token)}; + auto const contains_wildcard{value.find_first_of("*?") != string::npos}; + interp.append_variable_token(symbol, value, contains_wildcard); + } + } + return interp; +} +} // namespace + +// Tests: `get_wildcard_encodable_positions` +TEST_CASE("get_wildcard_encodable_positions_for_empty_interpretation", "[dfa_search]") { + QueryInterpretation const interpretation{}; + + auto const positions{clp::SchemaSearcherTest::get_wildcard_encodable_positions(interpretation)}; + REQUIRE(positions.empty()); +} + +TEST_CASE("get_wildcard_encodable_positions_for_multi_variable_interpretation", "[dfa_search]") { + auto const interpretation{make_query_interpretation( + {"text", + pair{cIntId, "100"}, + pair{cFloatId, "32.2"}, + pair{cIntId, "10?"}, + pair{cFloatId, "3.14*"}, + pair{cHasNumId, "3.14*"}} + )}; + + auto const positions{clp::SchemaSearcherTest::get_wildcard_encodable_positions(interpretation)}; + REQUIRE(2 == positions.size()); + REQUIRE(3 == positions[0]); + REQUIRE(4 
== positions[1]); +} + +TEST_CASE("generate_logtype_string_for_empty_interpretation", "[dfa_search]") { + QueryInterpretation const interpretation{}; + + auto const wildcard_encodable_positions{ + clp::SchemaSearcherTest::get_wildcard_encodable_positions(interpretation) + }; + + REQUIRE(wildcard_encodable_positions.empty()); + auto const logtype_string{clp::SchemaSearcherTest::generate_logtype_string( + interpretation, + wildcard_encodable_positions, + {false} + )}; + REQUIRE(logtype_string.empty()); +} + +TEST_CASE("generate_logtype_string_for_single_variable_interpretation", "[dfa_search]") { + auto const expected_logtype_string{generate_expected_logtype_string({'i'})}; + + auto const interpretation{make_query_interpretation({pair{cIntId, "100"}})}; + + auto const wildcard_encodable_positions{ + clp::SchemaSearcherTest::get_wildcard_encodable_positions(interpretation) + }; + + REQUIRE(wildcard_encodable_positions.empty()); + auto const logtype_string{clp::SchemaSearcherTest::generate_logtype_string( + interpretation, + wildcard_encodable_positions, + {false} + )}; + REQUIRE(expected_logtype_string == logtype_string); +} + +TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_search]") { + unordered_set const expected_logtype_strings{ + generate_expected_logtype_string({"text", 'i', 'f', 'd', 'd', 'd'}), + generate_expected_logtype_string({"text", 'i', 'f', 'i', 'd', 'd'}), + generate_expected_logtype_string({"text", 'i', 'f', 'd', 'f', 'd'}), + generate_expected_logtype_string({"text", 'i', 'f', 'i', 'f', 'd'}) + }; + + auto const interpretation{make_query_interpretation( + {"text", + pair{cIntId, "100"}, + pair{cFloatId, "32.2"}, + pair{cIntId, "10?"}, + pair{cFloatId, "3.14*"}, + pair{cHasNumId, "3.14*"}} + )}; + + auto const wildcard_encodable_positions{ + clp::SchemaSearcherTest::get_wildcard_encodable_positions(interpretation) + }; + + uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; + REQUIRE(num_combos == 4); 
+ unordered_set logtype_strings; + for (uint64_t mask{0}; mask < num_combos; ++mask) { + vector mask_encoded_flags(interpretation.get_logtype().size(), false); + for (size_t i{0}; i < wildcard_encodable_positions.size(); ++i) { + mask_encoded_flags[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; + } + logtype_strings.insert( + clp::SchemaSearcherTest::generate_logtype_string( + interpretation, + wildcard_encodable_positions, + mask_encoded_flags + ) + ); + } + REQUIRE(expected_logtype_strings == logtype_strings); +} + +TEST_CASE("process_schema_empty_token ", "[dfa_search]") { + MockVarDictionary const var_dict{make_var_dict({pair{0, "100"}})}; + + SubQuery sub_query; + VariableQueryToken const empty_int_token{cIntId, "", false}; + REQUIRE(false == clp::SchemaSearcherTest::process_token(empty_int_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); +} + +TEST_CASE("process_schema_unmatched_token ", "[dfa_search]") { + MockVarDictionary const var_dict{make_var_dict({pair{0, "100"}})}; + + SubQuery sub_query; + VariableQueryToken const int_token{cIntId, "200", false}; + REQUIRE(clp::SchemaSearcherTest::process_token(int_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(false == var.is_dict_var()); + REQUIRE(var.is_precise_var()); + REQUIRE(var.get_possible_var_dict_ids().empty()); +} + +TEST_CASE("process_schema_int_token ", "[dfa_search]") { + MockVarDictionary const var_dict{make_var_dict({pair{0, "100"}})}; + + SubQuery sub_query; + VariableQueryToken const int_token{cIntId, "100", false}; + REQUIRE(clp::SchemaSearcherTest::process_token(int_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + 
REQUIRE(false == var.is_dict_var()); + REQUIRE(var.is_precise_var()); + REQUIRE(var.get_possible_var_dict_ids().empty()); +} + +TEST_CASE("process_schema_encoded_non_greedy_wildcard_token ", "[dfa_search]") { + MockVarDictionary const var_dict{make_var_dict({pair{0, "10a0"}, pair{1, "10b0"}})}; + + SECTION("interpret_as_int") { + SubQuery sub_query; + VariableQueryToken const int_token{cIntId, "10?0", true}; + REQUIRE(clp::SchemaSearcherTest::process_encoded_token(int_token, var_dict, sub_query)); + REQUIRE(sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); + } + + SECTION("interpret_as_float") { + SubQuery sub_query; + VariableQueryToken const float_token{cFloatId, "10?0", true}; + REQUIRE(clp::SchemaSearcherTest::process_encoded_token(float_token, var_dict, sub_query)); + REQUIRE(sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); + } + + SECTION("interpret_as_precise_has_number") { + SubQuery sub_query; + VariableQueryToken const has_number_token{cHasNumId, "10a?", true}; + REQUIRE(clp::SchemaSearcherTest::process_token(has_number_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(var.is_precise_var()); + REQUIRE(0 == var.get_var_dict_id()); + REQUIRE(var.get_possible_var_dict_ids().empty()); + } + + SECTION("interpret_as_imprecise_has_number") { + SubQuery sub_query; + VariableQueryToken const has_number_token{cHasNumId, "10?0", true}; + REQUIRE(clp::SchemaSearcherTest::process_token(has_number_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(2 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i 
< var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } +} + +// NOTE: CLP currently treats all non-encoded variables as the same, so the below test demonstrates +// this. In the future if CLP is more sophisticated, the two sections behave differently. +TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token ", "[dfa_search]") { + size_t id{0}; + MockVarDictionary const var_dict{make_var_dict( + {pair{id++, "100000000000000000000000010"}, + pair{id++, "100000000000000000000000020"}, + pair{id++, "100000000000000000000000030"}, + pair{id++, "1000000000000000000000000.0"}, + pair{id++, "1000000000000000000000000a0"}} + )}; + + SECTION("interpret_as_int") { + SubQuery sub_query; + VariableQueryToken const int_token{cIntId, "1000000000000000000000000?0", true}; + REQUIRE(clp::SchemaSearcherTest::process_token(int_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(5 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } + + SECTION("interpret_as_float") { + SubQuery sub_query; + VariableQueryToken const float_token{cFloatId, "1000000000000000000000000?0", true}; + REQUIRE(clp::SchemaSearcherTest::process_token(float_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(5 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } + + 
SECTION("interpret_as_has_number") { + SubQuery sub_query; + VariableQueryToken const has_number_token{cHasNumId, "1000000000000000000000000?0", true}; + REQUIRE(clp::SchemaSearcherTest::process_token(has_number_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(5 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } +} + +TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { + size_t id{0}; + MockVarDictionary const var_dict{make_var_dict( + {pair{id++, "10a0"}, + pair{id++, "10b0"}, + pair{id++, "100000000000000000000000010"}, + pair{id++, "100000000000000000000000020"}, + pair{id++, "100000000000000000000000030"}, + pair{id++, "1000000000000000000000000.0"}, + pair{id++, "1000000000000000000000000a0"}} + )}; + + SECTION("interpret_as_non_encoded_int") { + SubQuery sub_query; + VariableQueryToken const int_token{cIntId, "10*0", true}; + REQUIRE(clp::SchemaSearcherTest::process_token(int_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(7 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } + + SECTION("interpret_as_non_encoded_float") { + SubQuery sub_query; + VariableQueryToken const float_token{cFloatId, "10*0", true}; + REQUIRE(clp::SchemaSearcherTest::process_token(float_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == 
sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(7 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } + + SECTION("interpret_as_non_encoded_imprecise_has_number") { + SubQuery sub_query; + VariableQueryToken const has_number_token{cHasNumId, "10*0", true}; + REQUIRE(clp::SchemaSearcherTest::process_token(has_number_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(false == var.is_precise_var()); + REQUIRE(7 == var.get_possible_var_dict_ids().size()); + for (size_t i{0}; i < var.get_possible_var_dict_ids().size(); ++i) { + REQUIRE(var.get_possible_var_dict_ids().contains(i)); + } + } + + SECTION("interpret_as_non_encoded_precise_has_number") { + SubQuery sub_query; + VariableQueryToken const has_number_token{cHasNumId, "10b*", true}; + REQUIRE(clp::SchemaSearcherTest::process_token(has_number_token, var_dict, sub_query)); + REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(1 == sub_query.get_num_possible_vars()); + auto const& var{sub_query.get_vars()[0]}; + REQUIRE(var.is_dict_var()); + REQUIRE(var.is_precise_var()); + REQUIRE(1 == var.get_var_dict_id()); + REQUIRE(var.get_possible_var_dict_ids().empty()); + } + + SECTION("interpret_as_encoded_int") { + SubQuery sub_query; + VariableQueryToken const int_token{cIntId, "10*0", true}; + REQUIRE(clp::SchemaSearcherTest::process_encoded_token(int_token, var_dict, sub_query)); + REQUIRE(sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); + } + + SECTION("interpret_as_encoded_float") { + SubQuery sub_query; + VariableQueryToken const float_token{cFloatId, "10*0", 
true}; + REQUIRE(clp::SchemaSearcherTest::process_encoded_token(float_token, var_dict, sub_query)); + REQUIRE(sub_query.wildcard_match_required()); + REQUIRE(0 == sub_query.get_num_possible_vars()); + } +} + +// Tests: `generate_schema_sub_queries` +TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { + MockVarDictionary const var_dict{make_var_dict({ + pair{0, "1a3"}, + pair{1, "10a"}, + pair{2, "10b"} + })}; + MockLogTypeDictionary const logtype_dict{make_logtype_dict( + {{"text ", 'i', " ", 'i', " ", 'f'}, + {"text ", 'i', " ", 'd', " ", 'f'}, + {"text ", 'i', " ", 'd', " 3.14ab$"}, + {"text ", 'i', " ", 'd', " 3.14abc$"}, + {"text ", 'i', " ", 'd', " 3.15ab$"}, + {"text ", 'i', " 10$ ", 'f'}} + )}; + + using V = pair; + vector>> raw_interpretations{ + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cFloatId, " 3.14*"}}, + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cHasNumId, "3.14*"}}, + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " 3.14*"}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cFloatId, " 3.14*"}}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cHasNumId, "3.14*"}}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " 3.14*"}, + {"text ", V{cIntId, "100"}, " 10? ", V{cFloatId, " 3.14*"}}, + {"text ", V{cIntId, "100"}, " 10? ", V{cHasNumId, "3.14*"}}, + {"text ", V{cIntId, "100"}, " 10? 
3.14*"} + }; + set interpretations; + for (auto const& raw_interpretation : raw_interpretations) { + interpretations.insert(make_query_interpretation(raw_interpretation)); + } + + auto const sub_queries{clp::SchemaSearcherTest::generate_schema_sub_queries( + interpretations, + logtype_dict, + var_dict + )}; + + VarInfo const wild_int{false, true, {}}; + VarInfo const wild_has_num{true, false, {1LL, 2LL}}; + REQUIRE(4 == sub_queries.size()); + size_t i{0}; + check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); + check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); +} + +TEST_CASE("generate_schema_sub_queries_with_wildcard_duplication", "[dfa_search]") { + MockVarDictionary const var_dict{make_var_dict({pair{0, "1a3"}, pair{1, "10a"}})}; + MockLogTypeDictionary const logtype_dict{make_logtype_dict( + {{"text ", 'i', " ", 'i', " ", 'f'}, + {"text ", 'i', " ", 'd', " ", 'f'}, + {"text ", 'i', " ", 'd', " 3.14ab$"}, + {"text ", 'i', " ", 'd', " 3.14abc$"}, + {"text ", 'i', " ", 'd', " 3.15ab$"}, + {"text ", 'i', " 10$ ", 'f'}} + )}; + + using V = pair; + vector>> raw_interpretations{ + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cFloatId, " 3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " ", V{cHasNumId, "3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " ", V{cIntId, "10?"}, " 3.14**"}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cFloatId, " 3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " ", V{cHasNumId, "3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " ", V{cHasNumId, "10?"}, " 3.14**"}, + {"text ", V{cIntId, "100"}, " 10? ", V{cFloatId, " 3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " 10? ", V{cHasNumId, "3.14*"}, "*"}, + {"text ", V{cIntId, "100"}, " 10? 
3.14**"} + }; + set interpretations; + for (auto const& raw_interpretation : raw_interpretations) { + interpretations.insert(make_query_interpretation(raw_interpretation)); + } + auto const normalized_interpretations{ + clp::SchemaSearcherTest::normalize_interpretations(interpretations) + }; + + auto const sub_queries{clp::SchemaSearcherTest::generate_schema_sub_queries( + normalized_interpretations, + logtype_dict, + var_dict + )}; + + VarInfo const wild_int{false, true, {}}; + VarInfo const wild_has_num{true, true, {1LL}}; + REQUIRE(4 == sub_queries.size()); + size_t i{0}; + check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); + check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); + check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); +} From 134c2a4a3cd60638a9de6aa3442397ca0f7cc648 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 8 Dec 2025 20:49:52 -0500 Subject: [PATCH 051/164] Remove comments. 
--- components/core/tests/test-GrepCore.cpp | 1 - components/core/tests/test-SchemaSearcher.cpp | 2 -- 2 files changed, 3 deletions(-) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index f6be717c0a..2dc96441b4 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -69,7 +69,6 @@ auto make_test_lexer(vector const& schema_rules) -> ByteLexer { } } // namespace -// Tests: `get_bounds_of_next_potential_var` TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var]") { string str; size_t begin_pos{}; diff --git a/components/core/tests/test-SchemaSearcher.cpp b/components/core/tests/test-SchemaSearcher.cpp index d4096da568..30eec6e96e 100644 --- a/components/core/tests/test-SchemaSearcher.cpp +++ b/components/core/tests/test-SchemaSearcher.cpp @@ -152,7 +152,6 @@ auto make_query_interpretation(vector>> c } } // namespace -// Tests: `get_wildcard_encodable_positions` TEST_CASE("get_wildcard_encodable_positions_for_empty_interpretation", "[dfa_search]") { QueryInterpretation const interpretation{}; @@ -481,7 +480,6 @@ TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { } } -// Tests: `generate_schema_sub_queries` TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { MockVarDictionary const var_dict{make_var_dict({ pair{0, "1a3"}, From 763f8bec0db707bb5f098c67aa3571d1535c549a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 8 Dec 2025 20:55:45 -0500 Subject: [PATCH 052/164] Fix test names. 
--- components/core/tests/test-SchemaSearcher.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/components/core/tests/test-SchemaSearcher.cpp b/components/core/tests/test-SchemaSearcher.cpp index 30eec6e96e..c0c04cc066 100644 --- a/components/core/tests/test-SchemaSearcher.cpp +++ b/components/core/tests/test-SchemaSearcher.cpp @@ -249,7 +249,7 @@ TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_sea REQUIRE(expected_logtype_strings == logtype_strings); } -TEST_CASE("process_schema_empty_token ", "[dfa_search]") { +TEST_CASE("process_schema_empty_token", "[dfa_search]") { MockVarDictionary const var_dict{make_var_dict({pair{0, "100"}})}; SubQuery sub_query; @@ -259,7 +259,7 @@ TEST_CASE("process_schema_empty_token ", "[dfa_search]") { REQUIRE(0 == sub_query.get_num_possible_vars()); } -TEST_CASE("process_schema_unmatched_token ", "[dfa_search]") { +TEST_CASE("process_schema_unmatched_token", "[dfa_search]") { MockVarDictionary const var_dict{make_var_dict({pair{0, "100"}})}; SubQuery sub_query; @@ -273,7 +273,7 @@ TEST_CASE("process_schema_unmatched_token ", "[dfa_search]") { REQUIRE(var.get_possible_var_dict_ids().empty()); } -TEST_CASE("process_schema_int_token ", "[dfa_search]") { +TEST_CASE("process_schema_int_token", "[dfa_search]") { MockVarDictionary const var_dict{make_var_dict({pair{0, "100"}})}; SubQuery sub_query; @@ -287,7 +287,7 @@ TEST_CASE("process_schema_int_token ", "[dfa_search]") { REQUIRE(var.get_possible_var_dict_ids().empty()); } -TEST_CASE("process_schema_encoded_non_greedy_wildcard_token ", "[dfa_search]") { +TEST_CASE("process_schema_encoded_non_greedy_wildcard_token", "[dfa_search]") { MockVarDictionary const var_dict{make_var_dict({pair{0, "10a0"}, pair{1, "10b0"}})}; SECTION("interpret_as_int") { @@ -337,7 +337,7 @@ TEST_CASE("process_schema_encoded_non_greedy_wildcard_token ", "[dfa_search]") { // NOTE: CLP currently treats all non-encoded variables as the same, so the below 
test demonstrates // this. In the future if CLP is more sophisticated, the two sections behave differently. -TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token ", "[dfa_search]") { +TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token", "[dfa_search]") { size_t id{0}; MockVarDictionary const var_dict{make_var_dict( {pair{id++, "100000000000000000000000010"}, @@ -393,7 +393,7 @@ TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token ", "[dfa_search] } } -TEST_CASE("process_schema_greedy_wildcard_token ", "[dfa_search]") { +TEST_CASE("process_schema_greedy_wildcard_token", "[dfa_search]") { size_t id{0}; MockVarDictionary const var_dict{make_var_dict( {pair{id++, "10a0"}, From c9d1da2124fd3d47a2ce17fe0bf9283416d3ad3a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 8 Dec 2025 20:58:01 -0500 Subject: [PATCH 053/164] Fix guard format; Remove duplicate VarInfo. --- components/core/src/clp/SchemaSearcher.hpp | 2 +- components/core/tests/search_test_utils.cpp | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index b2710e3396..67871b08da 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -334,4 +334,4 @@ auto SchemaSearcher::process_schema_var_token( } } // namespace clp -#endif //CLP_SCHEMASEARCHER_HPP +#endif // CLP_SCHEMASEARCHER_HPP diff --git a/components/core/tests/search_test_utils.cpp b/components/core/tests/search_test_utils.cpp index 39d3e358ce..b0d10fee1c 100644 --- a/components/core/tests/search_test_utils.cpp +++ b/components/core/tests/search_test_utils.cpp @@ -29,8 +29,6 @@ using std::variant; using std::vector; using clp::variable_dictionary_id_t; -using VarInfo = tuple>; - auto make_var_dict(vector> const& entries) -> MockVarDictionary { MockVarDictionary dict; for (auto const& [id, val] : entries) { From 39c3f18270285c5fd6079b9076c60ae47f58432f Mon Sep 17 
00:00:00 2001 From: SharafMohamed Date: Mon, 8 Dec 2025 21:03:10 -0500 Subject: [PATCH 054/164] Remove using from logtype header. --- .../core/tests/MockLogTypeDictionary.hpp | 54 +++++++++---------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/components/core/tests/MockLogTypeDictionary.hpp b/components/core/tests/MockLogTypeDictionary.hpp index 5e340c58b5..4ed58a5bea 100644 --- a/components/core/tests/MockLogTypeDictionary.hpp +++ b/components/core/tests/MockLogTypeDictionary.hpp @@ -11,14 +11,6 @@ #include "../src/clp/EncodedVariableInterpreter.hpp" #include "../src/clp/string_utils/string_utils.hpp" -using clp::EncodedVariableInterpreter; -using clp::logtype_dictionary_id_t; -using clp::string_utils::wildcard_match_unsafe_case_sensitive; -using std::string; -using std::string_view; -using std::unordered_set; -using std::vector; - /** * Simple helper class representing a mock logtype dictionary entry for unit tests. * @@ -26,7 +18,7 @@ using std::vector; */ class MockLogTypeEntry { public: - MockLogTypeEntry(string value, logtype_dictionary_id_t const id) + MockLogTypeEntry(std::string value, clp::logtype_dictionary_id_t const id) : m_value(std::move(value)), m_id(id) {} @@ -35,25 +27,25 @@ class MockLogTypeEntry { auto reserve_constant_length(size_t const length) -> void { m_value.reserve(length); } auto parse_next_var( - [[maybe_unused]] string_view msg, + [[maybe_unused]] std::string_view msg, [[maybe_unused]] size_t begin, [[maybe_unused]] size_t end, - [[maybe_unused]] string_view& parsed + [[maybe_unused]] std::string_view& parsed ) -> bool { return false; } - auto add_constant(string_view const msg, size_t const begin_pos, size_t const length) -> void { + auto add_constant(std::string_view const msg, size_t const begin_pos, size_t const length) -> void { m_value.append(msg.substr(begin_pos, length)); } - auto add_int_var() -> void { EncodedVariableInterpreter::add_int_var(m_value); } + auto add_int_var() -> void { 
clp::EncodedVariableInterpreter::add_int_var(m_value); } - auto add_float_var() -> void { EncodedVariableInterpreter::add_float_var(m_value); } + auto add_float_var() -> void { clp::EncodedVariableInterpreter::add_float_var(m_value); } - auto add_dictionary_var() -> void { EncodedVariableInterpreter::add_dict_var(m_value); } + auto add_dictionary_var() -> void { clp::EncodedVariableInterpreter::add_dict_var(m_value); } - [[nodiscard]] auto get_value() const -> string const& { return m_value; } + [[nodiscard]] auto get_value() const -> std::string const& { return m_value; } [[nodiscard]] auto get_num_variables() const -> size_t { return 0; } @@ -64,11 +56,11 @@ class MockLogTypeEntry { return SIZE_MAX; } - [[nodiscard]] auto get_id() const -> logtype_dictionary_id_t { return m_id; } + [[nodiscard]] auto get_id() const -> clp::logtype_dictionary_id_t { return m_id; } private: - string m_value; - logtype_dictionary_id_t m_id{0}; + std::string m_value; + clp::logtype_dictionary_id_t m_id{0}; }; /** @@ -79,16 +71,17 @@ class MockLogTypeEntry { class MockLogTypeDictionary { public: using Entry = MockLogTypeEntry; - using dictionary_id_t = logtype_dictionary_id_t; + using dictionary_id_t = clp::logtype_dictionary_id_t; - auto add_entry(string const& value, dictionary_id_t id) -> void { + auto add_entry(std::string const& value, dictionary_id_t id) -> void { m_storage.emplace_back(value, id); } - auto - get_entry_matching_value(string_view const logtype, [[maybe_unused]] bool ignore_case) const - -> vector { - vector results; + [[nodiscard]] auto get_entry_matching_value( + std::string_view const logtype, + [[maybe_unused]] bool ignore_case + ) const -> std::vector { + std::vector results; for (auto const& entry : m_storage) { if (logtype == entry.get_value()) { results.push_back(&entry); @@ -98,19 +91,22 @@ class MockLogTypeDictionary { } auto get_entries_matching_wildcard_string( - string_view const logtype, + std::string_view const logtype, [[maybe_unused]] bool 
ignore_case, - unordered_set& results + std::unordered_set& results ) const -> void { for (auto const& entry : m_storage) { - if (wildcard_match_unsafe_case_sensitive(entry.get_value(), logtype)) { + if (clp::string_utils::wildcard_match_unsafe_case_sensitive( + entry.get_value(), + logtype + )) { results.insert(&entry); } } } private: - vector m_storage; + std::vector m_storage; }; #endif // MOCK_LOGTYPE_DICTIONARY_HPP From 88dca33852913652182910989ad3f7ec2c598eaa Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 8 Dec 2025 21:05:48 -0500 Subject: [PATCH 055/164] Remove using from variable header. --- .../core/tests/MockVariableDictionary.hpp | 41 ++++++++----------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/components/core/tests/MockVariableDictionary.hpp b/components/core/tests/MockVariableDictionary.hpp index bf29bd0c7e..388d936d09 100644 --- a/components/core/tests/MockVariableDictionary.hpp +++ b/components/core/tests/MockVariableDictionary.hpp @@ -10,14 +10,6 @@ #include "../src/clp/Defs.h" #include "../src/clp/string_utils/string_utils.hpp" -using clp::string_utils::wildcard_match_unsafe_case_sensitive; -using std::string; -using std::string_view; -using std::unordered_map; -using std::unordered_set; -using std::vector; -using clp::variable_dictionary_id_t; - /** * Simple helper class representing a mock variable dictionary entry for unit tests. 
* @@ -25,17 +17,17 @@ using clp::variable_dictionary_id_t; */ class MockVarEntry { public: - explicit MockVarEntry(variable_dictionary_id_t const id, string value) + explicit MockVarEntry(clp::variable_dictionary_id_t const id, std::string value) : m_id{id}, m_value{std::move(value)} {} - [[nodiscard]] auto get_id() const -> variable_dictionary_id_t { return m_id; } + [[nodiscard]] auto get_id() const -> clp::variable_dictionary_id_t { return m_id; } - [[nodiscard]] auto get_value() const -> string const& { return m_value; } + [[nodiscard]] auto get_value() const -> std::string const& { return m_value; } private: - variable_dictionary_id_t m_id; - string m_value; + clp::variable_dictionary_id_t m_id; + std::string m_value; }; /** @@ -46,14 +38,14 @@ class MockVarEntry { class MockVarDictionary { public: using Entry = MockVarEntry; - using dictionary_id_t = variable_dictionary_id_t; + using dictionary_id_t = clp::variable_dictionary_id_t; - auto add_entry(dictionary_id_t const id, string value) -> void { + auto add_entry(dictionary_id_t const id, std::string value) -> void { m_storage.emplace(id, Entry{id, std::move(value)}); } - [[nodiscard]] auto get_value(dictionary_id_t const id) const -> string const& { - static string const cEmpty{}; + [[nodiscard]] auto get_value(dictionary_id_t const id) const -> std::string const& { + static std::string const cEmpty{}; auto const it{m_storage.find(id)}; if (m_storage.end() != it) { return it->second.get_value(); @@ -61,9 +53,10 @@ class MockVarDictionary { return cEmpty; } - auto get_entry_matching_value(string_view const val, [[maybe_unused]] bool ignore_case) const - -> vector { - vector results; + auto + get_entry_matching_value(std::string_view const val, [[maybe_unused]] bool ignore_case) const + -> std::vector { + std::vector results; for (auto const& [id, entry] : m_storage) { if (val == entry.get_value()) { results.push_back(&entry); @@ -73,19 +66,19 @@ class MockVarDictionary { } auto 
get_entries_matching_wildcard_string( - string_view const val, + std::string_view const val, [[maybe_unused]] bool ignore_case, - unordered_set& results + std::unordered_set& results ) const -> void { for (auto const& [id, entry] : m_storage) { - if (wildcard_match_unsafe_case_sensitive(entry.get_value(), val)) { + if (clp::string_utils::wildcard_match_unsafe_case_sensitive(entry.get_value(), val)) { results.insert(&entry); } } } private: - unordered_map m_storage; + std::unordered_map m_storage; }; #endif // MOCK_VARIABLE_DICTIONARY_HPP From 7479840b39ed072a33759f4a60c013296484f4bc Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 8 Dec 2025 21:07:11 -0500 Subject: [PATCH 056/164] Fix comment indentation. --- components/core/src/clp/SchemaSearcher.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 67871b08da..22005cc088 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -68,7 +68,7 @@ class SchemaSearcher { std::set const& interpretations ) -> std::set; - /** + /** * Compare all log-surgeon interpretations against the dictionaries to determine the sub queries * to search for within the archive. * From c318db449b444bd17bd1d412deaafa8a320aa08c Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 8 Dec 2025 21:20:56 -0500 Subject: [PATCH 057/164] Fix formatting. 
--- components/core/src/clp/GrepCore.hpp | 9 ++------- components/core/tests/search_test_utils.cpp | 4 ++-- components/core/tests/search_test_utils.hpp | 4 ++-- components/core/tests/test-GrepCore.cpp | 2 +- components/core/tests/test-SchemaSearcher.cpp | 8 +++----- 5 files changed, 10 insertions(+), 17 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 249c276e04..ea39439109 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -233,13 +233,8 @@ std::optional GrepCore::process_raw_query( } } } else { - sub_queries = SchemaSearcher::search( - search_string, - lexer, - logtype_dict, - var_dict, - ignore_case - ); + sub_queries = + SchemaSearcher::search(search_string, lexer, logtype_dict, var_dict, ignore_case); } if (sub_queries.empty()) { diff --git a/components/core/tests/search_test_utils.cpp b/components/core/tests/search_test_utils.cpp index b0d10fee1c..83bc416f91 100644 --- a/components/core/tests/search_test_utils.cpp +++ b/components/core/tests/search_test_utils.cpp @@ -19,15 +19,15 @@ using clp::EncodedVariableInterpreter; using clp::logtype_dictionary_id_t; +using clp::SubQuery; +using clp::variable_dictionary_id_t; using std::pair; using std::string; using std::string_view; -using clp::SubQuery; using std::tuple; using std::unordered_set; using std::variant; using std::vector; -using clp::variable_dictionary_id_t; auto make_var_dict(vector> const& entries) -> MockVarDictionary { MockVarDictionary dict; diff --git a/components/core/tests/search_test_utils.hpp b/components/core/tests/search_test_utils.hpp index dce4fcf877..635b6db1f5 100644 --- a/components/core/tests/search_test_utils.hpp +++ b/components/core/tests/search_test_utils.hpp @@ -18,15 +18,15 @@ using clp::EncodedVariableInterpreter; using clp::logtype_dictionary_id_t; +using clp::SubQuery; +using clp::variable_dictionary_id_t; using std::pair; using std::string; using std::string_view; -using 
clp::SubQuery; using std::tuple; using std::unordered_set; using std::variant; using std::vector; -using clp::variable_dictionary_id_t; using VarInfo = tuple>; diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 2dc96441b4..a1a89ec4c1 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -19,8 +19,8 @@ using log_surgeon::Schema; using log_surgeon::SchemaVarAST; using log_surgeon::SymbolId::TokenFloat; using log_surgeon::SymbolId::TokenInt; -using std::string; using std::pair; +using std::string; using std::vector; constexpr uint32_t cIntId{static_cast(TokenInt)}; diff --git a/components/core/tests/test-SchemaSearcher.cpp b/components/core/tests/test-SchemaSearcher.cpp index c0c04cc066..8294bc9c7c 100644 --- a/components/core/tests/test-SchemaSearcher.cpp +++ b/components/core/tests/test-SchemaSearcher.cpp @@ -481,11 +481,9 @@ TEST_CASE("process_schema_greedy_wildcard_token", "[dfa_search]") { } TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { - MockVarDictionary const var_dict{make_var_dict({ - pair{0, "1a3"}, - pair{1, "10a"}, - pair{2, "10b"} - })}; + MockVarDictionary const var_dict{ + make_var_dict({pair{0, "1a3"}, pair{1, "10a"}, pair{2, "10b"}}) + }; MockLogTypeDictionary const logtype_dict{make_logtype_dict( {{"text ", 'i', " ", 'i', " ", 'f'}, {"text ", 'i', " ", 'd', " ", 'f'}, From d08ce9ee38a661a3b57f64e1749131838e933ef9 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 8 Dec 2025 21:24:11 -0500 Subject: [PATCH 058/164] Fix naming. 
--- components/core/tests/MockVariableDictionary.hpp | 8 ++++---- components/core/tests/search_test_utils.cpp | 4 ++-- components/core/tests/search_test_utils.hpp | 4 ++-- components/core/tests/test-GrepCore.cpp | 2 +- components/core/tests/test-SchemaSearcher.cpp | 16 ++++++++-------- 5 files changed, 17 insertions(+), 17 deletions(-) diff --git a/components/core/tests/MockVariableDictionary.hpp b/components/core/tests/MockVariableDictionary.hpp index 388d936d09..74fbac4903 100644 --- a/components/core/tests/MockVariableDictionary.hpp +++ b/components/core/tests/MockVariableDictionary.hpp @@ -15,9 +15,9 @@ * * Adheres to `VariableDictionaryEntryReq`. */ -class MockVarEntry { +class MockVariableEntry { public: - explicit MockVarEntry(clp::variable_dictionary_id_t const id, std::string value) + explicit MockVariableEntry(clp::variable_dictionary_id_t const id, std::string value) : m_id{id}, m_value{std::move(value)} {} @@ -35,9 +35,9 @@ class MockVarEntry { * * Provides a method for adding entries and adheres to `VariableDictionaryReaderReq`. 
*/ -class MockVarDictionary { +class MockVariableDictionary { public: - using Entry = MockVarEntry; + using Entry = MockVariableEntry; using dictionary_id_t = clp::variable_dictionary_id_t; auto add_entry(dictionary_id_t const id, std::string value) -> void { diff --git a/components/core/tests/search_test_utils.cpp b/components/core/tests/search_test_utils.cpp index 83bc416f91..fc1053c516 100644 --- a/components/core/tests/search_test_utils.cpp +++ b/components/core/tests/search_test_utils.cpp @@ -29,8 +29,8 @@ using std::unordered_set; using std::variant; using std::vector; -auto make_var_dict(vector> const& entries) -> MockVarDictionary { - MockVarDictionary dict; +auto make_var_dict(vector> const& entries) -> MockVariableDictionary { + MockVariableDictionary dict; for (auto const& [id, val] : entries) { dict.add_entry(id, val); } diff --git a/components/core/tests/search_test_utils.hpp b/components/core/tests/search_test_utils.hpp index 635b6db1f5..cc035ee7f3 100644 --- a/components/core/tests/search_test_utils.hpp +++ b/components/core/tests/search_test_utils.hpp @@ -33,9 +33,9 @@ using VarInfo = tuple>; /** * @param entries Vector of (id, value) pairs to populate the variable * dictionary. - * @return A `MockVarDictionary` initialized with the given entries. + * @return A `MockVariableDictionary` initialized with the given entries. */ -auto make_var_dict(vector> const& entries) -> MockVarDictionary; +auto make_var_dict(vector> const& entries) -> MockVariableDictionary; /** * @param entries Vector of logtypes, where each logtype is represented by a vector of tokens. 
Each diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index a1a89ec4c1..8256f9fca3 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -166,7 +166,7 @@ TEST_CASE("process_raw_query", "[dfa_search]") { {{R"(int:(\d+))"}, {R"(float:(\d+\.\d+))"}, {R"(hasNumber:[^ $]*\d+[^ $]*)"}} )}; - MockVarDictionary const var_dict{make_var_dict({pair{0, "1a3"}, pair{1, "10a"}})}; + MockVariableDictionary const var_dict{make_var_dict({pair{0, "1a3"}, pair{1, "10a"}})}; MockLogTypeDictionary const logtype_dict{make_logtype_dict( {{"text ", 'i', " ", 'i', " ", 'f'}, {"text ", 'i', " ", 'd', " ", 'f'}, diff --git a/components/core/tests/test-SchemaSearcher.cpp b/components/core/tests/test-SchemaSearcher.cpp index 8294bc9c7c..6c89295112 100644 --- a/components/core/tests/test-SchemaSearcher.cpp +++ b/components/core/tests/test-SchemaSearcher.cpp @@ -250,7 +250,7 @@ TEST_CASE("generate_logtype_string_for_multi_variable_interpretation", "[dfa_sea } TEST_CASE("process_schema_empty_token", "[dfa_search]") { - MockVarDictionary const var_dict{make_var_dict({pair{0, "100"}})}; + MockVariableDictionary const var_dict{make_var_dict({pair{0, "100"}})}; SubQuery sub_query; VariableQueryToken const empty_int_token{cIntId, "", false}; @@ -260,7 +260,7 @@ TEST_CASE("process_schema_empty_token", "[dfa_search]") { } TEST_CASE("process_schema_unmatched_token", "[dfa_search]") { - MockVarDictionary const var_dict{make_var_dict({pair{0, "100"}})}; + MockVariableDictionary const var_dict{make_var_dict({pair{0, "100"}})}; SubQuery sub_query; VariableQueryToken const int_token{cIntId, "200", false}; @@ -274,7 +274,7 @@ TEST_CASE("process_schema_unmatched_token", "[dfa_search]") { } TEST_CASE("process_schema_int_token", "[dfa_search]") { - MockVarDictionary const var_dict{make_var_dict({pair{0, "100"}})}; + MockVariableDictionary const var_dict{make_var_dict({pair{0, "100"}})}; SubQuery sub_query; 
VariableQueryToken const int_token{cIntId, "100", false}; @@ -288,7 +288,7 @@ TEST_CASE("process_schema_int_token", "[dfa_search]") { } TEST_CASE("process_schema_encoded_non_greedy_wildcard_token", "[dfa_search]") { - MockVarDictionary const var_dict{make_var_dict({pair{0, "10a0"}, pair{1, "10b0"}})}; + MockVariableDictionary const var_dict{make_var_dict({pair{0, "10a0"}, pair{1, "10b0"}})}; SECTION("interpret_as_int") { SubQuery sub_query; @@ -339,7 +339,7 @@ TEST_CASE("process_schema_encoded_non_greedy_wildcard_token", "[dfa_search]") { // this. In the future if CLP is more sophisticated, the two sections behave differently. TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token", "[dfa_search]") { size_t id{0}; - MockVarDictionary const var_dict{make_var_dict( + MockVariableDictionary const var_dict{make_var_dict( {pair{id++, "100000000000000000000000010"}, pair{id++, "100000000000000000000000020"}, pair{id++, "100000000000000000000000030"}, @@ -395,7 +395,7 @@ TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token", "[dfa_search]" TEST_CASE("process_schema_greedy_wildcard_token", "[dfa_search]") { size_t id{0}; - MockVarDictionary const var_dict{make_var_dict( + MockVariableDictionary const var_dict{make_var_dict( {pair{id++, "10a0"}, pair{id++, "10b0"}, pair{id++, "100000000000000000000000010"}, @@ -481,7 +481,7 @@ TEST_CASE("process_schema_greedy_wildcard_token", "[dfa_search]") { } TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { - MockVarDictionary const var_dict{ + MockVariableDictionary const var_dict{ make_var_dict({pair{0, "1a3"}, pair{1, "10a"}, pair{2, "10b"}}) }; MockLogTypeDictionary const logtype_dict{make_logtype_dict( @@ -527,7 +527,7 @@ TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { } TEST_CASE("generate_schema_sub_queries_with_wildcard_duplication", "[dfa_search]") { - MockVarDictionary const var_dict{make_var_dict({pair{0, "1a3"}, pair{1, "10a"}})}; + MockVariableDictionary const 
var_dict{make_var_dict({pair{0, "1a3"}, pair{1, "10a"}})}; MockLogTypeDictionary const logtype_dict{make_logtype_dict( {{"text ", 'i', " ", 'i', " ", 'f'}, {"text ", 'i', " ", 'd', " ", 'f'}, From 7caa3e7d39c7999255775ee77a48a984684965e6 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 8 Dec 2025 21:29:42 -0500 Subject: [PATCH 059/164] Fix indentation. --- components/core/src/clp/SchemaSearcher.cpp | 28 ++++++++++------------ 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/components/core/src/clp/SchemaSearcher.cpp b/components/core/src/clp/SchemaSearcher.cpp index d644b9f76b..61a7c8df92 100644 --- a/components/core/src/clp/SchemaSearcher.cpp +++ b/components/core/src/clp/SchemaSearcher.cpp @@ -31,9 +31,7 @@ auto SchemaSearcher::normalize_interpretations(set const& i QueryInterpretation normalized_interpretation; for (auto const& token : interpretation.get_logtype()) { auto const& src_string{std::visit( - [](auto const& token) -> std::string const& { - return token.get_query_substring(); - }, + [](auto const& token) -> std::string const& {return token.get_query_substring();}, token )}; string normalized_string; @@ -45,19 +43,19 @@ auto SchemaSearcher::normalize_interpretations(set const& i } std::visit( - overloaded{ - [&](VariableQueryToken const& variable_token) -> void { - normalized_interpretation.append_variable_token( - variable_token.get_variable_type(), - normalized_string, - variable_token.get_contains_wildcard() - ); + overloaded{ + [&](VariableQueryToken const& variable_token) -> void { + normalized_interpretation.append_variable_token( + variable_token.get_variable_type(), + normalized_string, + variable_token.get_contains_wildcard() + ); + }, + [&]([[maybe_unused]] StaticQueryToken const& static_token) -> void { + normalized_interpretation.append_static_token(normalized_string); + } }, - [&]([[maybe_unused]] StaticQueryToken const& static_token) -> void { - normalized_interpretation.append_static_token(normalized_string); - } - }, 
- token + token ); } normalized_interpretations.insert(normalized_interpretation); From 90c2a7fd28fb58bb730ede405fc4463b40f206bf Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 9 Dec 2025 16:43:28 -0500 Subject: [PATCH 060/164] Format. --- components/core/src/clp/GrepCore.hpp | 4 ++-- components/core/src/clp/SchemaSearcher.cpp | 2 +- components/core/src/clp/SchemaSearcher.hpp | 2 +- components/core/tests/MockLogTypeDictionary.hpp | 9 ++++----- components/core/tests/search_test_utils.cpp | 2 +- 5 files changed, 9 insertions(+), 10 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index ea39439109..19f033b7e6 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -233,8 +233,8 @@ std::optional GrepCore::process_raw_query( } } } else { - sub_queries = - SchemaSearcher::search(search_string, lexer, logtype_dict, var_dict, ignore_case); + sub_queries + = SchemaSearcher::search(search_string, lexer, logtype_dict, var_dict, ignore_case); } if (sub_queries.empty()) { diff --git a/components/core/src/clp/SchemaSearcher.cpp b/components/core/src/clp/SchemaSearcher.cpp index 61a7c8df92..04be563275 100644 --- a/components/core/src/clp/SchemaSearcher.cpp +++ b/components/core/src/clp/SchemaSearcher.cpp @@ -156,4 +156,4 @@ auto SchemaSearcher::generate_logtype_string( } return logtype_string; } -} // namespace clp \ No newline at end of file +} // namespace clp diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 22005cc088..3b5c719e29 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -22,7 +22,7 @@ namespace clp { #ifdef CLP_BUILD_TESTING -class SchemaSearcherTest; + class SchemaSearcherTest; #endif class SchemaSearcher { diff --git a/components/core/tests/MockLogTypeDictionary.hpp b/components/core/tests/MockLogTypeDictionary.hpp index 4ed58a5bea..1d5f0b1e9d 100644 
--- a/components/core/tests/MockLogTypeDictionary.hpp +++ b/components/core/tests/MockLogTypeDictionary.hpp @@ -35,7 +35,8 @@ class MockLogTypeEntry { return false; } - auto add_constant(std::string_view const msg, size_t const begin_pos, size_t const length) -> void { + auto + add_constant(std::string_view const msg, size_t const begin_pos, size_t const length) -> void { m_value.append(msg.substr(begin_pos, length)); } @@ -96,10 +97,8 @@ class MockLogTypeDictionary { std::unordered_set& results ) const -> void { for (auto const& entry : m_storage) { - if (clp::string_utils::wildcard_match_unsafe_case_sensitive( - entry.get_value(), - logtype - )) { + if (clp::string_utils::wildcard_match_unsafe_case_sensitive(entry.get_value(), logtype)) + { results.insert(&entry); } } diff --git a/components/core/tests/search_test_utils.cpp b/components/core/tests/search_test_utils.cpp index fc1053c516..7203d4a99d 100644 --- a/components/core/tests/search_test_utils.cpp +++ b/components/core/tests/search_test_utils.cpp @@ -100,4 +100,4 @@ auto check_sub_query( } REQUIRE(logtype_ids == sub_query.get_possible_logtypes()); -} \ No newline at end of file +} From fc2891efec361076984aa4ded682803c5b8e9719 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 9 Dec 2025 16:53:53 -0500 Subject: [PATCH 061/164] Remove using from header. 
--- components/core/tests/search_test_utils.hpp | 31 ++++++++------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/components/core/tests/search_test_utils.hpp b/components/core/tests/search_test_utils.hpp index cc035ee7f3..62ec98136e 100644 --- a/components/core/tests/search_test_utils.hpp +++ b/components/core/tests/search_test_utils.hpp @@ -11,38 +11,27 @@ #include #include "../src/clp/Defs.h" -#include "../src/clp/EncodedVariableInterpreter.hpp" #include "../src/clp/Query.hpp" #include "MockLogTypeDictionary.hpp" #include "MockVariableDictionary.hpp" -using clp::EncodedVariableInterpreter; -using clp::logtype_dictionary_id_t; -using clp::SubQuery; -using clp::variable_dictionary_id_t; -using std::pair; -using std::string; -using std::string_view; -using std::tuple; -using std::unordered_set; -using std::variant; -using std::vector; - -using VarInfo = tuple>; +using VarInfo = std::tuple>; /** * @param entries Vector of (id, value) pairs to populate the variable * dictionary. * @return A `MockVariableDictionary` initialized with the given entries. */ -auto make_var_dict(vector> const& entries) -> MockVariableDictionary; +auto +make_var_dict(std::vector> const& entries) -> MockVariableDictionary; /** * @param entries Vector of logtypes, where each logtype is represented by a vector of tokens. Each * token is either a literal substring (`string_view`) or a variable placeholder (`char`). * @return A `MockLogTypeDictionary` initialized with the given entries. */ -auto make_logtype_dict(vector>> const& entries) +auto +make_logtype_dict(std::vector>> const& entries) -> MockLogTypeDictionary; /** @@ -61,7 +50,9 @@ auto make_logtype_dict(vector>> const& entries * @param tokens Vector of tokens to convert into a logtype string. * @return A `string` representing the expected encoded logtype. 
*/ -auto generate_expected_logtype_string(vector> const& tokens) -> string; +auto +generate_expected_logtype_string(std::vector> const& tokens) + -> std::string; /** * Checks that a `SubQuery` at a given index matches the expected properties. @@ -81,10 +72,10 @@ auto generate_expected_logtype_string(vector> const& */ auto check_sub_query( size_t id, - vector const& sub_queries, + std::vector const& sub_queries, bool wildcard_match_required, - vector const& vars_info, - unordered_set const& logtype_ids + std::vector const& vars_info, + std::unordered_set const& logtype_ids ) -> void; #endif // SEARCH_TEST_UTILS_HPP From 7320c0faecbe6915871c5654ad2af4e9983ee7f3 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 9 Dec 2025 17:02:41 -0500 Subject: [PATCH 062/164] Simplify ranges::find call. --- components/core/src/clp/SchemaSearcher.cpp | 12 +++++------- components/core/src/clp/SchemaSearcher.hpp | 2 +- components/core/tests/MockLogTypeDictionary.hpp | 4 ++-- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/components/core/src/clp/SchemaSearcher.cpp b/components/core/src/clp/SchemaSearcher.cpp index 04be563275..550e1cf9fe 100644 --- a/components/core/src/clp/SchemaSearcher.cpp +++ b/components/core/src/clp/SchemaSearcher.cpp @@ -31,8 +31,10 @@ auto SchemaSearcher::normalize_interpretations(set const& i QueryInterpretation normalized_interpretation; for (auto const& token : interpretation.get_logtype()) { auto const& src_string{std::visit( - [](auto const& token) -> std::string const& {return token.get_query_substring();}, - token + [](auto const& tok) -> std::string const& { + return tok.get_query_substring(); + }, + token )}; string normalized_string; normalized_string.reserve(src_string.size()); @@ -117,11 +119,7 @@ auto SchemaSearcher::generate_logtype_string( bool const is_float{TokenFloat == var_type}; if (wildcard_encodable_positions.end() - != std::ranges::find( - wildcard_encodable_positions.begin(), - wildcard_encodable_positions.end(), - i 
- )) + != std::ranges::find(wildcard_encodable_positions, i)) { if (mask_encoded_flags[i]) { if (is_int) { diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 3b5c719e29..9621841389 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -22,7 +22,7 @@ namespace clp { #ifdef CLP_BUILD_TESTING - class SchemaSearcherTest; + class SchemaSearcherTest; #endif class SchemaSearcher { diff --git a/components/core/tests/MockLogTypeDictionary.hpp b/components/core/tests/MockLogTypeDictionary.hpp index 1d5f0b1e9d..90c1256007 100644 --- a/components/core/tests/MockLogTypeDictionary.hpp +++ b/components/core/tests/MockLogTypeDictionary.hpp @@ -35,8 +35,8 @@ class MockLogTypeEntry { return false; } - auto - add_constant(std::string_view const msg, size_t const begin_pos, size_t const length) -> void { + auto add_constant(std::string_view const msg, size_t const begin_pos, size_t const length) + -> void { m_value.append(msg.substr(begin_pos, length)); } From fd1eecdcd0d34c528c21fb9091ea4add72b6b6b7 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 9 Dec 2025 17:03:20 -0500 Subject: [PATCH 063/164] Fix typo in docstring. --- components/core/src/clp/SchemaSearcher.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 9621841389..85c89b39b7 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -141,7 +141,7 @@ class SchemaSearcher { * * @param interpretation The interpretation to convert to a logtype string. * @param wildcard_encodable_positions A vector of positions of encodable wildcard variables. - * @param mask_encoded_flags A vector indicating if a variables is mask encoded. + * @param mask_encoded_flags A vector indicating if a variable is mask encoded. 
* @return The logtype string corresponding to this combination of encoded variables. */ static auto generate_logtype_string( From 43b5b3a2d518cd3ffa7b6542507dd24d1f77e6ab Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 9 Dec 2025 17:04:34 -0500 Subject: [PATCH 064/164] Check oob. --- components/core/tests/search_test_utils.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/components/core/tests/search_test_utils.cpp b/components/core/tests/search_test_utils.cpp index 7203d4a99d..a8a1fcfebc 100644 --- a/components/core/tests/search_test_utils.cpp +++ b/components/core/tests/search_test_utils.cpp @@ -79,6 +79,7 @@ auto check_sub_query( unordered_set const& logtype_ids ) -> void { CAPTURE(id); + REQUIRE(id < sub_queries.size()); auto const& sub_query{sub_queries[id]}; REQUIRE(wildcard_match_required == sub_query.wildcard_match_required()); From 754b4f0c2cae2d023568698b03b0bb4e8177d063 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 10 Dec 2025 09:20:37 -0500 Subject: [PATCH 065/164] Lint. 
--- components/core/src/clp/SchemaSearcher.cpp | 4 +--- components/core/src/clp/SchemaSearcher.hpp | 17 ++++++++--------- components/core/tests/search_test_utils.hpp | 16 ++++++++-------- 3 files changed, 17 insertions(+), 20 deletions(-) diff --git a/components/core/src/clp/SchemaSearcher.cpp b/components/core/src/clp/SchemaSearcher.cpp index 550e1cf9fe..0897e81443 100644 --- a/components/core/src/clp/SchemaSearcher.cpp +++ b/components/core/src/clp/SchemaSearcher.cpp @@ -31,9 +31,7 @@ auto SchemaSearcher::normalize_interpretations(set const& i QueryInterpretation normalized_interpretation; for (auto const& token : interpretation.get_logtype()) { auto const& src_string{std::visit( - [](auto const& tok) -> std::string const& { - return tok.get_query_substring(); - }, + [](auto const& tok) -> std::string const& { return tok.get_query_substring(); }, token )}; string normalized_string; diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 85c89b39b7..80a3a65cad 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -22,12 +22,12 @@ namespace clp { #ifdef CLP_BUILD_TESTING - class SchemaSearcherTest; +class SchemaSearcherTest; #endif class SchemaSearcher { #ifdef CLP_BUILD_TESTING - friend class SchemaSearcherTest; + friend class SchemaSearcherTest; #endif public: @@ -35,13 +35,12 @@ class SchemaSearcher { LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, VariableDictionaryReaderReq VariableDictionaryReaderType > - static auto search( - std::string const& search_string, - log_surgeon::lexers::ByteLexer& lexer, - LogTypeDictionaryReaderType const& logtype_dict, - VariableDictionaryReaderType const& var_dict, - bool ignore_case - ) -> std::vector { + static auto + search(std::string const& search_string, + log_surgeon::lexers::ByteLexer& lexer, + LogTypeDictionaryReaderType const& logtype_dict, + VariableDictionaryReaderType const& var_dict, + bool 
ignore_case) -> std::vector { // TODO: Optimize such that interpretations are only generated once per schema. log_surgeon::wildcard_query_parser::Query const query{search_string}; auto const interpretations{query.get_all_multi_token_interpretations(lexer)}; diff --git a/components/core/tests/search_test_utils.hpp b/components/core/tests/search_test_utils.hpp index 62ec98136e..b4eb83a6a3 100644 --- a/components/core/tests/search_test_utils.hpp +++ b/components/core/tests/search_test_utils.hpp @@ -22,17 +22,17 @@ using VarInfo = std::tuple> const& entries) -> MockVariableDictionary; +auto make_var_dict(std::vector> const& entries) + -> MockVariableDictionary; /** * @param entries Vector of logtypes, where each logtype is represented by a vector of tokens. Each * token is either a literal substring (`string_view`) or a variable placeholder (`char`). * @return A `MockLogTypeDictionary` initialized with the given entries. */ -auto -make_logtype_dict(std::vector>> const& entries) - -> MockLogTypeDictionary; +auto make_logtype_dict( + std::vector>> const& entries +) -> MockLogTypeDictionary; /** * Generates a logtype string from a vector of tokens. @@ -50,9 +50,9 @@ make_logtype_dict(std::vector>> * @param tokens Vector of tokens to convert into a logtype string. * @return A `string` representing the expected encoded logtype. */ -auto -generate_expected_logtype_string(std::vector> const& tokens) - -> std::string; +auto generate_expected_logtype_string( + std::vector> const& tokens +) -> std::string; /** * Checks that a `SubQuery` at a given index matches the expected properties. From 1ebb3bf5dedadaa73d8482b26fa1c0aeac7438d4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 10 Dec 2025 09:36:17 -0500 Subject: [PATCH 066/164] Check lexer map contains symbol. 
--- components/core/tests/test-GrepCore.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 8256f9fca3..8438dad84a 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -58,10 +58,9 @@ auto make_test_lexer(vector const& schema_rules) -> ByteLexer { REQUIRE(nullptr != schema_ast->m_schema_vars[i]); auto* capture_rule_ast{dynamic_cast(schema_ast->m_schema_vars[i].get())}; REQUIRE(nullptr != capture_rule_ast); - lexer.add_rule( - lexer.m_symbol_id[capture_rule_ast->m_name], - std::move(capture_rule_ast->m_regex_ptr) - ); + auto symbol_id_t{lexer.m_symbol_id.find(capture_rule_ast->m_name)}; + REQUIRE(lexer.m_symbol_id.end() != symbol_id_t); + lexer.add_rule(symbol_id_t->second, std::move(capture_rule_ast->m_regex_ptr)); } lexer.generate(); From f5e1f9a8ce98cda28948c2c97e78c8119132f604 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 10 Dec 2025 09:40:18 -0500 Subject: [PATCH 067/164] Replace magic numbers. 
--- components/core/tests/test-GrepCore.cpp | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 8438dad84a..37213b1f49 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -161,6 +162,11 @@ TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var } TEST_CASE("process_raw_query", "[dfa_search]") { + constexpr uint32_t cNoBeginTimestamp{0}; + constexpr uint32_t cNoEndTimestamp{0}; + constexpr bool cIgnoreCase{true}; + constexpr bool cSearchArchive{false}; + auto lexer{make_test_lexer( {{R"(int:(\d+))"}, {R"(float:(\d+\.\d+))"}, {R"(hasNumber:[^ $]*\d+[^ $]*)"}} )}; @@ -177,9 +183,16 @@ TEST_CASE("process_raw_query", "[dfa_search]") { string const raw_query{"text 100 10? 3.14*"}; - auto const query{ - GrepCore::process_raw_query(logtype_dict, var_dict, raw_query, 0, 0, true, lexer, false) - }; + auto const query{GrepCore::process_raw_query( + logtype_dict, + var_dict, + raw_query, + cNoBeginTimestamp, + cNoEndTimestamp, + cIgnoreCase, + lexer, + cSearchArchive + )}; REQUIRE(query.has_value()); auto const& sub_queries{query.value().get_sub_queries()}; From 31fe021235ea4f1e709285656c4f0eab159e4854 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 10 Dec 2025 09:46:20 -0500 Subject: [PATCH 068/164] Simplify ranges call. 
--- components/core/src/clp/SchemaSearcher.hpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 80a3a65cad..e680b23b4b 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -228,11 +228,7 @@ auto SchemaSearcher::generate_schema_sub_queries( { bool is_mask_encoded{false}; if (wildcard_encodable_positions.end() - != std::ranges::find( - wildcard_encodable_positions.begin(), - wildcard_encodable_positions.end(), - i - )) + != std::ranges::find(wildcard_encodable_positions, i)) { is_mask_encoded = mask_encoded_flags[i]; } From e2330cc94cabf438c91ee622aaca2125f5159239 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 10 Dec 2025 09:49:16 -0500 Subject: [PATCH 069/164] Remove explicit keyword; Add docstring for VarInfo. --- components/core/tests/MockVariableDictionary.hpp | 2 +- components/core/tests/search_test_utils.hpp | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/components/core/tests/MockVariableDictionary.hpp b/components/core/tests/MockVariableDictionary.hpp index 74fbac4903..40c7795a49 100644 --- a/components/core/tests/MockVariableDictionary.hpp +++ b/components/core/tests/MockVariableDictionary.hpp @@ -17,7 +17,7 @@ */ class MockVariableEntry { public: - explicit MockVariableEntry(clp::variable_dictionary_id_t const id, std::string value) + MockVariableEntry(clp::variable_dictionary_id_t const id, std::string value) : m_id{id}, m_value{std::move(value)} {} diff --git a/components/core/tests/search_test_utils.hpp b/components/core/tests/search_test_utils.hpp index b4eb83a6a3..5f9ef60383 100644 --- a/components/core/tests/search_test_utils.hpp +++ b/components/core/tests/search_test_utils.hpp @@ -15,6 +15,10 @@ #include "MockLogTypeDictionary.hpp" #include "MockVariableDictionary.hpp" +/** + * Type alias for variable information in tests. 
+ * Elements: (is_dict_var, is_percise_var, var_dict_ids) + */ using VarInfo = std::tuple>; /** From 732b452c76929c118a3a6e37f682c6a642a9c290 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 10 Dec 2025 09:52:24 -0500 Subject: [PATCH 070/164] Flip check order. --- components/core/src/clp/SchemaSearcher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/SchemaSearcher.cpp b/components/core/src/clp/SchemaSearcher.cpp index 0897e81443..6a06230bca 100644 --- a/components/core/src/clp/SchemaSearcher.cpp +++ b/components/core/src/clp/SchemaSearcher.cpp @@ -37,7 +37,7 @@ auto SchemaSearcher::normalize_interpretations(set const& i string normalized_string; normalized_string.reserve(src_string.size()); for (auto const c : src_string) { - if (c != '*' || normalized_string.empty() || normalized_string.back() != '*') { + if ('*' != c || normalized_string.empty() || '*' != normalized_string.back()) { normalized_string += c; } } From 8fdd28ded02fdf35887098f780733c95834cdb76 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 10 Dec 2025 09:57:59 -0500 Subject: [PATCH 071/164] Fix typo. --- components/core/tests/search_test_utils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/tests/search_test_utils.hpp b/components/core/tests/search_test_utils.hpp index 5f9ef60383..1a78d6380a 100644 --- a/components/core/tests/search_test_utils.hpp +++ b/components/core/tests/search_test_utils.hpp @@ -17,7 +17,7 @@ /** * Type alias for variable information in tests. - * Elements: (is_dict_var, is_percise_var, var_dict_ids) + * Elements: (is_dict_var, is_precise_var, var_dict_ids) */ using VarInfo = std::tuple>; From 322390cff663550d94f5b7e866dfaa20de764e4f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 10 Dec 2025 10:12:15 -0500 Subject: [PATCH 072/164] Improve constexpr. 
--- components/core/tests/test-GrepCore.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 37213b1f49..1ddb0be345 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -11,9 +11,11 @@ #include #include +#include "../src/clp/Defs.h" #include "../src/clp/GrepCore.hpp" #include "search_test_utils.hpp" +using clp::epochtime_t; using clp::GrepCore; using log_surgeon::lexers::ByteLexer; using log_surgeon::Schema; @@ -162,10 +164,10 @@ TEST_CASE("get_bounds_of_next_potential_var", "[get_bounds_of_next_potential_var } TEST_CASE("process_raw_query", "[dfa_search]") { - constexpr uint32_t cNoBeginTimestamp{0}; - constexpr uint32_t cNoEndTimestamp{0}; + constexpr epochtime_t cNoBeginTimestamp{0}; + constexpr epochtime_t cNoEndTimestamp{0}; constexpr bool cIgnoreCase{true}; - constexpr bool cSearchArchive{false}; + constexpr bool cUseHeuristic{false}; auto lexer{make_test_lexer( {{R"(int:(\d+))"}, {R"(float:(\d+\.\d+))"}, {R"(hasNumber:[^ $]*\d+[^ $]*)"}} @@ -191,7 +193,7 @@ TEST_CASE("process_raw_query", "[dfa_search]") { cNoEndTimestamp, cIgnoreCase, lexer, - cSearchArchive + cUseHeuristic )}; REQUIRE(query.has_value()); From eed8b0b51857c6c9a06ad9456e7893a39a6d7842 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 10 Dec 2025 20:13:37 -0500 Subject: [PATCH 073/164] Improve make_var_dict. 
--- components/core/tests/search_test_utils.cpp | 3 ++- components/core/tests/search_test_utils.hpp | 8 ++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/components/core/tests/search_test_utils.cpp b/components/core/tests/search_test_utils.cpp index a8a1fcfebc..7c82839175 100644 --- a/components/core/tests/search_test_utils.cpp +++ b/components/core/tests/search_test_utils.cpp @@ -29,7 +29,8 @@ using std::unordered_set; using std::variant; using std::vector; -auto make_var_dict(vector> const& entries) -> MockVariableDictionary { +auto make_var_dict(vector> const& entries) + -> MockVariableDictionary { MockVariableDictionary dict; for (auto const& [id, val] : entries) { dict.add_entry(id, val); diff --git a/components/core/tests/search_test_utils.hpp b/components/core/tests/search_test_utils.hpp index 1a78d6380a..14b146e6da 100644 --- a/components/core/tests/search_test_utils.hpp +++ b/components/core/tests/search_test_utils.hpp @@ -22,12 +22,12 @@ using VarInfo = std::tuple>; /** - * @param entries Vector of (id, value) pairs to populate the variable - * dictionary. + * @param entries Vector of (id, value) pairs to populate the variable dictionary. * @return A `MockVariableDictionary` initialized with the given entries. */ -auto make_var_dict(std::vector> const& entries) - -> MockVariableDictionary; +auto make_var_dict( + std::vector> const& entries +) -> MockVariableDictionary; /** * @param entries Vector of logtypes, where each logtype is represented by a vector of tokens. Each From dff3df0c3b11b57f608174039e56cdafb351ff58 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 11 Jan 2026 04:45:30 -0500 Subject: [PATCH 074/164] Add REQUIRE(false). 
--- components/core/tests/search_test_utils.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/components/core/tests/search_test_utils.cpp b/components/core/tests/search_test_utils.cpp index 7c82839175..fc25fabad1 100644 --- a/components/core/tests/search_test_utils.cpp +++ b/components/core/tests/search_test_utils.cpp @@ -65,6 +65,7 @@ auto generate_expected_logtype_string(vector> const& EncodedVariableInterpreter::add_dict_var(result); break; default: + REQUIRE(false); break; } } From e4f39a2c36affaa77f82f34ed7ae4b5c237781ec Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 11 Jan 2026 05:55:34 -0500 Subject: [PATCH 075/164] Update includes to use angled braces. --- components/core/src/clp/GrepCore.hpp | 2 +- components/core/src/clp/SchemaSearcher.cpp | 3 +-- components/core/src/clp/SchemaSearcher.hpp | 11 +++++------ components/core/tests/MockLogTypeDictionary.hpp | 6 +++--- components/core/tests/MockVariableDictionary.hpp | 4 ++-- components/core/tests/search_test_utils.cpp | 7 ++++--- components/core/tests/search_test_utils.hpp | 5 +++-- components/core/tests/test-GrepCore.cpp | 4 ++-- components/core/tests/test-SchemaSearcher.cpp | 8 ++++---- 9 files changed, 25 insertions(+), 25 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 19f033b7e6..9a3ef64e11 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -20,7 +21,6 @@ #include "LogTypeDictionaryReaderReq.hpp" #include "Query.hpp" #include "QueryToken.hpp" -#include "SchemaSearcher.hpp" #include "VariableDictionaryReaderReq.hpp" namespace clp { diff --git a/components/core/src/clp/SchemaSearcher.cpp b/components/core/src/clp/SchemaSearcher.cpp index 6a06230bca..bf5b844a07 100644 --- a/components/core/src/clp/SchemaSearcher.cpp +++ b/components/core/src/clp/SchemaSearcher.cpp @@ -8,10 +8,9 @@ #include #include +#include #include -#include 
"EncodedVariableInterpreter.hpp" - using log_surgeon::SymbolId::TokenFloat; using log_surgeon::SymbolId::TokenInt; using log_surgeon::wildcard_query_parser::QueryInterpretation; diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index e680b23b4b..7124adf80c 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -11,14 +11,13 @@ #include #include +#include +#include #include #include - -#include "Defs.h" -#include "EncodedVariableInterpreter.hpp" -#include "LogTypeDictionaryReaderReq.hpp" -#include "Query.hpp" -#include "VariableDictionaryReaderReq.hpp" +#include +#include +#include namespace clp { #ifdef CLP_BUILD_TESTING diff --git a/components/core/tests/MockLogTypeDictionary.hpp b/components/core/tests/MockLogTypeDictionary.hpp index 90c1256007..60b8f071e6 100644 --- a/components/core/tests/MockLogTypeDictionary.hpp +++ b/components/core/tests/MockLogTypeDictionary.hpp @@ -7,9 +7,9 @@ #include #include -#include "../src/clp/Defs.h" -#include "../src/clp/EncodedVariableInterpreter.hpp" -#include "../src/clp/string_utils/string_utils.hpp" +#include +#include +#include /** * Simple helper class representing a mock logtype dictionary entry for unit tests. diff --git a/components/core/tests/MockVariableDictionary.hpp b/components/core/tests/MockVariableDictionary.hpp index 40c7795a49..4f9269b3b4 100644 --- a/components/core/tests/MockVariableDictionary.hpp +++ b/components/core/tests/MockVariableDictionary.hpp @@ -7,8 +7,8 @@ #include #include -#include "../src/clp/Defs.h" -#include "../src/clp/string_utils/string_utils.hpp" +#include +#include /** * Simple helper class representing a mock variable dictionary entry for unit tests. 
diff --git a/components/core/tests/search_test_utils.cpp b/components/core/tests/search_test_utils.cpp index fc25fabad1..2325c7ddf0 100644 --- a/components/core/tests/search_test_utils.cpp +++ b/components/core/tests/search_test_utils.cpp @@ -11,9 +11,10 @@ #include -#include "../src/clp/Defs.h" -#include "../src/clp/EncodedVariableInterpreter.hpp" -#include "../src/clp/Query.hpp" +#include +#include +#include + #include "MockLogTypeDictionary.hpp" #include "MockVariableDictionary.hpp" diff --git a/components/core/tests/search_test_utils.hpp b/components/core/tests/search_test_utils.hpp index 14b146e6da..4f08c090bf 100644 --- a/components/core/tests/search_test_utils.hpp +++ b/components/core/tests/search_test_utils.hpp @@ -10,8 +10,9 @@ #include #include -#include "../src/clp/Defs.h" -#include "../src/clp/Query.hpp" +#include +#include + #include "MockLogTypeDictionary.hpp" #include "MockVariableDictionary.hpp" diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 1ddb0be345..676852186a 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -6,13 +6,13 @@ #include #include +#include +#include #include #include #include #include -#include "../src/clp/Defs.h" -#include "../src/clp/GrepCore.hpp" #include "search_test_utils.hpp" using clp::epochtime_t; diff --git a/components/core/tests/test-SchemaSearcher.cpp b/components/core/tests/test-SchemaSearcher.cpp index 6c89295112..fa9c4db83a 100644 --- a/components/core/tests/test-SchemaSearcher.cpp +++ b/components/core/tests/test-SchemaSearcher.cpp @@ -7,13 +7,13 @@ #include #include +#include #include #include +#include +#include +#include -#include "../src/clp/LogTypeDictionaryReaderReq.hpp" -#include "../src/clp/Query.hpp" -#include "../src/clp/SchemaSearcher.hpp" -#include "../src/clp/VariableDictionaryReaderReq.hpp" #include "search_test_utils.hpp" using clp::LogTypeDictionaryReaderReq; From 
426966cdf525d0db04890e012ba39f07c5ead1ae Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 11 Jan 2026 06:04:42 -0500 Subject: [PATCH 076/164] Reverse order to have shorter branch first. --- components/core/src/clp/GrepCore.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 9a3ef64e11..188d5b8442 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -147,7 +147,10 @@ std::optional GrepCore::process_raw_query( bool use_heuristic ) { std::vector sub_queries; - if (use_heuristic) { + if (false == use_heuristic) { + sub_queries + = SchemaSearcher::search(search_string, lexer, logtype_dict, var_dict, ignore_case); + } else { // Split search_string into tokens with wildcards std::vector query_tokens; size_t begin_pos = 0; @@ -232,9 +235,6 @@ std::optional GrepCore::process_raw_query( } } } - } else { - sub_queries - = SchemaSearcher::search(search_string, lexer, logtype_dict, var_dict, ignore_case); } if (sub_queries.empty()) { From 510c1761d198407f00e7e6d23009397bf04a346b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 11 Jan 2026 07:01:57 -0500 Subject: [PATCH 077/164] Update cmake. 
--- components/core/CMakeLists.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index e76846c995..465bd8b92b 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -741,7 +741,15 @@ set(SOURCE_FILES_unitTest tests/test-Utils.cpp ) -if(CLP_BUILD_TESTING) +if(CLP_IS_TOP_LEVEL) + include(CTest) +endif() + +if(BUILD_TESTING AND CLP_BUILD_TESTING) + set(CLP_ENABLE_TESTS ON) +endif() + +if(CLP_ENABLE_TESTS) add_executable(unitTest ${SOURCE_FILES_unitTest} ${SOURCE_FILES_clp_s_unitTest} From db92c0d157401b58c8c8dfa945a9dc9ac277ce43 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 11 Jan 2026 07:10:02 -0500 Subject: [PATCH 078/164] Move todo to issue. --- components/core/src/clp/SchemaSearcher.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 7124adf80c..bed711d219 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -40,7 +40,6 @@ class SchemaSearcher { LogTypeDictionaryReaderType const& logtype_dict, VariableDictionaryReaderType const& var_dict, bool ignore_case) -> std::vector { - // TODO: Optimize such that interpretations are only generated once per schema. log_surgeon::wildcard_query_parser::Query const query{search_string}; auto const interpretations{query.get_all_multi_token_interpretations(lexer)}; auto const normalized_interpretations{normalize_interpretations(interpretations)}; From 934889a00ee54878c2acb7ec8a6699879e74af0e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 11 Jan 2026 07:12:00 -0500 Subject: [PATCH 079/164] Reword comments. 
--- components/core/tests/MockLogTypeDictionary.hpp | 4 ++-- components/core/tests/MockVariableDictionary.hpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/components/core/tests/MockLogTypeDictionary.hpp b/components/core/tests/MockLogTypeDictionary.hpp index 60b8f071e6..b62db85841 100644 --- a/components/core/tests/MockLogTypeDictionary.hpp +++ b/components/core/tests/MockLogTypeDictionary.hpp @@ -12,7 +12,7 @@ #include /** - * Simple helper class representing a mock logtype dictionary entry for unit tests. + * Helper class representing a mock logtype dictionary entry for unit tests. * * Adheres to `LogtypeDictionaryEntryReq`. */ @@ -65,7 +65,7 @@ class MockLogTypeEntry { }; /** - * Simple helper class representing a mock logtype dictionary for unit tests. + * Helper class representing a mock logtype dictionary for unit tests. * * Provides a method for adding entries and adheres to `LogtypeDictionaryReaderReq`. */ diff --git a/components/core/tests/MockVariableDictionary.hpp b/components/core/tests/MockVariableDictionary.hpp index 4f9269b3b4..9da263d0b5 100644 --- a/components/core/tests/MockVariableDictionary.hpp +++ b/components/core/tests/MockVariableDictionary.hpp @@ -11,7 +11,7 @@ #include /** - * Simple helper class representing a mock variable dictionary entry for unit tests. + * Helper class representing a mock variable dictionary entry for unit tests. * * Adheres to `VariableDictionaryEntryReq`. */ @@ -31,7 +31,7 @@ class MockVariableEntry { }; /** - * Simple helper class representing a mock variable dictionary for unit tests. + * Helper class representing a mock variable dictionary for unit tests. * * Provides a method for adding entries and adheres to `VariableDictionaryReaderReq`. */ From 21046b061c528a184034c3733bff83d4ce0c9168 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 11 Jan 2026 07:13:41 -0500 Subject: [PATCH 080/164] Remove std. 
--- components/core/src/clp/SchemaSearcher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/SchemaSearcher.cpp b/components/core/src/clp/SchemaSearcher.cpp index bf5b844a07..a13f107443 100644 --- a/components/core/src/clp/SchemaSearcher.cpp +++ b/components/core/src/clp/SchemaSearcher.cpp @@ -30,7 +30,7 @@ auto SchemaSearcher::normalize_interpretations(set const& i QueryInterpretation normalized_interpretation; for (auto const& token : interpretation.get_logtype()) { auto const& src_string{std::visit( - [](auto const& tok) -> std::string const& { return tok.get_query_substring(); }, + [](auto const& tok) -> string const& { return tok.get_query_substring(); }, token )}; string normalized_string; From b27b92bfa77606bd5d284f88c95c45c39efa9884 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 11 Jan 2026 07:22:51 -0500 Subject: [PATCH 081/164] Update includes. --- components/core/src/clp/GrepCore.cpp | 2 +- components/core/src/clp/GrepCore.hpp | 17 ++++++++--------- components/core/src/clp/Query.hpp | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index 394b2db5b8..e8102a2388 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -4,9 +4,9 @@ #include #include +#include #include -#include "ir/parsing.hpp" using clp::ir::is_delim; using clp::string_utils::is_alphabet; diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 188d5b8442..59d60c6880 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -9,19 +9,18 @@ #include #include +#include +#include +#include +#include +#include #include +#include +#include #include #include #include - -#include "Defs.h" -#include "EncodedVariableInterpreter.hpp" -#include "ir/parsing.hpp" -#include "ir/types.hpp" -#include "LogTypeDictionaryReaderReq.hpp" -#include 
"Query.hpp" -#include "QueryToken.hpp" -#include "VariableDictionaryReaderReq.hpp" +#include namespace clp { class GrepCore { diff --git a/components/core/src/clp/Query.hpp b/components/core/src/clp/Query.hpp index 51457b0508..13cdc61a97 100644 --- a/components/core/src/clp/Query.hpp +++ b/components/core/src/clp/Query.hpp @@ -7,7 +7,7 @@ #include #include -#include "Defs.h" +#include namespace clp { /** From 65e683ded377bad755609d53dbfddfdf9b9b9b9c Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 11 Jan 2026 07:27:27 -0500 Subject: [PATCH 082/164] Remove unused var. --- components/core/src/clp/GrepCore.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 59d60c6880..6ece674dea 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -197,7 +197,6 @@ std::optional GrepCore::process_raw_query( // - (token1 as logtype) (token2 as var) // - (token1 as var) (token2 as logtype) // - (token1 as var) (token2 as var) - std::string logtype; bool type_of_one_token_changed = true; while (type_of_one_token_changed) { SubQuery sub_query; From 07fff20a6ce2a1dfd2572c4779afa40a6564a6ca Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 11 Jan 2026 07:29:59 -0500 Subject: [PATCH 083/164] Add missing header. --- components/core/src/clp/SchemaSearcher.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index bed711d219..0e5bfe9060 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include From 6cc8df9e988dfa95ca2cd3919e4be38c248e1628 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 11 Jan 2026 07:31:53 -0500 Subject: [PATCH 084/164] Remove unused headers. 
--- components/core/src/clp/SchemaSearcher.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/components/core/src/clp/SchemaSearcher.cpp b/components/core/src/clp/SchemaSearcher.cpp index a13f107443..ae15e07450 100644 --- a/components/core/src/clp/SchemaSearcher.cpp +++ b/components/core/src/clp/SchemaSearcher.cpp @@ -1,10 +1,8 @@ #include "SchemaSearcher.hpp" -#include #include #include #include -#include #include #include @@ -19,7 +17,6 @@ using log_surgeon::wildcard_query_parser::VariableQueryToken; using std::holds_alternative; using std::set; using std::string; -using std::unordered_map; using std::vector; namespace clp { From b6312f0fdbd2e5adb5aa5436fe6114cd0f52dff6 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 12 Jan 2026 11:13:14 -0500 Subject: [PATCH 085/164] Fix headers. --- components/core/src/clp/GrepCore.cpp | 2 +- components/core/src/clp/GrepCore.hpp | 18 +++++++++--------- components/core/src/clp/Query.hpp | 2 +- components/core/src/clp/SchemaSearcher.cpp | 2 +- components/core/src/clp/SchemaSearcher.hpp | 10 +++++----- .../core/tests/MockLogTypeDictionary.hpp | 4 ++-- .../core/tests/MockVariableDictionary.hpp | 2 +- components/core/tests/search_test_utils.cpp | 6 +++--- components/core/tests/search_test_utils.hpp | 4 ++-- components/core/tests/test-GrepCore.cpp | 4 ++-- components/core/tests/test-SchemaSearcher.cpp | 8 ++++---- 11 files changed, 31 insertions(+), 31 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index e8102a2388..1b353e39c1 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -4,7 +4,7 @@ #include #include -#include +#include #include diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 6ece674dea..3f57a79eb5 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -9,18 +9,18 @@ #include #include -#include -#include -#include -#include 
-#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include -#include -#include #include #include -#include namespace clp { class GrepCore { diff --git a/components/core/src/clp/Query.hpp b/components/core/src/clp/Query.hpp index 13cdc61a97..eef27c062d 100644 --- a/components/core/src/clp/Query.hpp +++ b/components/core/src/clp/Query.hpp @@ -7,7 +7,7 @@ #include #include -#include +#include namespace clp { /** diff --git a/components/core/src/clp/SchemaSearcher.cpp b/components/core/src/clp/SchemaSearcher.cpp index ae15e07450..dc1f9e769e 100644 --- a/components/core/src/clp/SchemaSearcher.cpp +++ b/components/core/src/clp/SchemaSearcher.cpp @@ -6,7 +6,7 @@ #include #include -#include +#include #include using log_surgeon::SymbolId::TokenFloat; diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 0e5bfe9060..0e6d7eef7f 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -12,13 +12,13 @@ #include #include -#include -#include +#include +#include +#include +#include +#include #include #include -#include -#include -#include namespace clp { #ifdef CLP_BUILD_TESTING diff --git a/components/core/tests/MockLogTypeDictionary.hpp b/components/core/tests/MockLogTypeDictionary.hpp index b62db85841..1490326899 100644 --- a/components/core/tests/MockLogTypeDictionary.hpp +++ b/components/core/tests/MockLogTypeDictionary.hpp @@ -7,8 +7,8 @@ #include #include -#include -#include +#include +#include #include /** diff --git a/components/core/tests/MockVariableDictionary.hpp b/components/core/tests/MockVariableDictionary.hpp index 9da263d0b5..9d683ed245 100644 --- a/components/core/tests/MockVariableDictionary.hpp +++ b/components/core/tests/MockVariableDictionary.hpp @@ -7,7 +7,7 @@ #include #include -#include +#include #include /** diff --git a/components/core/tests/search_test_utils.cpp 
b/components/core/tests/search_test_utils.cpp index 2325c7ddf0..adba8be389 100644 --- a/components/core/tests/search_test_utils.cpp +++ b/components/core/tests/search_test_utils.cpp @@ -11,9 +11,9 @@ #include -#include -#include -#include +#include +#include +#include #include "MockLogTypeDictionary.hpp" #include "MockVariableDictionary.hpp" diff --git a/components/core/tests/search_test_utils.hpp b/components/core/tests/search_test_utils.hpp index 4f08c090bf..1602d8eb99 100644 --- a/components/core/tests/search_test_utils.hpp +++ b/components/core/tests/search_test_utils.hpp @@ -10,8 +10,8 @@ #include #include -#include -#include +#include +#include #include "MockLogTypeDictionary.hpp" #include "MockVariableDictionary.hpp" diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 676852186a..aed88e8d4b 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -6,8 +6,8 @@ #include #include -#include -#include +#include +#include #include #include #include diff --git a/components/core/tests/test-SchemaSearcher.cpp b/components/core/tests/test-SchemaSearcher.cpp index fa9c4db83a..c3d94e03a6 100644 --- a/components/core/tests/test-SchemaSearcher.cpp +++ b/components/core/tests/test-SchemaSearcher.cpp @@ -7,12 +7,12 @@ #include #include -#include +#include +#include +#include +#include #include #include -#include -#include -#include #include "search_test_utils.hpp" From 34bdf28a9b2e251fca4a22dbf3ce687199ce6f3c Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 12 Jan 2026 11:23:18 -0500 Subject: [PATCH 086/164] Group clp headers separately.
--- components/core/src/clp/GrepCore.cpp | 2 +- components/core/src/clp/GrepCore.hpp | 7 ++++--- components/core/src/clp/SchemaSearcher.cpp | 3 ++- components/core/src/clp/SchemaSearcher.hpp | 5 +++-- components/core/tests/MockLogTypeDictionary.hpp | 3 ++- components/core/tests/MockVariableDictionary.hpp | 3 ++- components/core/tests/test-GrepCore.cpp | 5 +++-- components/core/tests/test-SchemaSearcher.cpp | 5 +++-- 8 files changed, 20 insertions(+), 13 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index 1b353e39c1..cde0f69d93 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -4,9 +4,9 @@ #include #include -#include #include +#include using clp::ir::is_delim; using clp::string_utils::is_alphabet; diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 3f57a79eb5..031b8cc2cb 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -9,6 +9,10 @@ #include #include +#include +#include +#include + #include #include #include @@ -18,9 +22,6 @@ #include #include #include -#include -#include -#include namespace clp { class GrepCore { diff --git a/components/core/src/clp/SchemaSearcher.cpp b/components/core/src/clp/SchemaSearcher.cpp index dc1f9e769e..f4888c0dd2 100644 --- a/components/core/src/clp/SchemaSearcher.cpp +++ b/components/core/src/clp/SchemaSearcher.cpp @@ -6,9 +6,10 @@ #include #include -#include #include +#include + using log_surgeon::SymbolId::TokenFloat; using log_surgeon::SymbolId::TokenInt; using log_surgeon::wildcard_query_parser::QueryInterpretation; diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 0e6d7eef7f..6f6080669e 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -12,13 +12,14 @@ #include #include +#include +#include + #include #include #include #include 
#include -#include -#include namespace clp { #ifdef CLP_BUILD_TESTING diff --git a/components/core/tests/MockLogTypeDictionary.hpp b/components/core/tests/MockLogTypeDictionary.hpp index 1490326899..40ed0b5512 100644 --- a/components/core/tests/MockLogTypeDictionary.hpp +++ b/components/core/tests/MockLogTypeDictionary.hpp @@ -7,9 +7,10 @@ #include #include +#include + #include #include -#include /** * Helper class representing a mock logtype dictionary entry for unit tests. diff --git a/components/core/tests/MockVariableDictionary.hpp b/components/core/tests/MockVariableDictionary.hpp index 9d683ed245..e996d0807c 100644 --- a/components/core/tests/MockVariableDictionary.hpp +++ b/components/core/tests/MockVariableDictionary.hpp @@ -7,9 +7,10 @@ #include #include -#include #include +#include + /** * Helper class representing a mock variable dictionary entry for unit tests. * diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index aed88e8d4b..32e376aaba 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -6,13 +6,14 @@ #include #include -#include -#include #include #include #include #include +#include +#include + #include "search_test_utils.hpp" using clp::epochtime_t; diff --git a/components/core/tests/test-SchemaSearcher.cpp b/components/core/tests/test-SchemaSearcher.cpp index c3d94e03a6..adddacfd1e 100644 --- a/components/core/tests/test-SchemaSearcher.cpp +++ b/components/core/tests/test-SchemaSearcher.cpp @@ -7,12 +7,13 @@ #include #include +#include +#include + #include #include #include #include -#include -#include #include "search_test_utils.hpp" From caf160fe034da3445480872f802389293907fdbc Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 13 Jan 2026 19:13:22 -0500 Subject: [PATCH 087/164] Fix clp compile error by reverting to old include style for Query.hpp. 
--- components/core/src/clp/Query.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/Query.hpp b/components/core/src/clp/Query.hpp index eef27c062d..51457b0508 100644 --- a/components/core/src/clp/Query.hpp +++ b/components/core/src/clp/Query.hpp @@ -7,7 +7,7 @@ #include #include -#include +#include "Defs.h" namespace clp { /** From 45294c4bd1639754acbc8a2a45c90afba05785ca Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 13 Jan 2026 19:18:46 -0500 Subject: [PATCH 088/164] Fix clp, clo, clg, to use new include style. --- components/core/src/clp/Query.hpp | 2 +- components/core/src/clp/clg/CMakeLists.txt | 1 + components/core/src/clp/clo/CMakeLists.txt | 1 + components/core/src/clp/clp/CMakeLists.txt | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/components/core/src/clp/Query.hpp b/components/core/src/clp/Query.hpp index 51457b0508..eef27c062d 100644 --- a/components/core/src/clp/Query.hpp +++ b/components/core/src/clp/Query.hpp @@ -7,7 +7,7 @@ #include #include -#include "Defs.h" +#include namespace clp { /** diff --git a/components/core/src/clp/clg/CMakeLists.txt b/components/core/src/clp/clg/CMakeLists.txt index ec1f471f71..4cd2f2f1ae 100644 --- a/components/core/src/clp/clg/CMakeLists.txt +++ b/components/core/src/clp/clg/CMakeLists.txt @@ -145,6 +145,7 @@ if(CLP_BUILD_EXECUTABLES) target_compile_features(clg PRIVATE cxx_std_20) target_include_directories(clg PRIVATE + ../../ "${CLP_SQLITE3_INCLUDE_DIRECTORY}" ) target_link_libraries(clg diff --git a/components/core/src/clp/clo/CMakeLists.txt b/components/core/src/clp/clo/CMakeLists.txt index f353f8169e..4bb624883d 100644 --- a/components/core/src/clp/clo/CMakeLists.txt +++ b/components/core/src/clp/clo/CMakeLists.txt @@ -171,6 +171,7 @@ if(CLP_BUILD_EXECUTABLES) target_compile_features(clo PRIVATE cxx_std_20) target_include_directories(clo PRIVATE + ../../ "${CLP_SQLITE3_INCLUDE_DIRECTORY}" ) target_link_libraries(clo diff --git 
a/components/core/src/clp/clp/CMakeLists.txt b/components/core/src/clp/clp/CMakeLists.txt index 27f37f49b7..75e6bb4167 100644 --- a/components/core/src/clp/clp/CMakeLists.txt +++ b/components/core/src/clp/clp/CMakeLists.txt @@ -180,6 +180,7 @@ if(CLP_BUILD_EXECUTABLES) target_compile_features(clp PRIVATE cxx_std_20) target_include_directories(clp PRIVATE + ../../ "${CLP_SQLITE3_INCLUDE_DIRECTORY}" ) target_link_libraries(clp From 43b5bd18c68a3f43cacd2cb409de40c9cae4d7a2 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 13 Jan 2026 20:19:43 -0500 Subject: [PATCH 089/164] Make const. --- components/core/src/clp/SchemaSearcher.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 6f6080669e..633692582c 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -191,7 +191,7 @@ auto SchemaSearcher::generate_schema_sub_queries( constexpr size_t cMaxEncodableWildcardVariables{16}; for (auto const& interpretation : interpretations) { auto const logtype{interpretation.get_logtype()}; - auto wildcard_encodable_positions{get_wildcard_encodable_positions(interpretation)}; + auto const wildcard_encodable_positions{get_wildcard_encodable_positions(interpretation)}; if (wildcard_encodable_positions.size() > cMaxEncodableWildcardVariables) { throw std::runtime_error("Too many encodable variables."); } From 3ce21e8c9560cd891bfdd14aeedd8f63e3b81e1a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 13 Jan 2026 20:40:04 -0500 Subject: [PATCH 090/164] Replace static string. 
--- components/core/tests/MockVariableDictionary.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/tests/MockVariableDictionary.hpp b/components/core/tests/MockVariableDictionary.hpp index e996d0807c..3682e3be51 100644 --- a/components/core/tests/MockVariableDictionary.hpp +++ b/components/core/tests/MockVariableDictionary.hpp @@ -46,12 +46,11 @@ class MockVariableDictionary { } [[nodiscard]] auto get_value(dictionary_id_t const id) const -> std::string const& { - static std::string const cEmpty{}; auto const it{m_storage.find(id)}; if (m_storage.end() != it) { return it->second.get_value(); } - return cEmpty; + return m_empty_string; } auto @@ -80,6 +79,7 @@ class MockVariableDictionary { private: std::unordered_map m_storage; + std::string m_empty_string; }; #endif // MOCK_VARIABLE_DICTIONARY_HPP From dc414a0d4768dbe668dd31ddbef489ef2926a03b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 13 Jan 2026 20:46:19 -0500 Subject: [PATCH 091/164] Switch to deque. --- components/core/tests/MockLogTypeDictionary.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/components/core/tests/MockLogTypeDictionary.hpp b/components/core/tests/MockLogTypeDictionary.hpp index 40ed0b5512..8895120b28 100644 --- a/components/core/tests/MockLogTypeDictionary.hpp +++ b/components/core/tests/MockLogTypeDictionary.hpp @@ -2,6 +2,7 @@ #define MOCK_LOGTYPE_DICTIONARY_HPP #include +#include #include #include #include @@ -106,7 +107,7 @@ class MockLogTypeDictionary { } private: - std::vector m_storage; + std::deque m_storage; }; #endif // MOCK_LOGTYPE_DICTIONARY_HPP From 71a4a8f9e6b8f14546729ae5466999bf2fbcb18b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 13 Jan 2026 21:15:14 -0500 Subject: [PATCH 092/164] Use traceable exception. 
--- components/core/src/clp/SchemaSearcher.hpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 633692582c..b06b34a362 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -17,8 +17,10 @@ #include #include +#include #include #include +#include #include namespace clp { @@ -32,6 +34,16 @@ class SchemaSearcher { #endif public: + class OperationFailed : public TraceableException { + public: + OperationFailed(ErrorCode error_code, char const* const filename, int line_number) + : TraceableException(error_code, filename, line_number) {} + + char const* what() const noexcept override { + return "Too many encodable variables."; + } + }; + template < LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, VariableDictionaryReaderReq VariableDictionaryReaderType @@ -104,7 +116,7 @@ class SchemaSearcher { * @param var_dict The variable dictionary. * @param ignore_case If true, perform a case-insensitive search. * @return The vector of subqueries to compare against CLP's archives. - * @throw std::runtime_error If there are too many candidate combinations. + * @throw clp::TraceableException If there are too many candidate combinations. 
*/ template < LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, @@ -193,7 +205,7 @@ auto SchemaSearcher::generate_schema_sub_queries( auto const logtype{interpretation.get_logtype()}; auto const wildcard_encodable_positions{get_wildcard_encodable_positions(interpretation)}; if (wildcard_encodable_positions.size() > cMaxEncodableWildcardVariables) { - throw std::runtime_error("Too many encodable variables."); + throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); } uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; for (uint64_t mask{0}; mask < num_combos; ++mask) { From 74e4688c31a93a5925463824e28bba7f03ed97ea Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 13 Jan 2026 21:22:25 -0500 Subject: [PATCH 093/164] Add consts. --- components/core/src/clp/SchemaSearcher.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index b06b34a362..8494d8401a 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -214,7 +214,7 @@ auto SchemaSearcher::generate_schema_sub_queries( mask_encoded_flags[wildcard_encodable_positions[i]] = (mask >> i) & 1ULL; } - auto logtype_string{generate_logtype_string( + auto const logtype_string{generate_logtype_string( interpretation, wildcard_encodable_positions, mask_encoded_flags @@ -319,7 +319,7 @@ auto SchemaSearcher::process_schema_var_token( return true; } - auto entries = var_dict.get_entry_matching_value(raw_string, ignore_case); + auto const entries{var_dict.get_entry_matching_value(raw_string, ignore_case)}; if (entries.empty()) { return false; } From 2ddc90ac43a85043ec0c02da407123455cc9ac3e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 13 Jan 2026 23:21:21 -0500 Subject: [PATCH 094/164] Fix build. 
--- components/core/src/clp_s/CMakeLists.txt | 3 +++ components/core/src/glt/glt/CMakeLists.txt | 1 + 2 files changed, 4 insertions(+) diff --git a/components/core/src/clp_s/CMakeLists.txt b/components/core/src/clp_s/CMakeLists.txt index ba6f762a54..4bd85b1b2a 100644 --- a/components/core/src/clp_s/CMakeLists.txt +++ b/components/core/src/clp_s/CMakeLists.txt @@ -126,6 +126,7 @@ if(CLP_BUILD_CLP_S_CLP_DEPENDENCIES) ) add_library(clp_s::clp_dependencies ALIAS clp_s_clp_dependencies) target_compile_features(clp_s_clp_dependencies PRIVATE cxx_std_20) + target_include_directories(clp_s_clp_dependencies PUBLIC ../) target_link_libraries( clp_s_clp_dependencies PUBLIC @@ -177,6 +178,7 @@ if(CLP_BUILD_CLP_S_REDUCER_DEPENDENCIES) ) add_library(clp_s::reducer_dependencies ALIAS clp_s_reducer_dependencies) target_compile_features(clp_s_reducer_dependencies PRIVATE cxx_std_20) + target_include_directories(clp_s_reducer_dependencies PUBLIC ../) target_link_libraries( clp_s_reducer_dependencies PUBLIC @@ -431,6 +433,7 @@ if(CLP_BUILD_EXECUTABLES) ${CLP_S_EXE_SOURCES} ) target_compile_features(clp-s PRIVATE cxx_std_20) + target_include_directories(clp-s PRIVATE ../) target_link_libraries( clp-s PRIVATE diff --git a/components/core/src/glt/glt/CMakeLists.txt b/components/core/src/glt/glt/CMakeLists.txt index 6936dab8ea..f3cbdfee17 100644 --- a/components/core/src/glt/glt/CMakeLists.txt +++ b/components/core/src/glt/glt/CMakeLists.txt @@ -176,6 +176,7 @@ if(CLP_BUILD_EXECUTABLES) target_compile_features(glt PRIVATE cxx_std_20) target_include_directories(glt PRIVATE + ../../ "${CLP_SQLITE3_INCLUDE_DIRECTORY}" ) target_link_libraries(glt From dc542fe496fbafc392a73fcf1b03619b0f58da37 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 13 Jan 2026 23:24:42 -0500 Subject: [PATCH 095/164] Format. 
--- components/core/src/clp/SchemaSearcher.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 8494d8401a..25fe54c163 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -39,9 +39,7 @@ class SchemaSearcher { OperationFailed(ErrorCode error_code, char const* const filename, int line_number) : TraceableException(error_code, filename, line_number) {} - char const* what() const noexcept override { - return "Too many encodable variables."; - } + char const* what() const noexcept override { return "Too many encodable variables."; } }; template < From aee311c57fbd2cb4ca6fe0981a88d1849b07cefd Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Tue, 13 Jan 2026 23:37:57 -0500 Subject: [PATCH 096/164] Fix build errors. --- .../core/src/clp/make_dictionaries_readable/CMakeLists.txt | 1 + components/core/src/clp_s/indexer/CMakeLists.txt | 1 + components/core/src/clp_s/log_converter/CMakeLists.txt | 1 + components/core/src/reducer/CMakeLists.txt | 1 + 4 files changed, 4 insertions(+) diff --git a/components/core/src/clp/make_dictionaries_readable/CMakeLists.txt b/components/core/src/clp/make_dictionaries_readable/CMakeLists.txt index 696b66b559..34b09d5108 100644 --- a/components/core/src/clp/make_dictionaries_readable/CMakeLists.txt +++ b/components/core/src/clp/make_dictionaries_readable/CMakeLists.txt @@ -74,6 +74,7 @@ set( if(CLP_BUILD_EXECUTABLES) add_executable(make-dictionaries-readable ${MAKE_DICTIONARIES_READABLE_SOURCES}) target_compile_features(make-dictionaries-readable PRIVATE cxx_std_20) + target_include_directories(make-dictionaries-readable PRIVATE ../../) target_link_libraries(make-dictionaries-readable PRIVATE clp::string_utils diff --git a/components/core/src/clp_s/indexer/CMakeLists.txt b/components/core/src/clp_s/indexer/CMakeLists.txt index ba21f50a26..916313bfcc 100644 --- 
a/components/core/src/clp_s/indexer/CMakeLists.txt +++ b/components/core/src/clp_s/indexer/CMakeLists.txt @@ -126,6 +126,7 @@ set( if(CLP_BUILD_EXECUTABLES) add_executable(indexer ${INDEXER_SOURCES}) target_compile_features(indexer PRIVATE cxx_std_20) + target_include_directories(indexer PRIVATE ../../) target_link_libraries(indexer PRIVATE absl::flat_hash_map diff --git a/components/core/src/clp_s/log_converter/CMakeLists.txt b/components/core/src/clp_s/log_converter/CMakeLists.txt index 3d78443412..acb05cacc8 100644 --- a/components/core/src/clp_s/log_converter/CMakeLists.txt +++ b/components/core/src/clp_s/log_converter/CMakeLists.txt @@ -15,6 +15,7 @@ if(CLP_BUILD_EXECUTABLES) ${CLP_S_LOG_CONVERTER_SOURCES} ) target_compile_features(log-converter PRIVATE cxx_std_20) + target_include_directories(log-converter PRIVATE ../../) target_link_libraries( log-converter PRIVATE diff --git a/components/core/src/reducer/CMakeLists.txt b/components/core/src/reducer/CMakeLists.txt index ec830624b8..66cf5b8fd3 100644 --- a/components/core/src/reducer/CMakeLists.txt +++ b/components/core/src/reducer/CMakeLists.txt @@ -41,6 +41,7 @@ set( if(CLP_BUILD_EXECUTABLES) add_executable(reducer-server ${REDUCER_SOURCES}) target_compile_features(reducer-server PRIVATE cxx_std_20) + target_include_directories(reducer-server PRIVATE ../) target_link_libraries(reducer-server PRIVATE Boost::program_options From 18499c245d4f1ba5221773ddddb09b0f96d4cc27 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 16 Jan 2026 09:03:31 -0500 Subject: [PATCH 097/164] Use braced initialization. 
--- components/core/src/clp/GrepCore.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 031b8cc2cb..4b76c57c00 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -153,9 +153,9 @@ std::optional GrepCore::process_raw_query( } else { // Split search_string into tokens with wildcards std::vector query_tokens; - size_t begin_pos = 0; - size_t end_pos = 0; - bool is_var; + size_t begin_pos{0}; + size_t end_pos{0}; + bool is_var{false}; std::string search_string_for_sub_queries{search_string}; // Replace unescaped '?' wildcards with '*' wildcards since we currently have no support for From 12c926d82b889d518c2882a0b72ca431eae96af9 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 16 Jan 2026 09:04:09 -0500 Subject: [PATCH 098/164] Use braced initialization, again. --- components/core/src/clp/GrepCore.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 4b76c57c00..fb0717d4c1 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -198,7 +198,7 @@ std::optional GrepCore::process_raw_query( // - (token1 as logtype) (token2 as var) // - (token1 as var) (token2 as logtype) // - (token1 as var) (token2 as var) - bool type_of_one_token_changed = true; + bool type_of_one_token_changed{true}; while (type_of_one_token_changed) { SubQuery sub_query; From 68660b5f2bf8aa86de5a8afdc86b1341cda26efb Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 16 Jan 2026 09:05:15 -0500 Subject: [PATCH 099/164] Remove obvious comment. 
--- components/core/src/clp/GrepCore.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index fb0717d4c1..758fb8e5d1 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -201,8 +201,6 @@ std::optional GrepCore::process_raw_query( bool type_of_one_token_changed{true}; while (type_of_one_token_changed) { SubQuery sub_query; - - // Compute logtypes and variables for query auto matchability = generate_logtypes_and_vars_for_subquery( logtype_dict, var_dict, From 9425afc1777704d6c6376361af12d7ecf6155ada Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 16 Jan 2026 09:05:54 -0500 Subject: [PATCH 100/164] Use braced initialization, again x2. --- components/core/src/clp/GrepCore.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 758fb8e5d1..ff979cc0b7 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -201,14 +201,14 @@ std::optional GrepCore::process_raw_query( bool type_of_one_token_changed{true}; while (type_of_one_token_changed) { SubQuery sub_query; - auto matchability = generate_logtypes_and_vars_for_subquery( + auto matchability{generate_logtypes_and_vars_for_subquery( logtype_dict, var_dict, search_string_for_sub_queries, query_tokens, ignore_case, sub_query - ); + )}; switch (matchability) { case SubQueryMatchabilityResult::SupercedesAllSubQueries: // Since other sub-queries will be superceded by this one, we can stop From fa8e353911dbb55c4cb59d028d435e4d040e2274 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 16 Jan 2026 09:06:25 -0500 Subject: [PATCH 101/164] Add full stop. 
--- components/core/src/clp/GrepCore.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index ff979cc0b7..8254095f8d 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -212,7 +212,7 @@ std::optional GrepCore::process_raw_query( switch (matchability) { case SubQueryMatchabilityResult::SupercedesAllSubQueries: // Since other sub-queries will be superceded by this one, we can stop - // processing now + // processing now. return Query{search_begin_ts, search_end_ts, ignore_case, search_string, {}}; case SubQueryMatchabilityResult::MayMatch: sub_queries.push_back(std::move(sub_query)); From 3fde49f4ec03bd6445364e321445d217d90d53b6 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 16 Jan 2026 09:06:53 -0500 Subject: [PATCH 102/164] Remove obvious comment, again. --- components/core/src/clp/GrepCore.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/components/core/src/clp/GrepCore.hpp b/components/core/src/clp/GrepCore.hpp index 8254095f8d..55a154d74d 100644 --- a/components/core/src/clp/GrepCore.hpp +++ b/components/core/src/clp/GrepCore.hpp @@ -219,7 +219,6 @@ std::optional GrepCore::process_raw_query( break; case SubQueryMatchabilityResult::WontMatch: default: - // Do nothing break; } From 435a008377571441eb8239b9883ec2800df7a43a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 16 Jan 2026 09:09:25 -0500 Subject: [PATCH 103/164] Move using into namespace. 
--- components/core/src/clp/SchemaSearcher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/SchemaSearcher.cpp b/components/core/src/clp/SchemaSearcher.cpp index f4888c0dd2..a795b3ab6a 100644 --- a/components/core/src/clp/SchemaSearcher.cpp +++ b/components/core/src/clp/SchemaSearcher.cpp @@ -10,6 +10,7 @@ #include +namespace clp { using log_surgeon::SymbolId::TokenFloat; using log_surgeon::SymbolId::TokenInt; using log_surgeon::wildcard_query_parser::QueryInterpretation; @@ -20,7 +21,6 @@ using std::set; using std::string; using std::vector; -namespace clp { auto SchemaSearcher::normalize_interpretations(set const& interpretations) -> set { set normalized_interpretations; From fc47f236ef2f94393ba20543847d329a5755ce7b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 16 Jan 2026 09:10:16 -0500 Subject: [PATCH 104/164] Use pre-increment over post-increment. --- components/core/src/clp/SchemaSearcher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/SchemaSearcher.cpp b/components/core/src/clp/SchemaSearcher.cpp index a795b3ab6a..b26610ea04 100644 --- a/components/core/src/clp/SchemaSearcher.cpp +++ b/components/core/src/clp/SchemaSearcher.cpp @@ -95,7 +95,7 @@ auto SchemaSearcher::generate_logtype_string( auto const& static_token{std::get(token)}; logtype_string_size += static_token.get_query_substring().size(); } else { - logtype_string_size++; + ++logtype_string_size; } } logtype_string.reserve(logtype_string_size); From b81360d5383220be1d59a540a3c1a8b2c213b19b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 16 Jan 2026 10:16:06 -0500 Subject: [PATCH 105/164] Switch tuple to struct. 
--- components/core/tests/search_test_utils.hpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/components/core/tests/search_test_utils.hpp b/components/core/tests/search_test_utils.hpp index 1602d8eb99..72f56dd635 100644 --- a/components/core/tests/search_test_utils.hpp +++ b/components/core/tests/search_test_utils.hpp @@ -17,10 +17,13 @@ #include "MockVariableDictionary.hpp" /** - * Type alias for variable information in tests. - * Elements: (is_dict_var, is_precise_var, var_dict_ids) + * Struct for variable information in tests. */ -using VarInfo = std::tuple>; +struct VarInfo { + bool is_dict_var; + bool is_precise_var; + std::unordered_set var_dict_ids; +}; /** * @param entries Vector of (id, value) pairs to populate the variable dictionary. From 5922cb17a0d29f75fb2c639f33669b7455bc43f6 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 18 Jan 2026 05:18:03 -0500 Subject: [PATCH 106/164] Move SchemaSearcherTest class into its own file. --- components/core/CMakeLists.txt | 1 + components/core/tests/SchemaSearcherTest.hpp | 111 ++++++++++++++++++ components/core/tests/test-SchemaSearcher.cpp | 90 +------------- 3 files changed, 113 insertions(+), 89 deletions(-) create mode 100644 components/core/tests/SchemaSearcherTest.hpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index f288e5e9ea..f86d40ea14 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -701,6 +701,7 @@ set(SOURCE_FILES_unitTest tests/LogSuppressor.hpp tests/MockLogTypeDictionary.hpp tests/MockVariableDictionary.hpp + tests/SchemaSearcherTest.hpp tests/search_test_utils.cpp tests/search_test_utils.hpp tests/TestOutputCleaner.hpp diff --git a/components/core/tests/SchemaSearcherTest.hpp b/components/core/tests/SchemaSearcherTest.hpp new file mode 100644 index 0000000000..ae33b97e77 --- /dev/null +++ b/components/core/tests/SchemaSearcherTest.hpp @@ -0,0 +1,111 @@ +#ifndef SCHEMA_SEARCHER_TEST_HPP +#define 
SCHEMA_SEARCHER_TEST_HPP + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include "search_test_utils.hpp" + +using clp::LogTypeDictionaryReaderReq; +using clp::SubQuery; +using clp::VariableDictionaryReaderReq; +using log_surgeon::wildcard_query_parser::QueryInterpretation; +using log_surgeon::wildcard_query_parser::VariableQueryToken; +using std::set; +using std::string; +using std::vector; + +/** + * Helper to expose `SchemaSearcher` functionality for unit-testing. + * + * This class provides static wrappers around `SchemaSearcher` methods, allowing test code to access + * internal logic such as: + * - Finding wildcard encodable positions in a `QueryInterpretation`; + * - Generating logtype strings with wildcard masks; + * - Processing variable tokens with or without encoding; + * - Generating schema-based sub-queries. + * + * All methods forward directly to `SchemaSearcher` and are intended for testing only. + */ +class clp::SchemaSearcherTest { +public: + static auto normalize_interpretations(set const& interpretations) + -> set { + return SchemaSearcher::normalize_interpretations(interpretations); + } + + template < + LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, + VariableDictionaryReaderReq VariableDictionaryReaderType + > + static auto generate_schema_sub_queries( + set const& interpretations, + LogTypeDictionaryReaderType const& logtype_dict, + VariableDictionaryReaderType const& var_dict + ) -> vector { + return SchemaSearcher::generate_schema_sub_queries( + interpretations, + logtype_dict, + var_dict, + false + ); + } + + static auto get_wildcard_encodable_positions(QueryInterpretation const& interpretation) + -> vector { + return SchemaSearcher::get_wildcard_encodable_positions(interpretation); + } + + static auto generate_logtype_string( + QueryInterpretation const& interpretation, + vector const& wildcard_encodable_positions, + vector const& mask_encoded_flags + ) -> string { + return 
SchemaSearcher::generate_logtype_string( + interpretation, + wildcard_encodable_positions, + mask_encoded_flags + ); + } + + template + static auto process_token( + VariableQueryToken const& var_token, + VariableDictionaryReaderType const& var_dict, + SubQuery& sub_query + ) -> bool { + return SchemaSearcher::process_schema_var_token( + var_token, + var_dict, + false, + false, + sub_query + ); + } + + template + static auto process_encoded_token( + VariableQueryToken const& var_token, + VariableDictionaryReaderType const& var_dict, + SubQuery& sub_query + ) -> bool { + return SchemaSearcher::process_schema_var_token( + var_token, + var_dict, + false, + true, + sub_query + ); + } +}; + +#endif // SCHEMA_SEARCHER_TEST_HPP diff --git a/components/core/tests/test-SchemaSearcher.cpp b/components/core/tests/test-SchemaSearcher.cpp index adddacfd1e..fec525e0d2 100644 --- a/components/core/tests/test-SchemaSearcher.cpp +++ b/components/core/tests/test-SchemaSearcher.cpp @@ -10,16 +10,13 @@ #include #include -#include #include #include -#include +#include "SchemaSearcherTest.hpp" #include "search_test_utils.hpp" -using clp::LogTypeDictionaryReaderReq; using clp::SubQuery; -using clp::VariableDictionaryReaderReq; using log_surgeon::SymbolId::TokenFloat; using log_surgeon::SymbolId::TokenInt; using log_surgeon::wildcard_query_parser::QueryInterpretation; @@ -27,7 +24,6 @@ using log_surgeon::wildcard_query_parser::VariableQueryToken; using std::pair; using std::set; using std::string; -using std::string_view; using std::unordered_set; using std::variant; using std::vector; @@ -36,90 +32,6 @@ constexpr uint32_t cIntId{static_cast(TokenInt)}; constexpr uint32_t cFloatId{static_cast(TokenFloat)}; constexpr uint32_t cHasNumId{111}; -/** - * Helper to expose `SchemaSearcher` functionality for unit-testing. 
- * - * This class provides static wrappers around `SchemaSearcher` methods, allowing test code to access - * internal logic such as: - * - Finding wildcard encodable positions in a `QueryInterpretation`; - * - Generating logtype strings with wildcard masks; - * - Processing variable tokens with or without encoding; - * - Generating schema-based sub-queries. - * - * All methods forward directly to `SchemaSearcher` and are intended for testing only. - */ -class clp::SchemaSearcherTest { -public: - static auto normalize_interpretations(set const& interpretations) - -> set { - return SchemaSearcher::normalize_interpretations(interpretations); - } - - template < - LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, - VariableDictionaryReaderReq VariableDictionaryReaderType - > - static auto generate_schema_sub_queries( - set const& interpretations, - LogTypeDictionaryReaderType const& logtype_dict, - VariableDictionaryReaderType const& var_dict - ) -> vector { - return SchemaSearcher::generate_schema_sub_queries( - interpretations, - logtype_dict, - var_dict, - false - ); - } - - static auto get_wildcard_encodable_positions(QueryInterpretation const& interpretation) - -> vector { - return SchemaSearcher::get_wildcard_encodable_positions(interpretation); - } - - static auto generate_logtype_string( - QueryInterpretation const& interpretation, - vector const& wildcard_encodable_positions, - vector const& mask_encoded_flags - ) -> string { - return SchemaSearcher::generate_logtype_string( - interpretation, - wildcard_encodable_positions, - mask_encoded_flags - ); - } - - template - static auto process_token( - VariableQueryToken const& var_token, - VariableDictionaryReaderType const& var_dict, - SubQuery& sub_query - ) -> bool { - return SchemaSearcher::process_schema_var_token( - var_token, - var_dict, - false, - false, - sub_query - ); - } - - template - static auto process_encoded_token( - VariableQueryToken const& var_token, - VariableDictionaryReaderType const& 
var_dict, - SubQuery& sub_query - ) -> bool { - return SchemaSearcher::process_schema_var_token( - var_token, - var_dict, - false, - true, - sub_query - ); - } -}; - namespace { /** * Constructs a `QueryInterpretation` from a vector of tokens. From 235fa74ed8c5a454a18c60e6ff0cea71235a766a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 18 Jan 2026 06:30:41 -0500 Subject: [PATCH 107/164] Add doc string. --- components/core/src/clp/SchemaSearcher.hpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 25fe54c163..055aef2f25 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -42,6 +42,25 @@ class SchemaSearcher { char const* what() const noexcept override { return "Too many encodable variables."; } }; + /** + * Performs a wildcard-based search on a log message using a query string, producing subqueries + * that match the schema. + * - Parses the search string into a query. + * - Generates all possible interpretations of the query based on the schema. + * - Normalizes the interpretations. + * - Produces a set of subqueries corresponding to valid combinations of logtype variables and + * dictionary variables. + * + * @tparam LogTypeDictionaryReaderType The type of object accessing the logtype dictionary. + * @tparam VariableDictionaryReaderType The type of object accessing the variable dictionary. + * @param search_string The input query string to search for in the log message. + * @param lexer The lexer containing the schema used to determine variable types and delimiters. + * @param logtype_dict A reference to the logtype dictionary. + * @param var_dict A reference to the variable dictionary. + * @param ignore_case If true, the search will be case-insensitive. 
+ * @return A vector of `SubQuery` objects representing all normalized interpretations of the + * query that are compatible with the logtype and variable dictionaries. + */ template < LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, VariableDictionaryReaderReq VariableDictionaryReaderType From 06704c733bb2fd64c3547fb40aeac9f7e6ba1496 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 18 Jan 2026 06:37:05 -0500 Subject: [PATCH 108/164] Move constexpr to method param. --- components/core/src/clp/SchemaSearcher.hpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 055aef2f25..1d96c532b8 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -132,8 +132,11 @@ class SchemaSearcher { * @param logtype_dict The logtype dictionary. * @param var_dict The variable dictionary. * @param ignore_case If true, perform a case-insensitive search. + * @param max_encodable_wildcard_variables The maximum number of encodable wildcard variables. + * This limits the allowable number of total candidate combinations. Defaults to 16. * @return The vector of subqueries to compare against CLP's archives. - * @throw clp::TraceableException If there are too many candidate combinations. + * @throw clp::TraceableException If there are more encodable wildcard variables than + * `max_encodable_wildcard_variables`. 
*/ template < LogTypeDictionaryReaderReq LogTypeDictionaryReaderType, @@ -144,7 +147,8 @@ class SchemaSearcher { interpretations, LogTypeDictionaryReaderType const& logtype_dict, VariableDictionaryReaderType const& var_dict, - bool ignore_case + bool ignore_case, + size_t max_encodable_wildcard_variables = 16 ) -> std::vector; /** @@ -214,14 +218,14 @@ auto SchemaSearcher::generate_schema_sub_queries( std::set const& interpretations, LogTypeDictionaryReaderType const& logtype_dict, VariableDictionaryReaderType const& var_dict, - bool const ignore_case + bool const ignore_case, + size_t const max_encodable_wildcard_variables ) -> std::vector { std::vector sub_queries; - constexpr size_t cMaxEncodableWildcardVariables{16}; for (auto const& interpretation : interpretations) { auto const logtype{interpretation.get_logtype()}; auto const wildcard_encodable_positions{get_wildcard_encodable_positions(interpretation)}; - if (wildcard_encodable_positions.size() > cMaxEncodableWildcardVariables) { + if (wildcard_encodable_positions.size() > max_encodable_wildcard_variables) { throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); } uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; From 82251ab86726e50c953f5877452f58973982a794 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 18 Jan 2026 07:43:31 -0500 Subject: [PATCH 109/164] Add class level docstring. --- components/core/src/clp/SchemaSearcher.hpp | 66 +++++++++++++--------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 1d96c532b8..7ef4d6f0b7 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -28,6 +28,43 @@ namespace clp { class SchemaSearcherTest; #endif +/** + * SchemaSearcher is responsible for generating schema-aware subqueries from wildcard query strings, + * given logtype and variable dictionaries. 
+ * + * Key concepts: + * + * 1. Encodable variables: + * - A variable token that contains a wildcard (e.g., *1) and is of an encodable type (integer or + * float). + * - Encodable variables introduce binary choices when generating subqueries as each can be + * treated as either a dictionary variable or an encoded variable. For example: + * Search query: "a *1 *2 b" + * One possible interpretation: "a (*1) (*2) b" + * Mask 00 -> "a \d \d b" + * Mask 01 -> "a \d \f b" + * Mask 10 -> "a \i \d b" + * Mask 11 -> "a \i \f b" + * - To limit combinatorial explosion, the number of encodable variables is constrained (default + * maximum = 16). + * + * 2. Mask encodings: + * - For k encodable wildcard variables, 2^k candidate logtype strings exist. + * - Each combination is represented with a bitmask, where each bit indicates whether the + * corresponding variable is encoded (1) or dictionary-based (0). + * + * 3. SubQuery generation: + * - A `SubQuery` is a container for a single possible interpretation of a query, with variables + * resolved to dictionary or encoded forms. + * - `SchemaSearcher` is responsible for creating `SubQuery` objects. + * + * Public interface: + * - `search(...)` is the main entry point: it takes a query string, generates all interpretations, + * normalizes them, and produces `SubQuery` objects. + * + * Internal helpers (private static methods) handle normalization, wildcard scanning, logtype string + * generation, and per-variable processing. + */ class SchemaSearcher { #ifdef CLP_BUILD_TESTING friend class SchemaSearcherTest; @@ -98,26 +135,7 @@ class SchemaSearcher { /** * Compare all log-surgeon interpretations against the dictionaries to determine the sub queries - * to search for within the archive. - * - * A. For each interpretation we must consider encodable wildcard variables (e.g. (*1)). 
- * Each such variable introduces a binary choice: - * - 0: treat as a dictionary variable (\d) - * - 1: treat as an encoded variable (\i for integers, \f for floats) - * - * If there are k encodable wildcard variables, then 2^k logtype strings are possible. As a - * result we limit k <= 16. We represent these alternatives using a bitmask. - * - * Example: - * Search query: "a *1 *2 b", - * Interpretation (one of many): "a (*1) (*2) b" - * Possible logtypes (for the above interpretation): - * mask 00 -> "a \d \d b" - * mask 01 -> "a \d \f b" - * mask 10 -> "a \i \d b" - * mask 11 -> "a \i \f b" - * - * B. Each candidate combination becomes a useful subquery only if: + * to search for within the archive. Each candidate combination becomes a useful subquery if: * 1. The logtype exists in the logtype dictionary, and * 2. Each variable is either: * a) resolvable in the variable dictionary (for dictionary vars), or @@ -154,10 +172,6 @@ class SchemaSearcher { /** * Scans the interpretation and returns the indices of all encodable wildcard variables. * - * An encodable variable is a variable token that: - * - Contains a wildcard (e.g. *1). - * - Is of an encodable type (integer or float). - * * @param interpretation The `QueryInterpretation` to scan. * @return A vector of positions of encodable wildcard variables. */ @@ -168,8 +182,6 @@ class SchemaSearcher { /** * Generates a logtype string from an interpretation, applying a mask to determine which * encodable wildcard positions are treated as encoded vs dictionary variables. - * - 0: Treat as dictionary variable. - * - 1: Treat as an encoded variable. * * @param interpretation The interpretation to convert to a logtype string. * @param wildcard_encodable_positions A vector of positions of encodable wildcard variables. @@ -197,7 +209,7 @@ class SchemaSearcher { * @param var_dict The variable dictionary. * @param ignore_case If true, perform a case-insensitive search. 
* @param is_mask_encoded If the token is an encodable wildcard and is to be encoded. - * @param sub_query Returns the updated sub query object. + * @param sub_query Returns the updated `SubQuery` object. * @return True if the variable is encoded or is in the variable dictionary, false otherwise. */ template From 66bd8926f6ae28faabee373c930e2a44276215c2 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 18 Jan 2026 07:45:29 -0500 Subject: [PATCH 110/164] Update docstring. --- components/core/tests/search_test_utils.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/components/core/tests/search_test_utils.hpp b/components/core/tests/search_test_utils.hpp index 72f56dd635..6ff70d03e5 100644 --- a/components/core/tests/search_test_utils.hpp +++ b/components/core/tests/search_test_utils.hpp @@ -74,8 +74,7 @@ auto generate_expected_logtype_string( * @param id Index of the sub-query to check in `sub_queries`. * @param sub_queries Vector of `SubQuery` objects. * @param wildcard_match_required Expected wildcard match requirement. - * @param vars_info Vector of tuples describing expected variable properties: (`is_dict_var`, - * `is_precise_var`, `var_dict_ids`). + * @param vars_info Object describing expected variable properties. * @param logtype_ids Expected set of possible logtype IDs. */ auto check_sub_query( From 81332076ffa5b8539afcf7559c3391c05cbfa919 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 18 Jan 2026 07:47:04 -0500 Subject: [PATCH 111/164] Remove unused header. 
--- components/core/tests/SchemaSearcherTest.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/components/core/tests/SchemaSearcherTest.hpp b/components/core/tests/SchemaSearcherTest.hpp index ae33b97e77..e020b34270 100644 --- a/components/core/tests/SchemaSearcherTest.hpp +++ b/components/core/tests/SchemaSearcherTest.hpp @@ -5,7 +5,6 @@ #include #include -#include #include #include From 73f1890f5424db5b6e20115f649b19383dbe52c9 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sun, 18 Jan 2026 07:50:29 -0500 Subject: [PATCH 112/164] Make test vector empty. --- components/core/tests/test-SchemaSearcher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/tests/test-SchemaSearcher.cpp b/components/core/tests/test-SchemaSearcher.cpp index fec525e0d2..47ac47c8f2 100644 --- a/components/core/tests/test-SchemaSearcher.cpp +++ b/components/core/tests/test-SchemaSearcher.cpp @@ -99,7 +99,7 @@ TEST_CASE("generate_logtype_string_for_empty_interpretation", "[dfa_search]") { auto const logtype_string{clp::SchemaSearcherTest::generate_logtype_string( interpretation, wildcard_encodable_positions, - {false} + {} )}; REQUIRE(logtype_string.empty()); } From d2f4e4474c6e4b42791e37db5f261b0bb4aa96ff Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 19 Jan 2026 11:06:47 -0500 Subject: [PATCH 113/164] Add header. --- components/core/tests/SchemaSearcherTest.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/components/core/tests/SchemaSearcherTest.hpp b/components/core/tests/SchemaSearcherTest.hpp index e020b34270..3e98f90821 100644 --- a/components/core/tests/SchemaSearcherTest.hpp +++ b/components/core/tests/SchemaSearcherTest.hpp @@ -3,6 +3,7 @@ #include #include +#include #include #include From 49aac57341179b670c9bfdecceefb5ffd08c61dd Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 19 Jan 2026 11:14:37 -0500 Subject: [PATCH 114/164] Update using in test header. 
--- components/core/tests/SchemaSearcherTest.hpp | 28 ++++++++------------ 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/components/core/tests/SchemaSearcherTest.hpp b/components/core/tests/SchemaSearcherTest.hpp index 3e98f90821..6d814b89c5 100644 --- a/components/core/tests/SchemaSearcherTest.hpp +++ b/components/core/tests/SchemaSearcherTest.hpp @@ -15,15 +15,6 @@ #include "search_test_utils.hpp" -using clp::LogTypeDictionaryReaderReq; -using clp::SubQuery; -using clp::VariableDictionaryReaderReq; -using log_surgeon::wildcard_query_parser::QueryInterpretation; -using log_surgeon::wildcard_query_parser::VariableQueryToken; -using std::set; -using std::string; -using std::vector; - /** * Helper to expose `SchemaSearcher` functionality for unit-testing. * @@ -37,9 +28,12 @@ using std::vector; * All methods forward directly to `SchemaSearcher` and are intended for testing only. */ class clp::SchemaSearcherTest { +using log_surgeon::wildcard_query_parser::QueryInterpretation; +using log_surgeon::wildcard_query_parser::VariableQueryToken; + public: - static auto normalize_interpretations(set const& interpretations) - -> set { + static auto normalize_interpretations(std::set const& interpretations) + -> std::set { return SchemaSearcher::normalize_interpretations(interpretations); } @@ -48,10 +42,10 @@ class clp::SchemaSearcherTest { VariableDictionaryReaderReq VariableDictionaryReaderType > static auto generate_schema_sub_queries( - set const& interpretations, + std::set const& interpretations, LogTypeDictionaryReaderType const& logtype_dict, VariableDictionaryReaderType const& var_dict - ) -> vector { + ) -> std::vector { return SchemaSearcher::generate_schema_sub_queries( interpretations, logtype_dict, @@ -61,15 +55,15 @@ class clp::SchemaSearcherTest { } static auto get_wildcard_encodable_positions(QueryInterpretation const& interpretation) - -> vector { + -> std::vector { return SchemaSearcher::get_wildcard_encodable_positions(interpretation); } 
static auto generate_logtype_string( QueryInterpretation const& interpretation, - vector const& wildcard_encodable_positions, - vector const& mask_encoded_flags - ) -> string { + std::vector const& wildcard_encodable_positions, + std::vector const& mask_encoded_flags + ) -> std::string { return SchemaSearcher::generate_logtype_string( interpretation, wildcard_encodable_positions, From b0bea5ffeac24a79a3a193b3f0197de7c0121304 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 19 Jan 2026 11:15:27 -0500 Subject: [PATCH 115/164] Add missing namespace. --- components/core/tests/test-SchemaSearcher.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/tests/test-SchemaSearcher.cpp b/components/core/tests/test-SchemaSearcher.cpp index 47ac47c8f2..01085f1612 100644 --- a/components/core/tests/test-SchemaSearcher.cpp +++ b/components/core/tests/test-SchemaSearcher.cpp @@ -53,7 +53,7 @@ auto make_query_interpretation(vector>> c -> QueryInterpretation { QueryInterpretation interp; for (auto const& token : tokens) { - if (holds_alternative(token)) { + if (std::holds_alternative(token)) { interp.append_static_token(get(token)); } else { auto const& [symbol, value]{get>(token)}; From ea477fbfbc48fc7473945c23edff6861fcdc8e23 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 19 Jan 2026 11:16:12 -0500 Subject: [PATCH 116/164] Add missing namespace again. 
--- components/core/tests/test-SchemaSearcher.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/tests/test-SchemaSearcher.cpp b/components/core/tests/test-SchemaSearcher.cpp index 01085f1612..13703e9d9c 100644 --- a/components/core/tests/test-SchemaSearcher.cpp +++ b/components/core/tests/test-SchemaSearcher.cpp @@ -54,9 +54,9 @@ auto make_query_interpretation(vector>> c QueryInterpretation interp; for (auto const& token : tokens) { if (std::holds_alternative(token)) { - interp.append_static_token(get(token)); + interp.append_static_token(std::get(token)); } else { - auto const& [symbol, value]{get>(token)}; + auto const& [symbol, value]{std::get>(token)}; auto const contains_wildcard{value.find_first_of("*?") != string::npos}; interp.append_variable_token(symbol, value, contains_wildcard); } From be6f6290613ae1af704385af130d7d4380bfe296 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 19 Jan 2026 11:35:18 -0500 Subject: [PATCH 117/164] Ensure mask shift doesn't exceed uint64_t size. 
--- components/core/src/clp/SchemaSearcher.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 7ef4d6f0b7..2b6f074e35 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -4,8 +4,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -237,7 +237,8 @@ auto SchemaSearcher::generate_schema_sub_queries( for (auto const& interpretation : interpretations) { auto const logtype{interpretation.get_logtype()}; auto const wildcard_encodable_positions{get_wildcard_encodable_positions(interpretation)}; - if (wildcard_encodable_positions.size() > max_encodable_wildcard_variables) { + if (wildcard_encodable_positions.size() > max_encodable_wildcard_variables + || wildcard_encodable_positions.size() >= std::numeric_limits::digits) { throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); } uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; From 9e0bf19ba8c87faad2fe21dc3c14247823e756ec Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 19 Jan 2026 12:23:42 -0500 Subject: [PATCH 118/164] Fix using. --- components/core/tests/SchemaSearcherTest.hpp | 30 ++++++++------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/components/core/tests/SchemaSearcherTest.hpp b/components/core/tests/SchemaSearcherTest.hpp index 6d814b89c5..8a4e311a05 100644 --- a/components/core/tests/SchemaSearcherTest.hpp +++ b/components/core/tests/SchemaSearcherTest.hpp @@ -28,13 +28,11 @@ * All methods forward directly to `SchemaSearcher` and are intended for testing only. 
*/ class clp::SchemaSearcherTest { -using log_surgeon::wildcard_query_parser::QueryInterpretation; -using log_surgeon::wildcard_query_parser::VariableQueryToken; - public: - static auto normalize_interpretations(std::set const& interpretations) - -> std::set { - return SchemaSearcher::normalize_interpretations(interpretations); + static auto normalize_interpretations( + std::set const& interps + ) -> std::set { + return SchemaSearcher::normalize_interpretations(interps); } template < @@ -42,25 +40,21 @@ using log_surgeon::wildcard_query_parser::VariableQueryToken; VariableDictionaryReaderReq VariableDictionaryReaderType > static auto generate_schema_sub_queries( - std::set const& interpretations, + std::set const& interps, LogTypeDictionaryReaderType const& logtype_dict, VariableDictionaryReaderType const& var_dict ) -> std::vector { - return SchemaSearcher::generate_schema_sub_queries( - interpretations, - logtype_dict, - var_dict, - false - ); + return SchemaSearcher::generate_schema_sub_queries(interps, logtype_dict, var_dict, false); } - static auto get_wildcard_encodable_positions(QueryInterpretation const& interpretation) - -> std::vector { + static auto get_wildcard_encodable_positions( + log_surgeon::wildcard_query_parser::QueryInterpretation const& interpretation + ) -> std::vector { return SchemaSearcher::get_wildcard_encodable_positions(interpretation); } static auto generate_logtype_string( - QueryInterpretation const& interpretation, + log_surgeon::wildcard_query_parser::QueryInterpretation const& interpretation, std::vector const& wildcard_encodable_positions, std::vector const& mask_encoded_flags ) -> std::string { @@ -73,7 +67,7 @@ using log_surgeon::wildcard_query_parser::VariableQueryToken; template static auto process_token( - VariableQueryToken const& var_token, + log_surgeon::wildcard_query_parser::VariableQueryToken const& var_token, VariableDictionaryReaderType const& var_dict, SubQuery& sub_query ) -> bool { @@ -88,7 +82,7 @@ using 
log_surgeon::wildcard_query_parser::VariableQueryToken; template static auto process_encoded_token( - VariableQueryToken const& var_token, + log_surgeon::wildcard_query_parser::VariableQueryToken const& var_token, VariableDictionaryReaderType const& var_dict, SubQuery& sub_query ) -> bool { From 2e517fbe1c4128ccfac5624b709fb324862c1427 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Mon, 19 Jan 2026 12:27:12 -0500 Subject: [PATCH 119/164] Format. --- components/core/src/clp/SchemaSearcher.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 2b6f074e35..1a9ce2b61d 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -238,7 +238,8 @@ auto SchemaSearcher::generate_schema_sub_queries( auto const logtype{interpretation.get_logtype()}; auto const wildcard_encodable_positions{get_wildcard_encodable_positions(interpretation)}; if (wildcard_encodable_positions.size() > max_encodable_wildcard_variables - || wildcard_encodable_positions.size() >= std::numeric_limits::digits) { + || wildcard_encodable_positions.size() >= std::numeric_limits::digits) + { throw OperationFailed(ErrorCode_Unsupported, __FILENAME__, __LINE__); } uint64_t const num_combos{1ULL << wildcard_encodable_positions.size()}; From aaa69e43ea9169bfa73005255af80dbd5b284322 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Feb 2026 03:52:45 -0500 Subject: [PATCH 120/164] Make it clear the c++ macro and cmake variable are seperate things by renaming c++ macro. 
--- components/core/CMakeLists.txt | 2 +- components/core/src/clp/SchemaSearcher.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 8cea1f79d4..2a77ca34c4 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -799,6 +799,6 @@ if(CLP_ENABLE_TESTS) PRIVATE cxx_std_20 ) target_compile_definitions(unitTest - PRIVATE CLP_BUILD_TESTING + PRIVATE CLP_ENABLE_TESTS_IN_CPP ) endif() diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 1a9ce2b61d..c4672d8560 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -24,7 +24,7 @@ #include namespace clp { -#ifdef CLP_BUILD_TESTING +#ifdef CLP_ENABLE_TESTS_IN_CPP class SchemaSearcherTest; #endif @@ -66,7 +66,7 @@ class SchemaSearcherTest; * generation, and per-variable processing. */ class SchemaSearcher { -#ifdef CLP_BUILD_TESTING +#ifdef CLP_ENABLE_TESTS_IN_CPP friend class SchemaSearcherTest; #endif From 8c3950ab5fa7dbf363e45ebc3d890c04a4ba2ec6 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 11 Feb 2026 05:30:10 -0500 Subject: [PATCH 121/164] Allow timestamped headers; Remove delimiter checking code in load_lexer. 
--- components/core/src/clp/Utils.cpp | 74 ++++++++--------------------- components/core/src/clp/clp/run.cpp | 20 ++++++-- 2 files changed, 36 insertions(+), 58 deletions(-) diff --git a/components/core/src/clp/Utils.cpp b/components/core/src/clp/Utils.cpp index 382215cf44..43bdee3429 100644 --- a/components/core/src/clp/Utils.cpp +++ b/components/core/src/clp/Utils.cpp @@ -19,13 +19,18 @@ #include "spdlog_with_specializations.hpp" +namespace clp { using std::list; +using log_surgeon::finite_automata::ByteNfaState; +using log_surgeon::finite_automata::RegexASTCat; +using log_surgeon::finite_automata::RegexASTLiteral; +using log_surgeon::finite_automata::RegexASTOr; +using log_surgeon::utf8::cCharStartOfFile; using std::make_unique; using std::string; using std::unique_ptr; using std::vector; -namespace clp { ErrorCode create_directory(string const& path, mode_t mode, bool exist_ok) { int retval = mkdir(path.c_str(), mode); if (0 != retval) { @@ -135,9 +140,8 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B lexer.m_symbol_id[log_surgeon::cTokenEnd] = static_cast(log_surgeon::SymbolId::TokenEnd); lexer.m_symbol_id[log_surgeon::cTokenUncaughtString] = static_cast(log_surgeon::SymbolId::TokenUncaughtString); - // cTokenInt, cTokenFloat, cTokenFirstTimestamp, and cTokenNewlineTimestamp each have unknown - // rule(s) until specified by the user so can't be explicitly added and are done by looping over - // schema_vars (user schema) + // cTokenInt, cTokenFloat, and cTokenHeader each have unknown rule(s) until specified by the + // user so can't be explicitly added and are done by looping over schema_vars (user schema) lexer.m_symbol_id[log_surgeon::cTokenInt] = static_cast(log_surgeon::SymbolId::TokenInt); lexer.m_symbol_id[log_surgeon::cTokenFloat] = static_cast(log_surgeon::SymbolId::TokenFloat); @@ -162,10 +166,9 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B lexer.add_rule( 
lexer.m_symbol_id["newLine"], std::move( - std::make_unique>(log_surgeon::finite_automata:: - RegexASTLiteral('\n')) + make_unique>( + RegexASTLiteral('\n') + ) ) ); @@ -185,7 +188,14 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B auto* rule = dynamic_cast(parser_ast.get()); // Capture groups are temporarily disabled, until NFA intersection supports for search. - auto const num_captures{rule->m_regex_ptr->get_subtree_positive_captures().size()}; + auto const& captures{rule->m_regex_ptr->get_subtree_positive_captures()}; + auto const num_captures{captures.size()}; + if ("header" == rule->m_name ) { + if ((1 == num_captures && "timestamp" == captures[0]->get_name()) || 0 == num_captures) + { + continue; + } + } if (0 < num_captures) { throw std::runtime_error( schema_file_path + ":" + std::to_string(rule->m_line_num + 1) @@ -195,55 +205,13 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B ); } - if ("timestamp" == rule->m_name) { - continue; - } + // transform '.' from any-character into any non-delimiter character + rule->m_regex_ptr->remove_delimiters_from_wildcard(delimiters); if (lexer.m_symbol_id.find(rule->m_name) == lexer.m_symbol_id.end()) { lexer.m_symbol_id[rule->m_name] = lexer.m_symbol_id.size(); lexer.m_id_symbol[lexer.m_symbol_id[rule->m_name]] = rule->m_name; } - - // transform '.' 
from any-character into any non-delimiter character - rule->m_regex_ptr->remove_delimiters_from_wildcard(delimiters); - - std::array is_possible_input{}; - rule->m_regex_ptr->set_possible_inputs_to_true(is_possible_input); - bool contains_delimiter = false; - uint32_t delimiter_name; - for (uint32_t delimiter : delimiters) { - if (is_possible_input[delimiter]) { - contains_delimiter = true; - delimiter_name = delimiter; - break; - } - } - - if (contains_delimiter) { - FileReader schema_reader{schema_ast->m_file_path}; - // more detailed debugging based on looking at the file - string line; - for (uint32_t i = 0; i <= rule->m_line_num; i++) { - schema_reader.read_to_delimiter('\n', false, false, line); - } - int colon_pos = 0; - for (char i : line) { - colon_pos++; - if (i == ':') { - break; - } - } - string indent(10, ' '); - string spaces(colon_pos, ' '); - string arrows(line.size() - colon_pos, '^'); - - throw std::runtime_error( - schema_file_path + ":" + std::to_string(rule->m_line_num + 1) + ": error: '" - + rule->m_name + "' has regex pattern which contains delimiter '" - + char(delimiter_name) + "'.\n" + indent + line + "\n" + indent + spaces - + arrows + "\n" - ); - } lexer.add_rule(lexer.m_symbol_id[rule->m_name], std::move(rule->m_regex_ptr)); } lexer.generate(); diff --git a/components/core/src/clp/clp/run.cpp b/components/core/src/clp/clp/run.cpp index 09e3475c94..18b061abdf 100644 --- a/components/core/src/clp/clp/run.cpp +++ b/components/core/src/clp/clp/run.cpp @@ -64,12 +64,22 @@ int run(int argc, char const* argv[]) { // Capture groups are temporarily disabled, until NFA intersection support for search. 
auto const& lexer{reader_parser->get_log_parser().m_lexer}; for (auto const& [rule_id, rule_name] : lexer.m_id_symbol) { - if (lexer.get_captures_from_rule_id(rule_id).has_value()) { - throw std::runtime_error( - schema_file_path + ": error: the schema rule '" + rule_name - + "' has a regex pattern containing capture groups.\n" - ); + auto optional_captures{lexer.get_captures_from_rule_id(rule_id)}; + if (false == optional_captures.has_value()) { + continue; } + + auto const& captures{optional_captures.value()}; + if ("header" == rule_name && 1 == captures.size() + && "timestamp" == captures[0]->get_name()) + { + continue; + } + + throw std::runtime_error( + schema_file_path + ": error: the schema rule '" + rule_name + + "' has a regex pattern containing capture groups.\n" + ); } } From c79077cb646465f0482b29a2f356c00e2575d738 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 11 Feb 2026 05:43:40 -0500 Subject: [PATCH 122/164] Allow for 0 capture header to be added to search lexer. --- components/core/src/clp/Utils.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/components/core/src/clp/Utils.cpp b/components/core/src/clp/Utils.cpp index 43bdee3429..38a106f168 100644 --- a/components/core/src/clp/Utils.cpp +++ b/components/core/src/clp/Utils.cpp @@ -190,11 +190,9 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B // Capture groups are temporarily disabled, until NFA intersection supports for search. 
auto const& captures{rule->m_regex_ptr->get_subtree_positive_captures()}; auto const num_captures{captures.size()}; - if ("header" == rule->m_name ) { - if ((1 == num_captures && "timestamp" == captures[0]->get_name()) || 0 == num_captures) - { - continue; - } + if ("header" == rule->m_name && 1 == num_captures && "timestamp" == captures[0]->get_name()) + { + continue; } if (0 < num_captures) { throw std::runtime_error( From 2039f72cda6b41765f9b95a2c8c2be17b08c06a5 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 11 Feb 2026 05:55:38 -0500 Subject: [PATCH 123/164] Remove unused headers and unused declarations. --- components/core/src/clp/Utils.cpp | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/components/core/src/clp/Utils.cpp b/components/core/src/clp/Utils.cpp index 38a106f168..2533525d19 100644 --- a/components/core/src/clp/Utils.cpp +++ b/components/core/src/clp/Utils.cpp @@ -1,31 +1,21 @@ #include "Utils.hpp" #include -#include #include #include -#include #include -#include #include -#include -#include #include #include #include -#include #include "spdlog_with_specializations.hpp" namespace clp { -using std::list; using log_surgeon::finite_automata::ByteNfaState; -using log_surgeon::finite_automata::RegexASTCat; using log_surgeon::finite_automata::RegexASTLiteral; -using log_surgeon::finite_automata::RegexASTOr; -using log_surgeon::utf8::cCharStartOfFile; using std::make_unique; using std::string; using std::unique_ptr; @@ -165,11 +155,9 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B lexer.add_rule( lexer.m_symbol_id["newLine"], - std::move( - make_unique>( - RegexASTLiteral('\n') - ) - ) + std::move(make_unique>( + RegexASTLiteral('\n') + )) ); for (auto const& delimiters_ast : schema_ast->m_delimiters) { From a8eb0c1de4eb0de08ec271e8cd43254e4148dde6 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 11 Feb 2026 05:59:05 -0500 Subject: [PATCH 124/164] Remove unused headers 
and unused declarations. --- components/core/src/clp/Utils.cpp | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/components/core/src/clp/Utils.cpp b/components/core/src/clp/Utils.cpp index 38a106f168..9b977c9e97 100644 --- a/components/core/src/clp/Utils.cpp +++ b/components/core/src/clp/Utils.cpp @@ -1,31 +1,21 @@ #include "Utils.hpp" #include -#include #include #include -#include #include -#include #include -#include -#include #include #include #include -#include #include "spdlog_with_specializations.hpp" namespace clp { -using std::list; using log_surgeon::finite_automata::ByteNfaState; -using log_surgeon::finite_automata::RegexASTCat; using log_surgeon::finite_automata::RegexASTLiteral; -using log_surgeon::finite_automata::RegexASTOr; -using log_surgeon::utf8::cCharStartOfFile; using std::make_unique; using std::string; using std::unique_ptr; @@ -165,11 +155,9 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B lexer.add_rule( lexer.m_symbol_id["newLine"], - std::move( - make_unique>( - RegexASTLiteral('\n') - ) - ) + std::move(make_unique>(RegexASTLiteral( + '\n' + ))) ); for (auto const& delimiters_ast : schema_ast->m_delimiters) { From b0b1d06fae41d8cca46fa07b30f68b565facfc52 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 11 Feb 2026 06:01:08 -0500 Subject: [PATCH 125/164] Remove unneeded move operation. 
--- components/core/src/clp/Utils.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/components/core/src/clp/Utils.cpp b/components/core/src/clp/Utils.cpp index 9b977c9e97..0316da70b3 100644 --- a/components/core/src/clp/Utils.cpp +++ b/components/core/src/clp/Utils.cpp @@ -155,9 +155,7 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B lexer.add_rule( lexer.m_symbol_id["newLine"], - std::move(make_unique>(RegexASTLiteral( - '\n' - ))) + make_unique>(RegexASTLiteral('\n')) ); for (auto const& delimiters_ast : schema_ast->m_delimiters) { From 065ebb9183ab80014a0c53f93fda3f944a4bf4e3 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 11 Feb 2026 06:02:59 -0500 Subject: [PATCH 126/164] Safety check for empty captures set. --- components/core/src/clp/clp/run.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/components/core/src/clp/clp/run.cpp b/components/core/src/clp/clp/run.cpp index 18b061abdf..e1e73bf11f 100644 --- a/components/core/src/clp/clp/run.cpp +++ b/components/core/src/clp/clp/run.cpp @@ -70,6 +70,10 @@ int run(int argc, char const* argv[]) { } auto const& captures{optional_captures.value()}; + if (captures.empty()) { + continue; + } + if ("header" == rule_name && 1 == captures.size() && "timestamp" == captures[0]->get_name()) { From 84fb5ee016e12ea89b8b4ba3259eaf151d8d9da3 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 11 Feb 2026 06:04:44 -0500 Subject: [PATCH 127/164] Avoid temporary object. 
--- components/core/src/clp/Utils.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/components/core/src/clp/Utils.cpp b/components/core/src/clp/Utils.cpp index 0316da70b3..d8a080b3db 100644 --- a/components/core/src/clp/Utils.cpp +++ b/components/core/src/clp/Utils.cpp @@ -153,10 +153,7 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B lexer.m_id_symbol[static_cast(log_surgeon::SymbolId::TokenNewline)] = log_surgeon::cTokenNewline; - lexer.add_rule( - lexer.m_symbol_id["newLine"], - make_unique>(RegexASTLiteral('\n')) - ); + lexer.add_rule(lexer.m_symbol_id["newLine"], make_unique>('\n')); for (auto const& delimiters_ast : schema_ast->m_delimiters) { auto* delimiters_ptr = dynamic_cast(delimiters_ast.get()); From a1bcdb55ead708abd5427a9da5bb9021b1cd74ba Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 11 Feb 2026 13:47:01 -0500 Subject: [PATCH 128/164] Add new unit-tests; Cleanup config schema.txt. --- components/core/config/schemas.txt | 10 +-- .../core/tests/test-ParserWithUserSchema.cpp | 90 +++++++++++++++++-- .../test_schema_files/header_with_int.txt | 3 + .../header_with_no_capture.txt | 3 + .../header_with_timestamp.txt | 3 + .../header_with_timestamp_and_int.txt | 3 + 6 files changed, 102 insertions(+), 10 deletions(-) create mode 100644 components/core/tests/test_schema_files/header_with_int.txt create mode 100644 components/core/tests/test_schema_files/header_with_no_capture.txt create mode 100644 components/core/tests/test_schema_files/header_with_timestamp.txt create mode 100644 components/core/tests/test_schema_files/header_with_timestamp_and_int.txt diff --git a/components/core/config/schemas.txt b/components/core/config/schemas.txt index 920e6bcbcb..a42c4b952f 100644 --- a/components/core/config/schemas.txt +++ b/components/core/config/schemas.txt @@ -1,17 +1,17 @@ // Delimiters delimiters: \t\r\n!"#$%&'()*,:;<>?@[]^_`{}|~ -// Timestamps (using the `timestamp` keyword) +// Headers 
(using the `timestamp` capture keyword) // E.g. 2015-01-31 15:50:45,392 // E.g. 2015-01-31 15:50:45.392 // E.g. 2015-01-31 15:50:45 -timestamp:\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}(\.\d{3}){0,1} +header:(?\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d{3}){0,1}) // E.g. [20150131-15:50:45] -timestamp:\[\d{8}\-\d{2}:\d{2}:\d{2}\] +header:(?\[\d{8}-\d{2}:\d{2}:\d{2}\]) // Specially-encoded variables (using the `int` and `float` keywords) -int:\-{0,1}[0-9]+ -float:\-{0,1}[0-9]+\.[0-9]+ +int:-?\d+ +float:-?\d+\.\d+ // Dictionary variables hex:[a-fA-F]+ diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp index a74de6d8a3..43e4d1acd5 100644 --- a/components/core/tests/test-ParserWithUserSchema.cpp +++ b/components/core/tests/test-ParserWithUserSchema.cpp @@ -20,9 +20,7 @@ #include #include #include -#include #include -#include #include #include "TestOutputCleaner.hpp" @@ -85,6 +83,7 @@ auto run_clp_compress( input_path_str.data(), nullptr }; + spdlog::drop_all(); return clp::clp::run(static_cast(argv.size() - 1), argv.data()); } } // namespace @@ -191,7 +190,7 @@ TEST_CASE("Test lexer", "[Search]") { } } -TEST_CASE("Error on schema rule with a single capture group", "[load_lexer]") { +TEST_CASE("Error on schema rule with a single non-header capture group", "[load_lexer]") { auto const schema_file_path{get_test_schema_files_dir() / "single_capture_group.txt"}; ByteLexer lexer; REQUIRE_THROWS_WITH( @@ -202,7 +201,7 @@ TEST_CASE("Error on schema rule with a single capture group", "[load_lexer]") { ); } -TEST_CASE("Error on schema rule with multiple capture groups", "[load_lexer]") { +TEST_CASE("Error on schema rule with multiple non-header capture groups", "[load_lexer]") { auto const schema_file_path{get_test_schema_files_dir() / "multiple_capture_groups.txt"}; ByteLexer lexer; REQUIRE_THROWS_WITH( @@ -213,7 +212,7 @@ TEST_CASE("Error on schema rule with multiple capture groups", "[load_lexer]") { ); } 
-TEST_CASE("Verify CLP compression fails with capture groups", "[Compression]") { +TEST_CASE("Verify CLP compression fails with non-header capture groups", "[Compression]") { auto const log_file_path{get_test_log_dir() / "log_with_capture.txt"}; auto const schema_file_path{get_test_schema_files_dir() / "single_capture_group.txt"}; TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}}; @@ -226,3 +225,84 @@ TEST_CASE("Verify CLP compression fails with capture groups", "[Compression]") { "groups.\n" ); } + +TEST_CASE("Succeed on header rule with no capture", "[load_lexer]") { + auto const schema_file_path{get_test_schema_files_dir() / "header_with_no_capture.txt"}; + ByteLexer lexer; + REQUIRE_NOTHROW(load_lexer_from_file(schema_file_path, lexer)); +} + +TEST_CASE("Succeed on header rule with a single timestamp capture", "[load_lexer]") { + auto const schema_file_path{get_test_schema_files_dir() / "header_with_timestamp.txt"}; + ByteLexer lexer; + REQUIRE_NOTHROW(load_lexer_from_file(schema_file_path, lexer)); +} + +TEST_CASE("Error on header rule with a single non-timestamp capture", "[load_lexer]") { + auto const schema_file_path{get_test_schema_files_dir() / "header_with_int.txt"}; + ByteLexer lexer; + REQUIRE_THROWS_WITH( + load_lexer_from_file(schema_file_path, lexer), + schema_file_path.string() + + ":3: error: the schema rule 'header' has a regex pattern containing capture " + "groups (found 1).\n" + ); +} + + +TEST_CASE("Error on header rule with a timestamp and non-timestamp capture", "[load_lexer]") { + auto const schema_file_path{get_test_schema_files_dir() / "header_with_timestamp_and_int.txt"}; + ByteLexer lexer; + REQUIRE_THROWS_WITH( + load_lexer_from_file(schema_file_path, lexer), + schema_file_path.string() + + ":3: error: the schema rule 'header' has a regex pattern containing capture " + "groups (found 2).\n" + ); +} + +TEST_CASE("Verify CLP compression succeeds with non-capture header", "[Compression]") { + auto const 
log_file_path{get_test_log_dir() / "log_with_capture.txt"}; + auto const schema_file_path{get_test_schema_files_dir() / "header_with_no_capture.txt"}; + TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}}; + std::filesystem::create_directory(cTestArchiveDirectory); + + REQUIRE(0 == run_clp_compress(schema_file_path, cTestArchiveDirectory, log_file_path)); +} + +TEST_CASE("Verify CLP compression succeeds with timestamp capture header", "[Compression]") { + auto const log_file_path{get_test_log_dir() / "log_with_capture.txt"}; + auto const schema_file_path{get_test_schema_files_dir() / "header_with_timestamp.txt"}; + TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}}; + std::filesystem::create_directory(cTestArchiveDirectory); + + REQUIRE(0 == run_clp_compress(schema_file_path, cTestArchiveDirectory, log_file_path)); +} + +TEST_CASE("Verify CLP compression fails with non-timestamp capture header", "[Compression]") { + auto const log_file_path{get_test_log_dir() / "log_with_capture.txt"}; + auto const schema_file_path{get_test_schema_files_dir() / "header_with_int.txt"}; + TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}}; + std::filesystem::create_directory(cTestArchiveDirectory); + + REQUIRE_THROWS_WITH( + run_clp_compress(schema_file_path, cTestArchiveDirectory, log_file_path), + schema_file_path.string() + + ": error: the schema rule 'header' has a regex pattern containing capture " + "groups.\n" + ); +} + +TEST_CASE("Verify CLP compression fails with multi-capture header", "[Compression]") { + auto const log_file_path{get_test_log_dir() / "log_with_capture.txt"}; + auto const schema_file_path{get_test_schema_files_dir() / "header_with_timestamp_and_int.txt"}; + TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}}; + std::filesystem::create_directory(cTestArchiveDirectory); + + REQUIRE_THROWS_WITH( + run_clp_compress(schema_file_path, cTestArchiveDirectory, log_file_path), + 
schema_file_path.string() + + ": error: the schema rule 'header' has a regex pattern containing capture " + "groups.\n" + ); +} diff --git a/components/core/tests/test_schema_files/header_with_int.txt b/components/core/tests/test_schema_files/header_with_int.txt new file mode 100644 index 0000000000..5a0214346b --- /dev/null +++ b/components/core/tests/test_schema_files/header_with_int.txt @@ -0,0 +1,3 @@ +delimiters: \r\n + +header:(?\d+) diff --git a/components/core/tests/test_schema_files/header_with_no_capture.txt b/components/core/tests/test_schema_files/header_with_no_capture.txt new file mode 100644 index 0000000000..56c4c3b149 --- /dev/null +++ b/components/core/tests/test_schema_files/header_with_no_capture.txt @@ -0,0 +1,3 @@ +delimiters: \r\n + +header:\d{2}-\d{2}-\d{4}:\d{2}:\d{2}:\d{2}\.\d{3} diff --git a/components/core/tests/test_schema_files/header_with_timestamp.txt b/components/core/tests/test_schema_files/header_with_timestamp.txt new file mode 100644 index 0000000000..1669fd022a --- /dev/null +++ b/components/core/tests/test_schema_files/header_with_timestamp.txt @@ -0,0 +1,3 @@ +delimiters: \r\n + +header:(?\d{2}-\d{2}-\d{4}:\d{2}:\d{2}:\d{2}\.\d{3}) diff --git a/components/core/tests/test_schema_files/header_with_timestamp_and_int.txt b/components/core/tests/test_schema_files/header_with_timestamp_and_int.txt new file mode 100644 index 0000000000..62dcc5b074 --- /dev/null +++ b/components/core/tests/test_schema_files/header_with_timestamp_and_int.txt @@ -0,0 +1,3 @@ +delimiters: \r\n + +header:(?\d{2}-\d{2}-\d{4}:\d{2}:\d{2}:\d{2}\.\d{3}) and (?\d+) From 27d6d127077d0d68cacf034750e769eeb7dd1ea8 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 18 Feb 2026 08:02:59 -0500 Subject: [PATCH 129/164] Revert change to schema. 
--- components/core/config/schemas.txt | 2 +- components/core/tests/test-ParserWithUserSchema.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/components/core/config/schemas.txt b/components/core/config/schemas.txt index a42c4b952f..457a3c2473 100644 --- a/components/core/config/schemas.txt +++ b/components/core/config/schemas.txt @@ -16,4 +16,4 @@ float:-?\d+\.\d+ // Dictionary variables hex:[a-fA-F]+ hasNumber:.*\d.* -equals:[a-zA-Z0-9]+=(?.*[a-zA-Z0-9].*) +equals:.*=.*[a-zA-Z0-9].* diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp index 43e4d1acd5..167061d6ba 100644 --- a/components/core/tests/test-ParserWithUserSchema.cpp +++ b/components/core/tests/test-ParserWithUserSchema.cpp @@ -249,7 +249,6 @@ TEST_CASE("Error on header rule with a single non-timestamp capture", "[load_lex ); } - TEST_CASE("Error on header rule with a timestamp and non-timestamp capture", "[load_lexer]") { auto const schema_file_path{get_test_schema_files_dir() / "header_with_timestamp_and_int.txt"}; ByteLexer lexer; From 176e828e661f6037e3211a72c81a8ed5e41af19c Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 18 Feb 2026 08:09:50 -0500 Subject: [PATCH 130/164] Add drop_all comment. --- components/core/tests/test-ParserWithUserSchema.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp index 167061d6ba..2ae28525c4 100644 --- a/components/core/tests/test-ParserWithUserSchema.cpp +++ b/components/core/tests/test-ParserWithUserSchema.cpp @@ -83,6 +83,9 @@ auto run_clp_compress( input_path_str.data(), nullptr }; + // `clp::clp::run` registers a logger for `spdlog` that persists across runs. `spdlog` will + // error if a logger with the same name already exists. `spdlog::drop_all` clears all loggers, + // ensuring `clp::clp::run` can safely create a fresh logger for each new call. 
spdlog::drop_all(); return clp::clp::run(static_cast(argv.size() - 1), argv.data()); } From 8d4a3a2897011150a9c4721c3a52d0dd0828a2c7 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 19 Feb 2026 11:49:17 -0500 Subject: [PATCH 131/164] Fix decompression bug. --- components/core/src/clp/SchemaSearcher.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index c4672d8560..405ac0b2df 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -318,6 +318,8 @@ auto SchemaSearcher::process_schema_var_token( bool const is_mask_encoded, SubQuery& sub_query ) -> bool { + sub_query.mark_wildcard_match_required(); + auto const& raw_string{variable_token.get_query_substring()}; auto const var_has_wildcard{variable_token.get_contains_wildcard()}; auto const var_type{static_cast(variable_token.get_variable_type())}; @@ -325,7 +327,6 @@ auto SchemaSearcher::process_schema_var_token( bool const is_float{log_surgeon::SymbolId::TokenFloat == var_type}; if (is_mask_encoded) { - sub_query.mark_wildcard_match_required(); return true; } From 4ecd3990f834cba9068d9452e43a2920682c4fa2 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 19 Feb 2026 11:51:20 -0500 Subject: [PATCH 132/164] Rename macro. 
--- components/core/CMakeLists.txt | 2 +- components/core/src/clp/SchemaSearcher.hpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 2a77ca34c4..6b6c68d664 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -799,6 +799,6 @@ if(CLP_ENABLE_TESTS) PRIVATE cxx_std_20 ) target_compile_definitions(unitTest - PRIVATE CLP_ENABLE_TESTS_IN_CPP + PRIVATE CLP_ENABLE_TESTS ) endif() diff --git a/components/core/src/clp/SchemaSearcher.hpp b/components/core/src/clp/SchemaSearcher.hpp index 405ac0b2df..f0624007d4 100644 --- a/components/core/src/clp/SchemaSearcher.hpp +++ b/components/core/src/clp/SchemaSearcher.hpp @@ -24,7 +24,7 @@ #include namespace clp { -#ifdef CLP_ENABLE_TESTS_IN_CPP +#ifdef CLP_ENABLE_TESTS class SchemaSearcherTest; #endif @@ -66,7 +66,7 @@ class SchemaSearcherTest; * generation, and per-variable processing. */ class SchemaSearcher { -#ifdef CLP_ENABLE_TESTS_IN_CPP +#ifdef CLP_ENABLE_TESTS friend class SchemaSearcherTest; #endif From 861bc370aba3cb879a51302cd87bc58ff4c50056 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 19 Feb 2026 13:28:28 -0500 Subject: [PATCH 133/164] Fix unit-tests based on previous change. 
--- components/core/tests/test-GrepCore.cpp | 2 +- components/core/tests/test-SchemaSearcher.cpp | 28 +++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/components/core/tests/test-GrepCore.cpp b/components/core/tests/test-GrepCore.cpp index 32e376aaba..f567d5225c 100644 --- a/components/core/tests/test-GrepCore.cpp +++ b/components/core/tests/test-GrepCore.cpp @@ -206,6 +206,6 @@ TEST_CASE("process_raw_query", "[dfa_search]") { size_t i{0}; check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); - check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); + check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {2LL, 3LL}); check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); } diff --git a/components/core/tests/test-SchemaSearcher.cpp b/components/core/tests/test-SchemaSearcher.cpp index 13703e9d9c..093894a60b 100644 --- a/components/core/tests/test-SchemaSearcher.cpp +++ b/components/core/tests/test-SchemaSearcher.cpp @@ -168,7 +168,7 @@ TEST_CASE("process_schema_empty_token", "[dfa_search]") { SubQuery sub_query; VariableQueryToken const empty_int_token{cIntId, "", false}; REQUIRE(false == clp::SchemaSearcherTest::process_token(empty_int_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(sub_query.wildcard_match_required()); REQUIRE(0 == sub_query.get_num_possible_vars()); } @@ -178,7 +178,7 @@ TEST_CASE("process_schema_unmatched_token", "[dfa_search]") { SubQuery sub_query; VariableQueryToken const int_token{cIntId, "200", false}; REQUIRE(clp::SchemaSearcherTest::process_token(int_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); auto const& var{sub_query.get_vars()[0]}; REQUIRE(false == var.is_dict_var()); @@ -192,7 +192,7 @@ 
TEST_CASE("process_schema_int_token", "[dfa_search]") { SubQuery sub_query; VariableQueryToken const int_token{cIntId, "100", false}; REQUIRE(clp::SchemaSearcherTest::process_token(int_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); auto const& var{sub_query.get_vars()[0]}; REQUIRE(false == var.is_dict_var()); @@ -223,7 +223,7 @@ TEST_CASE("process_schema_encoded_non_greedy_wildcard_token", "[dfa_search]") { SubQuery sub_query; VariableQueryToken const has_number_token{cHasNumId, "10a?", true}; REQUIRE(clp::SchemaSearcherTest::process_token(has_number_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); auto const& var{sub_query.get_vars()[0]}; REQUIRE(var.is_dict_var()); @@ -236,7 +236,7 @@ TEST_CASE("process_schema_encoded_non_greedy_wildcard_token", "[dfa_search]") { SubQuery sub_query; VariableQueryToken const has_number_token{cHasNumId, "10?0", true}; REQUIRE(clp::SchemaSearcherTest::process_token(has_number_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); auto const& var{sub_query.get_vars()[0]}; REQUIRE(var.is_dict_var()); @@ -264,7 +264,7 @@ TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token", "[dfa_search]" SubQuery sub_query; VariableQueryToken const int_token{cIntId, "1000000000000000000000000?0", true}; REQUIRE(clp::SchemaSearcherTest::process_token(int_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); auto const& var{sub_query.get_vars()[0]}; REQUIRE(var.is_dict_var()); @@ -279,7 +279,7 @@ 
TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token", "[dfa_search]" SubQuery sub_query; VariableQueryToken const float_token{cFloatId, "1000000000000000000000000?0", true}; REQUIRE(clp::SchemaSearcherTest::process_token(float_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); auto const& var{sub_query.get_vars()[0]}; REQUIRE(var.is_dict_var()); @@ -294,7 +294,7 @@ TEST_CASE("process_schema_non_encoded_non_greedy_wildcard_token", "[dfa_search]" SubQuery sub_query; VariableQueryToken const has_number_token{cHasNumId, "1000000000000000000000000?0", true}; REQUIRE(clp::SchemaSearcherTest::process_token(has_number_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); auto const& var{sub_query.get_vars()[0]}; REQUIRE(var.is_dict_var()); @@ -322,7 +322,7 @@ TEST_CASE("process_schema_greedy_wildcard_token", "[dfa_search]") { SubQuery sub_query; VariableQueryToken const int_token{cIntId, "10*0", true}; REQUIRE(clp::SchemaSearcherTest::process_token(int_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); auto const& var{sub_query.get_vars()[0]}; REQUIRE(var.is_dict_var()); @@ -337,7 +337,7 @@ TEST_CASE("process_schema_greedy_wildcard_token", "[dfa_search]") { SubQuery sub_query; VariableQueryToken const float_token{cFloatId, "10*0", true}; REQUIRE(clp::SchemaSearcherTest::process_token(float_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); auto const& var{sub_query.get_vars()[0]}; REQUIRE(var.is_dict_var()); @@ -352,7 +352,7 @@ 
TEST_CASE("process_schema_greedy_wildcard_token", "[dfa_search]") { SubQuery sub_query; VariableQueryToken const has_number_token{cHasNumId, "10*0", true}; REQUIRE(clp::SchemaSearcherTest::process_token(has_number_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); auto const& var{sub_query.get_vars()[0]}; REQUIRE(var.is_dict_var()); @@ -367,7 +367,7 @@ TEST_CASE("process_schema_greedy_wildcard_token", "[dfa_search]") { SubQuery sub_query; VariableQueryToken const has_number_token{cHasNumId, "10b*", true}; REQUIRE(clp::SchemaSearcherTest::process_token(has_number_token, var_dict, sub_query)); - REQUIRE(false == sub_query.wildcard_match_required()); + REQUIRE(sub_query.wildcard_match_required()); REQUIRE(1 == sub_query.get_num_possible_vars()); auto const& var{sub_query.get_vars()[0]}; REQUIRE(var.is_dict_var()); @@ -435,7 +435,7 @@ TEST_CASE("generate_schema_sub_queries", "[dfa_search]") { size_t i{0}; check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); - check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); + check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {2LL, 3LL}); check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); } @@ -482,6 +482,6 @@ TEST_CASE("generate_schema_sub_queries_with_wildcard_duplication", "[dfa_search] size_t i{0}; check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {1LL}); check_sub_query(i++, sub_queries, true, {wild_int}, {0LL}); - check_sub_query(i++, sub_queries, false, {wild_int, wild_has_num}, {2LL, 3LL}); + check_sub_query(i++, sub_queries, true, {wild_int, wild_has_num}, {2LL, 3LL}); check_sub_query(i++, sub_queries, true, {wild_int}, {5LL}); } From 49e3b4d43d19c9b0720012d8e651bbd095f25638 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 4 Mar 2026 
00:30:32 -0500 Subject: [PATCH 134/164] Add scoped profiler. --- components/core/CMakeLists.txt | 1 + components/core/src/clp/ScopedProfiler.hpp | 45 ++++++++++++++++++++++ components/core/src/clp/clp/CMakeLists.txt | 1 + 3 files changed, 47 insertions(+) create mode 100644 components/core/src/clp/ScopedProfiler.hpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 6b6c68d664..2a11f2a819 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -631,6 +631,7 @@ set(SOURCE_FILES_unitTest src/clp/ReaderInterface.hpp src/clp/ReadOnlyMemoryMappedFile.cpp src/clp/ReadOnlyMemoryMappedFile.hpp + src/clp/ScopedProfiler.hpp src/clp/spdlog_with_specializations.hpp src/clp/SQLiteDB.cpp src/clp/SQLiteDB.hpp diff --git a/components/core/src/clp/ScopedProfiler.hpp b/components/core/src/clp/ScopedProfiler.hpp new file mode 100644 index 0000000000..2420ad5eb0 --- /dev/null +++ b/components/core/src/clp/ScopedProfiler.hpp @@ -0,0 +1,45 @@ +#ifndef CLP_SCOPED_PROFILER_HPP +#define CLP_SCOPED_PROFILER_HPP + +#include "Profiler.hpp" + +namespace clp { +/** + * RAII wrapper to measure the execution time of a code scope. + * + * This class starts a continuous measurement in its constructor and stops it in its destructor. It + * reports the measured time to the corresponding slot in the profiler. Use this class when you want + * to measure a single logical phase of your program (e.g., a method) without calling start/stop. + * + * Usage for a logical phase: + * - Define a unique measurement `index` in `Profiler::ContinuousMeasurementIndex`. Each `index` + * corresponds to a slot in the profiler that accumulates total time. + * - Use macro PROFILE_SCOPE(`index`) at the top of the logical phase, ideally this is always done + * at the top of a method for organization and clarity. + * - Set `DPROF_ENABLED=1` in `cmakelists`. + * + * Notes: + * - Safe with early returns and exceptions because stopping occurs in the destructor. 
+ * - All measurements respect `PROF_ENABLED`, so no code is generated when profiling is disabled. + */ +template +class ScopedProfiler { +public: + inline ScopedProfiler() { Profiler::start_continuous_measurement(); } + + inline ~ScopedProfiler() { Profiler::stop_continuous_measurement(); } + + ScopedProfiler(const ScopedProfiler&) = delete; + ScopedProfiler& operator=(const ScopedProfiler&) = delete; + ScopedProfiler(ScopedProfiler&&) = delete; + ScopedProfiler& operator=(ScopedProfiler&&) = delete; +}; +} // namespace clp + +#define CLP_CONCAT_IMPL(x, y) x##y + +#define CLP_CONCAT(x, y) CLP_CONCAT_IMPL(x, y) + +#define PROFILE_SCOPE(x) ::clp::ScopedProfiler CLP_CONCAT(__clp_profile_scope_, __LINE__) + +#endif // CLP_SCOPED_PROFILER_HPP diff --git a/components/core/src/clp/clp/CMakeLists.txt b/components/core/src/clp/clp/CMakeLists.txt index 75e6bb4167..4559edd835 100644 --- a/components/core/src/clp/clp/CMakeLists.txt +++ b/components/core/src/clp/clp/CMakeLists.txt @@ -93,6 +93,7 @@ set( ../ReaderInterface.hpp ../ReadOnlyMemoryMappedFile.cpp ../ReadOnlyMemoryMappedFile.hpp + ../ScopedProfiler.hpp ../spdlog_with_specializations.hpp ../SQLiteDB.cpp ../SQLiteDB.hpp From 810bd8f1c6f6fbbdc2cd1b6ad585fb21bb65f206 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Mar 2026 14:16:44 -0500 Subject: [PATCH 135/164] Switch to fragmented measurements for scope profiler; Add unit-tests for scoped measurement; Update stopwatch test to be more modern; Add profiler reporter; Add macro for prof_test_enabled --- components/core/CMakeLists.txt | 7 +- components/core/src/clp/Profiler.hpp | 41 +++++++---- components/core/src/clp/ProfilerReporter.hpp | 31 ++++++++ components/core/src/clp/ScopedProfiler.hpp | 10 +-- components/core/src/clp/clp/CMakeLists.txt | 1 + .../core/tests/test-ProfilerReporter.cpp | 9 +++ components/core/tests/test-ScopedProfiler.cpp | 71 +++++++++++++++++++ components/core/tests/test-Stopwatch.cpp | 21 +++--- 8 files changed, 163 insertions(+), 28
deletions(-) create mode 100644 components/core/src/clp/ProfilerReporter.hpp create mode 100644 components/core/tests/test-ProfilerReporter.cpp create mode 100644 components/core/tests/test-ScopedProfiler.cpp diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 2a11f2a819..e20c82141a 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -623,6 +623,7 @@ set(SOURCE_FILES_unitTest src/clp/Platform.hpp src/clp/Profiler.cpp src/clp/Profiler.hpp + src/clp/ProfilerReporter.hpp src/clp/Query.cpp src/clp/Query.hpp src/clp/QueryToken.cpp @@ -737,9 +738,11 @@ set(SOURCE_FILES_unitTest tests/test-MemoryMappedFile.cpp tests/test-NetworkReader.cpp tests/test-ParserWithUserSchema.cpp + tests/test-ProfilerReporter.cpp tests/test-query_methods.cpp tests/test-regex_utils.cpp tests/test-SchemaSearcher.cpp + tests/test-ScopedProfiler.cpp tests/test-Segment.cpp tests/test-SQLiteDB.cpp tests/test-Stopwatch.cpp @@ -800,6 +803,8 @@ if(CLP_ENABLE_TESTS) PRIVATE cxx_std_20 ) target_compile_definitions(unitTest - PRIVATE CLP_ENABLE_TESTS + PRIVATE + CLP_ENABLE_TESTS + PROF_TEST_ENABLED=1 ) endif() diff --git a/components/core/src/clp/Profiler.hpp b/components/core/src/clp/Profiler.hpp index 68500a219b..9d307dbb3d 100644 --- a/components/core/src/clp/Profiler.hpp +++ b/components/core/src/clp/Profiler.hpp @@ -1,6 +1,12 @@ #ifndef CLP_PROFILER_HPP #define CLP_PROFILER_HPP +#if defined(PROF_ENABLED) || defined(PROF_TEST_ENABLED) + #define PROF_ACTIVE 1 +#else + #define PROF_ACTIVE 0 +#endif + #include #include @@ -28,7 +34,7 @@ namespace clp { * * Two implementation details allow this class to avoid inducing overhead when profiling is * disabled: - * - All methods bodies are defined in the header, guarded by `if constexpr (PROF_ENABLED)`. When + * - All methods bodies are defined in the header, guarded by `if constexpr (PROF_ACTIVE)`. 
When * profiling is disabled, the compiler will detect the empty body and won't add any code to the * binary; if the methods were instead defined in the .cpp file, the compiler would still generate * an empty method. @@ -46,6 +52,7 @@ class Profiler { Length }; enum class FragmentedMeasurementIndex : size_t { + Search = 0, Length }; @@ -60,6 +67,7 @@ class Profiler { }(); static constexpr auto cFragmentedMeasurementEnabled = []() { std::array enabled{}; + enabled[enum_to_underlying_type(FragmentedMeasurementIndex::Search)] = true; return enabled; }(); @@ -68,7 +76,7 @@ class Profiler { * Static initializer for class. This must be called before using the class. */ static void init() { - if constexpr (PROF_ENABLED) { + if constexpr (PROF_ACTIVE) { m_continuous_measurements = new std::vector( enum_to_underlying_type(ContinuousMeasurementIndex::Length) ); @@ -80,7 +88,7 @@ class Profiler { template static void start_continuous_measurement() { - if constexpr (PROF_ENABLED && cContinuousMeasurementEnabled[enum_to_underlying_type(index)]) + if constexpr (PROF_ACTIVE && cContinuousMeasurementEnabled[enum_to_underlying_type(index)]) { auto& stopwatch = (*m_continuous_measurements)[enum_to_underlying_type(index)]; stopwatch.reset(); @@ -90,7 +98,7 @@ class Profiler { template static void stop_continuous_measurement() { - if constexpr (PROF_ENABLED && cContinuousMeasurementEnabled[enum_to_underlying_type(index)]) + if constexpr (PROF_ACTIVE && cContinuousMeasurementEnabled[enum_to_underlying_type(index)]) { (*m_continuous_measurements)[enum_to_underlying_type(index)].stop(); } @@ -98,7 +106,7 @@ class Profiler { template static double get_continuous_measurement_in_seconds() { - if constexpr (PROF_ENABLED) { + if constexpr (PROF_ACTIVE) { return (*m_continuous_measurements)[enum_to_underlying_type(index)] .get_time_taken_in_seconds(); } else { @@ -108,7 +116,7 @@ class Profiler { template static void start_fragmented_measurement() { - if constexpr (PROF_ENABLED && 
cFragmentedMeasurementEnabled[enum_to_underlying_type(index)]) + if constexpr (PROF_ACTIVE && cFragmentedMeasurementEnabled[enum_to_underlying_type(index)]) { (*m_fragmented_measurements)[enum_to_underlying_type(index)].start(); } @@ -116,7 +124,7 @@ class Profiler { template static void stop_fragmented_measurement() { - if constexpr (PROF_ENABLED && cFragmentedMeasurementEnabled[enum_to_underlying_type(index)]) + if constexpr (PROF_ACTIVE && cFragmentedMeasurementEnabled[enum_to_underlying_type(index)]) { (*m_fragmented_measurements)[enum_to_underlying_type(index)].stop(); } @@ -124,7 +132,7 @@ class Profiler { template static void reset_fragmented_measurement() { - if constexpr (PROF_ENABLED && cFragmentedMeasurementEnabled[enum_to_underlying_type(index)]) + if constexpr (PROF_ACTIVE && cFragmentedMeasurementEnabled[enum_to_underlying_type(index)]) { (*m_fragmented_measurements)[enum_to_underlying_type(index)].reset(); } @@ -132,7 +140,16 @@ class Profiler { template static double get_fragmented_measurement_in_seconds() { - if constexpr (PROF_ENABLED) { + if constexpr (PROF_ACTIVE) { + return (*m_fragmented_measurements)[enum_to_underlying_type(index)] + .get_time_taken_in_seconds(); + } else { + return 0; + } + } + + static double get_fragmented_measurement_in_seconds_runtime(FragmentedMeasurementIndex index) { + if constexpr (PROF_ACTIVE) { return (*m_fragmented_measurements)[enum_to_underlying_type(index)] .get_time_taken_in_seconds(); } else { @@ -150,7 +167,7 @@ class Profiler { // NOTE: We use macros so that we can add the measurement index to the log (not easy to do with // templates). 
#define LOG_CONTINUOUS_MEASUREMENT(x) \ - if (PROF_ENABLED \ + if (PROF_ACTIVE \ && ::clp::Profiler::cContinuousMeasurementEnabled[enum_to_underlying_type(x)]) \ { \ SPDLOG_INFO( \ @@ -160,7 +177,7 @@ class Profiler { ); \ } #define LOG_FRAGMENTED_MEASUREMENT(x) \ - if (PROF_ENABLED \ + if (PROF_ACTIVE \ && ::clp::Profiler::cFragmentedMeasurementEnabled[enum_to_underlying_type(x)]) \ { \ SPDLOG_INFO( \ @@ -170,7 +187,7 @@ class Profiler { ); \ } #define PROFILER_SPDLOG_INFO(...) \ - if (PROF_ENABLED) { \ + if (PROF_ACTIVE) { \ SPDLOG_INFO(__VA_ARGS__); \ } diff --git a/components/core/src/clp/ProfilerReporter.hpp b/components/core/src/clp/ProfilerReporter.hpp new file mode 100644 index 0000000000..019335a917 --- /dev/null +++ b/components/core/src/clp/ProfilerReporter.hpp @@ -0,0 +1,31 @@ +#ifndef CLP_PROFILER_REPORT_HPP +#define CLP_PROFILER_REPORT_HPP + +#include "Profiler.hpp" +#include + +namespace clp { +class ProfilerReporter { +public: + ProfilerReporter() { Profiler::init(); } + ~ProfilerReporter() { print_all_enabled_measurements(); } + + ProfilerReporter(const ProfilerReporter&) = delete; + ProfilerReporter& operator=(const ProfilerReporter&) = delete; + ProfilerReporter(ProfilerReporter&&) = delete; + ProfilerReporter& operator=(ProfilerReporter&&) = delete; + + auto print_all_enabled_measurements() -> void { + auto length{enum_to_underlying_type(Profiler::FragmentedMeasurementIndex::Length)}; + for (size_t i{0}; i < length; ++i) { + if (Profiler::cFragmentedMeasurementEnabled[i]) { + auto index{static_cast(i)}; + auto runtime{Profiler::get_fragmented_measurement_in_seconds_runtime(index)}; + SPDLOG_INFO("Measurement {}: {} s", i, runtime); + } + } + } +}; +} // namespace clp + +#endif // CLP_PROFILER_REPORT_HPP diff --git a/components/core/src/clp/ScopedProfiler.hpp b/components/core/src/clp/ScopedProfiler.hpp index 2420ad5eb0..ff0d7c659d 100644 --- a/components/core/src/clp/ScopedProfiler.hpp +++ b/components/core/src/clp/ScopedProfiler.hpp @@ -7,12 
+7,12 @@ namespace clp { /** * RAII wrapper to measure the execution time of a code scope. * - * This class starts a continuous measurement in its constructor and stops it in its destructor. It + * This class starts a fragmented measurement in its constructor and stops it in its destructor. It * reports the measured time to the corresponding slot in the profiler. Use this class when you want * to measure a single logical phase of your program (e.g., a method) without calling start/stop. * * Usage for a logical phase: - * - Define a unique measurement `index` in `Profiler::ContinuousMeasurementIndex`. Each `index` + * - Define a unique measurement `index` in `Profiler::FragmentedsMeasurementIndex`. Each `index` * corresponds to a slot in the profiler that accumulates total time. * - Use macro PROFILE_SCOPE(`index`) at the top of the logical phase, ideally this is always done * at the top of a method for organization and clarity. @@ -22,12 +22,12 @@ namespace clp { * - Safe with early returns and exceptions because stopping occurs in the destructor. * - All measurements respect `PROF_ENABLED`, so no code is generated when profiling is disabled. 
*/ -template +template class ScopedProfiler { public: - inline ScopedProfiler() { Profiler::start_continuous_measurement(); } + ScopedProfiler() { Profiler::start_fragmented_measurement(); } - inline ~ScopedProfiler() { Profiler::stop_continuous_measurement(); } + ~ScopedProfiler() { Profiler::stop_fragmented_measurement(); } ScopedProfiler(const ScopedProfiler&) = delete; ScopedProfiler& operator=(const ScopedProfiler&) = delete; diff --git a/components/core/src/clp/clp/CMakeLists.txt b/components/core/src/clp/clp/CMakeLists.txt index 4559edd835..153e9802c2 100644 --- a/components/core/src/clp/clp/CMakeLists.txt +++ b/components/core/src/clp/clp/CMakeLists.txt @@ -87,6 +87,7 @@ set( ../Platform.hpp ../Profiler.cpp ../Profiler.hpp + ../ProfilerReporter.hpp ../Query.cpp ../Query.hpp ../ReaderInterface.cpp diff --git a/components/core/tests/test-ProfilerReporter.cpp b/components/core/tests/test-ProfilerReporter.cpp new file mode 100644 index 0000000000..1490dc3ee3 --- /dev/null +++ b/components/core/tests/test-ProfilerReporter.cpp @@ -0,0 +1,9 @@ +#include + +#include + +using clp::ProfilerReporter; + +TEST_CASE("create_profiler_repoter", "[profiler]") { + ProfilerReporter profiler_reporter; +} \ No newline at end of file diff --git a/components/core/tests/test-ScopedProfiler.cpp b/components/core/tests/test-ScopedProfiler.cpp new file mode 100644 index 0000000000..db843364fe --- /dev/null +++ b/components/core/tests/test-ScopedProfiler.cpp @@ -0,0 +1,71 @@ +#include +#include + +#include + +#include +#include +#include + +using clp::Profiler; +using clp::Profiler; +using clp::ScopedProfiler; + +constexpr auto cIndex{Profiler::FragmentedMeasurementIndex::Search}; + +TEST_CASE("macro_is_set", "[profiler]") { + REQUIRE(PROF_ACTIVE == 1); +} + +TEST_CASE("measurement_index_is_set", "[profiler]") { + REQUIRE(Profiler::cFragmentedMeasurementEnabled[clp::enum_to_underlying_type(cIndex)]); +} + +TEST_CASE("scoped_profiler_starts_and_stops_timer_automatically", "[profiler]") 
{ + Profiler::init(); + Profiler::reset_fragmented_measurement(); + + { + ScopedProfiler profiler; + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + + auto measured{Profiler::get_fragmented_measurement_in_seconds()}; + REQUIRE(measured >= 0.05); + REQUIRE(measured < 0.14); +} + +TEST_CASE("scoped_profiler_accumulates_across_multiple_scopes", "[profiler]") { + Profiler::init(); + Profiler::reset_fragmented_measurement(); + + { + ScopedProfiler profiler; + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + + { + ScopedProfiler profiler; + std::this_thread::sleep_for(std::chrono::milliseconds(30)); + } + + auto measured{Profiler::get_fragmented_measurement_in_seconds()}; + REQUIRE(measured >= 0.05); + REQUIRE(measured < 0.14); +} + +TEST_CASE("scoped_profiler_macro_works", "[profiler]") { + Profiler::init(); + Profiler::reset_fragmented_measurement(); + + { + PROFILE_SCOPE(cIndex); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + + auto measured{Profiler::get_fragmented_measurement_in_seconds()}; + REQUIRE(measured >= 0.05); + REQUIRE(measured < 0.14); +} diff --git a/components/core/tests/test-Stopwatch.cpp b/components/core/tests/test-Stopwatch.cpp index 1a58df8661..66e0b3cb44 100644 --- a/components/core/tests/test-Stopwatch.cpp +++ b/components/core/tests/test-Stopwatch.cpp @@ -1,4 +1,5 @@ -#include +#include +#include #include @@ -15,7 +16,7 @@ TEST_CASE("Stopwatch", "[Stopwatch]") { SECTION("Test reset()") { // Measure some work stopwatch.start(); - sleep(1); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); stopwatch.stop(); stopwatch.reset(); @@ -27,30 +28,30 @@ TEST_CASE("Stopwatch", "[Stopwatch]") { SECTION("Test single measurement") { // Measure some work stopwatch.start(); - sleep(1); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); stopwatch.stop(); double time_taken = stopwatch.get_time_taken_in_seconds(); - 
REQUIRE(time_taken >= 1.0); - REQUIRE(time_taken < 1.1); + REQUIRE(time_taken >= 0.05); + REQUIRE(time_taken < 0.1); } SECTION("Test multiple measurements") { // Measure some work stopwatch.start(); - sleep(1); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); stopwatch.stop(); // Do some other work - sleep(1); + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // Measure some work again stopwatch.start(); - sleep(2); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); stopwatch.stop(); double time_taken = stopwatch.get_time_taken_in_seconds(); - REQUIRE(time_taken >= 3.0); - REQUIRE(time_taken < 3.1); + REQUIRE(time_taken >= 0.15); + REQUIRE(time_taken < 0.25); } } From 7bd1740b9beb4445b742fc8a96be6d744f6922ba Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Mar 2026 15:00:28 -0500 Subject: [PATCH 136/164] Use new profiling code. --- components/core/src/clp/Profiler.hpp | 41 +++---------------- components/core/src/clp/ScopedProfiler.hpp | 5 ++- components/core/src/clp/clg/clg.cpp | 13 +++--- components/core/src/clp/clo/clo.cpp | 5 ++- .../core/src/clp/clp/FileCompressor.cpp | 13 +++--- components/core/src/clp/clp/run.cpp | 14 +++---- 6 files changed, 30 insertions(+), 61 deletions(-) diff --git a/components/core/src/clp/Profiler.hpp b/components/core/src/clp/Profiler.hpp index 9d307dbb3d..f02cfd37a3 100644 --- a/components/core/src/clp/Profiler.hpp +++ b/components/core/src/clp/Profiler.hpp @@ -46,13 +46,12 @@ class Profiler { public: // Types enum class ContinuousMeasurementIndex : size_t { - Compression = 0, - ParseLogFile, - Search, Length }; enum class FragmentedMeasurementIndex : size_t { - Search = 0, + Compression = 0, + ParseLogFile, + Search, Length }; @@ -60,13 +59,12 @@ class Profiler { // NOTE: We use lambdas so that we can programmatically initialize the constexpr array static constexpr auto cContinuousMeasurementEnabled = []() { std::array enabled{}; - 
enabled[enum_to_underlying_type(ContinuousMeasurementIndex::Compression)] = true; - enabled[enum_to_underlying_type(ContinuousMeasurementIndex::ParseLogFile)] = true; - enabled[enum_to_underlying_type(ContinuousMeasurementIndex::Search)] = true; return enabled; }(); static constexpr auto cFragmentedMeasurementEnabled = []() { std::array enabled{}; + enabled[enum_to_underlying_type(FragmentedMeasurementIndex::Compression)] = true; + enabled[enum_to_underlying_type(FragmentedMeasurementIndex::ParseLogFile)] = true; enabled[enum_to_underlying_type(FragmentedMeasurementIndex::Search)] = true; return enabled; }(); @@ -162,33 +160,4 @@ class Profiler { static std::vector* m_continuous_measurements; }; } // namespace clp - -// Macros to log the measurements -// NOTE: We use macros so that we can add the measurement index to the log (not easy to do with -// templates). -#define LOG_CONTINUOUS_MEASUREMENT(x) \ - if (PROF_ACTIVE \ - && ::clp::Profiler::cContinuousMeasurementEnabled[enum_to_underlying_type(x)]) \ - { \ - SPDLOG_INFO( \ - "{} took {} s", \ - #x, \ - ::clp::Profiler::get_continuous_measurement_in_seconds() \ - ); \ - } -#define LOG_FRAGMENTED_MEASUREMENT(x) \ - if (PROF_ACTIVE \ - && ::clp::Profiler::cFragmentedMeasurementEnabled[enum_to_underlying_type(x)]) \ - { \ - SPDLOG_INFO( \ - "{} took {} s", \ - #x, \ - ::clp::Profiler::get_fragmented_measurement_in_seconds() \ - ); \ - } -#define PROFILER_SPDLOG_INFO(...) 
\ - if (PROF_ACTIVE) { \ - SPDLOG_INFO(__VA_ARGS__); \ - } - #endif // CLP_PROFILER_HPP diff --git a/components/core/src/clp/ScopedProfiler.hpp b/components/core/src/clp/ScopedProfiler.hpp index ff0d7c659d..5668aa4ba1 100644 --- a/components/core/src/clp/ScopedProfiler.hpp +++ b/components/core/src/clp/ScopedProfiler.hpp @@ -25,7 +25,10 @@ namespace clp { template class ScopedProfiler { public: - ScopedProfiler() { Profiler::start_fragmented_measurement(); } + ScopedProfiler() { + Profiler::init(); + Profiler::start_fragmented_measurement(); + } ~ScopedProfiler() { Profiler::stop_fragmented_measurement(); } diff --git a/components/core/src/clp/clg/clg.cpp b/components/core/src/clp/clg/clg.cpp index 7122a948a4..689070b9b2 100644 --- a/components/core/src/clp/clg/clg.cpp +++ b/components/core/src/clp/clg/clg.cpp @@ -13,6 +13,8 @@ #include "../Grep.hpp" #include "../GrepCore.hpp" #include "../Profiler.hpp" +#include "../ProfilerReporter.hpp" +#include "../ScopedProfiler.hpp" #include "../spdlog_with_specializations.hpp" #include "../streaming_archive/Constants.hpp" #include "../Utils.hpp" @@ -31,6 +33,7 @@ using clp::GrepCore; using clp::load_lexer_from_file; using clp::logtype_dictionary_id_t; using clp::Profiler; +using clp::ProfilerReporter; using clp::Query; using clp::segment_id_t; using clp::streaming_archive::MetadataDB; @@ -475,6 +478,8 @@ static void print_result_binary( } int main(int argc, char const* argv[]) { + PROFILE_SCOPE(Profiler::FragmentedMeasurementIndex::Search); + // Program-wide initialization try { auto stderr_logger = spdlog::stderr_logger_st("stderr"); @@ -484,7 +489,7 @@ int main(int argc, char const* argv[]) { // NOTE: We can't log an exception if the logger couldn't be constructed return -1; } - Profiler::init(); + ProfilerReporter profiler_reporter; clp::TimestampPattern::init(); CommandLineArguments command_line_args("clg"); @@ -499,8 +504,6 @@ int main(int argc, char const* argv[]) { break; } - Profiler::start_continuous_measurement(); 
- auto add_implicit_wildcards = [](string const& search_string) -> string { return clean_up_wildcard_search_string('*' + search_string + '*'); }; @@ -612,9 +615,5 @@ int main(int argc, char const* argv[]) { } global_metadata_db->close(); - - Profiler::stop_continuous_measurement(); - LOG_CONTINUOUS_MEASUREMENT(Profiler::ContinuousMeasurementIndex::Search) - return 0; } diff --git a/components/core/src/clp/clo/clo.cpp b/components/core/src/clp/clo/clo.cpp index 94b9e793b6..3b07734211 100644 --- a/components/core/src/clp/clo/clo.cpp +++ b/components/core/src/clp/clo/clo.cpp @@ -15,7 +15,7 @@ #include "../Grep.hpp" #include "../GrepCore.hpp" #include "../ir/constants.hpp" -#include "../Profiler.hpp" +#include "../ProfilerReporter.hpp" #include "../spdlog_with_specializations.hpp" #include "../Utils.hpp" #include "CommandLineArguments.hpp" @@ -42,6 +42,7 @@ using clp::GrepCore; using clp::ir::cIrFileExtension; using clp::load_lexer_from_file; using clp::logtype_dictionary_id_t; +using clp::ProfilerReporter; using clp::Query; using clp::segment_id_t; using clp::streaming_archive::MetadataDB; @@ -580,7 +581,7 @@ int main(int argc, char const* argv[]) { // NOTE: We can't log an exception if the logger couldn't be constructed return -1; } - clp::Profiler::init(); + ProfilerReporter profiler_reporter; clp::TimestampPattern::init(); CommandLineArguments command_line_args("clo"); diff --git a/components/core/src/clp/clp/FileCompressor.cpp b/components/core/src/clp/clp/FileCompressor.cpp index b685b22ac9..aabf27dada 100644 --- a/components/core/src/clp/clp/FileCompressor.cpp +++ b/components/core/src/clp/clp/FileCompressor.cpp @@ -18,6 +18,7 @@ #include "../ir/utils.hpp" #include "../LogSurgeonReader.hpp" #include "../Profiler.hpp" +#include "../ScopedProfiler.hpp" #include "../streaming_archive/writer/utils.hpp" #include "../utf8_utils.hpp" #include "utils.hpp" @@ -27,6 +28,8 @@ using clp::ir::four_byte_encoded_variable_t; using clp::ir::has_ir_stream_magic_number; using 
clp::ir::LogEventDeserializer; using clp::ParsedMessage; +using clp::Profiler; +using clp::ScopedProfiler; using clp::streaming_archive::writer::split_archive; using clp::streaming_archive::writer::split_file; using clp::streaming_archive::writer::split_file_and_archive; @@ -122,10 +125,9 @@ bool FileCompressor::compress_file( streaming_archive::writer::Archive& archive_writer, bool use_heuristic ) { - string file_name = std::filesystem::canonical(file_to_compress.get_path()).string(); + PROFILE_SCOPE(Profiler::FragmentedMeasurementIndex::ParseLogFile); - PROFILER_SPDLOG_INFO("Start parsing {}", file_name) - Profiler::start_continuous_measurement(); + string file_name = std::filesystem::canonical(file_to_compress.get_path()).string(); BufferedReader buffered_file_reader{make_unique(file_to_compress.get_path())}; @@ -179,11 +181,6 @@ bool FileCompressor::compress_file( succeeded = false; } } - - Profiler::stop_continuous_measurement(); - LOG_CONTINUOUS_MEASUREMENT(Profiler::ContinuousMeasurementIndex::ParseLogFile) - PROFILER_SPDLOG_INFO("Done parsing {}", file_name) - return succeeded; } diff --git a/components/core/src/clp/clp/run.cpp b/components/core/src/clp/clp/run.cpp index e1e73bf11f..f612b924ec 100644 --- a/components/core/src/clp/clp/run.cpp +++ b/components/core/src/clp/clp/run.cpp @@ -6,6 +6,8 @@ #include #include "../Profiler.hpp" +#include "../ProfilerReporter.hpp" +#include "../ScopedProfiler.hpp" #include "../spdlog_with_specializations.hpp" #include "../Utils.hpp" #include "CommandLineArguments.hpp" @@ -13,12 +15,16 @@ #include "decompression.hpp" #include "utils.hpp" +using clp::Profiler; +using clp::ProfilerReporter; using std::string; using std::unordered_set; using std::vector; namespace clp::clp { int run(int argc, char const* argv[]) { + PROFILE_SCOPE(Profiler::FragmentedMeasurementIndex::Compression); + // Program-wide initialization try { auto stderr_logger = spdlog::stderr_logger_st("stderr"); @@ -28,7 +34,7 @@ int run(int argc, char const* 
argv[]) { // NOTE: We can't log an exception if the logger couldn't be constructed return -1; } - Profiler::init(); + ProfilerReporter profiler_reporter; TimestampPattern::init(); CommandLineArguments command_line_args("clp"); @@ -45,8 +51,6 @@ int run(int argc, char const* argv[]) { vector input_paths = command_line_args.get_input_paths(); - Profiler::start_continuous_measurement(); - // Read input paths from file if necessary if (false == command_line_args.get_path_list_path().empty()) { if (false == read_input_paths(command_line_args.get_path_list_path(), input_paths)) { @@ -173,10 +177,6 @@ int run(int argc, char const* argv[]) { SPDLOG_ERROR("Command {} not implemented.", enum_to_underlying_type(command)); return -1; } - - Profiler::stop_continuous_measurement(); - LOG_CONTINUOUS_MEASUREMENT(Profiler::ContinuousMeasurementIndex::Compression) - return 0; } } // namespace clp::clp From 74778c16a97b0050a631d095daebc99b56b848f4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Mar 2026 16:12:15 -0500 Subject: [PATCH 137/164] Combine continuous and fragmented measurements; Separate runtime and compile-time measurements.
--- components/core/src/clp/Profiler.cpp | 8 +- components/core/src/clp/Profiler.hpp | 143 ++++++++---------- components/core/src/clp/ProfilerReporter.hpp | 14 +- components/core/src/clp/ScopedProfiler.hpp | 20 +-- components/core/src/clp/clg/clg.cpp | 2 +- .../core/src/clp/clp/FileCompressor.cpp | 2 +- components/core/src/clp/clp/run.cpp | 4 +- components/core/tests/test-ScopedProfiler.cpp | 26 ++-- 8 files changed, 93 insertions(+), 126 deletions(-) diff --git a/components/core/src/clp/Profiler.cpp b/components/core/src/clp/Profiler.cpp index 784fbdd613..bc64a07e92 100644 --- a/components/core/src/clp/Profiler.cpp +++ b/components/core/src/clp/Profiler.cpp @@ -1,11 +1,9 @@ #include "Profiler.hpp" -#include - -using std::unique_ptr; using std::vector; namespace clp { -vector* Profiler::m_fragmented_measurements = nullptr; -vector* Profiler::m_continuous_measurements = nullptr; +std::unordered_map Profiler::m_runtime_measurements; +vector* Profiler::m_compile_time_measurements = nullptr; +bool Profiler::m_initialized = false; } // namespace clp diff --git a/components/core/src/clp/Profiler.hpp b/components/core/src/clp/Profiler.hpp index f02cfd37a3..f0d6420d2e 100644 --- a/components/core/src/clp/Profiler.hpp +++ b/components/core/src/clp/Profiler.hpp @@ -8,8 +8,11 @@ #endif #include +#include #include +#include + #include "Stopwatch.hpp" #include "type_utils.hpp" @@ -17,20 +20,16 @@ namespace clp { /** * Class to time code. * - * There are two types of measurements: - * - Continuous measurements where a user needs to time a single, continuous operation. - * - Fragmented measurements where a user needs to time multiple, separated instances of an - * operation. For example if we want to get the total run time taken for inserting entries into a - * dictionary, we could wrap the insertion with a fragmented measurement. 
- * - * To add a measurement, add it to the ContinuousMeasurementIndex or FragmentedMeasurementIndex - * enums and add a corresponding enable flag to cContinuousMeasurementEnabled or - * cFragmentedMeasurementEnabled. The flags allow enabling/disabling specific measurements such that - * a disabled measurement will not affect the performance of the program (except for extra heap - * storage). + * A Measurement can be taken over a single continuous operation, or called multiple times to + * accumulate fragemented measurements into a single total run time. * - * To log a measurement, use LOG_CONTINUOUS_MEASUREMENT or LOG_FRAGMENTED_MEASUREMENT, passing in - * the relevant measurement index enum. + * There are two ways to add a measurement: + * 1. For measurements that are taken a small number of times use a runtime measurement. + * 2. For measurements that are in hot loops, use a compile-time measurement, such that when it is + * disabled it has zero-overhead. In this case to add a measurement, add it to the + * MeasurementIndex enum and add a corresponding enable flag to cMeasurementEnabled. The flags + * allow enabling/disabling specific measurements such that a disabled measurement will not + * affect the performance of the program (except for extra heap storage). * * Two implementation details allow this class to avoid inducing overhead when profiling is * disabled: @@ -38,34 +37,20 @@ namespace clp { * profiling is disabled, the compiler will detect the empty body and won't add any code to the * binary; if the methods were instead defined in the .cpp file, the compiler would still generate * an empty method. - * - The methods use the measurement enum as a template parameter to indicate which measurement the - * method call is for. So at compile-time, for each measurement, the compiler can use the enable - * flag to determine whether to generate code to do the measurement or whether to do nothing. 
+ * - The compile-time methods use the measurement enum as a template parameter to indicate which + * measurement the method call is for. So at compile-time, for each measurement, the compiler can + * use the enable flag to determine whether to generate code to do the measurement or whether to + * do nothing. */ class Profiler { public: - // Types - enum class ContinuousMeasurementIndex : size_t { - Length - }; - enum class FragmentedMeasurementIndex : size_t { - Compression = 0, - ParseLogFile, - Search, + enum class CompileTimeMeasurementIndex : uint8_t { Length }; - // Constants // NOTE: We use lambdas so that we can programmatically initialize the constexpr array - static constexpr auto cContinuousMeasurementEnabled = []() { - std::array enabled{}; - return enabled; - }(); - static constexpr auto cFragmentedMeasurementEnabled = []() { - std::array enabled{}; - enabled[enum_to_underlying_type(FragmentedMeasurementIndex::Compression)] = true; - enabled[enum_to_underlying_type(FragmentedMeasurementIndex::ParseLogFile)] = true; - enabled[enum_to_underlying_type(FragmentedMeasurementIndex::Search)] = true; + static constexpr auto cMeasurementEnabled = []() { + std::array enabled{}; return enabled; }(); @@ -75,80 +60,77 @@ class Profiler { */ static void init() { if constexpr (PROF_ACTIVE) { - m_continuous_measurements = new std::vector( - enum_to_underlying_type(ContinuousMeasurementIndex::Length) - ); - m_fragmented_measurements = new std::vector( - enum_to_underlying_type(FragmentedMeasurementIndex::Length) + if (m_initialized) { + return; + } + m_initialized = true; + m_compile_time_measurements = new std::vector( + enum_to_underlying_type(CompileTimeMeasurementIndex::Length) ); } } - template - static void start_continuous_measurement() { - if constexpr (PROF_ACTIVE && cContinuousMeasurementEnabled[enum_to_underlying_type(index)]) - { - auto& stopwatch = (*m_continuous_measurements)[enum_to_underlying_type(index)]; - stopwatch.reset(); - stopwatch.start(); + 
static auto start_runtime_measurement(std::string const& name) -> void { + if constexpr (PROF_ACTIVE) { + // implicitly creates the timer if it doesn't exist yet + m_runtime_measurements[name].start(); } } - template - static void stop_continuous_measurement() { - if constexpr (PROF_ACTIVE && cContinuousMeasurementEnabled[enum_to_underlying_type(index)]) - { - (*m_continuous_measurements)[enum_to_underlying_type(index)].stop(); + static auto stop_runtime_measurement(std::string const& name) -> void { + if constexpr (PROF_ACTIVE) { + m_runtime_measurements[name].stop(); } } - template - static double get_continuous_measurement_in_seconds() { + static auto reset_runtime_measurement(std::string const& name) -> void { if constexpr (PROF_ACTIVE) { - return (*m_continuous_measurements)[enum_to_underlying_type(index)] - .get_time_taken_in_seconds(); + m_runtime_measurements[name].reset(); + } + } + + static auto get_runtime_measurement_in_seconds(std::string const& name) -> double { + if constexpr (PROF_ACTIVE) { + return m_runtime_measurements[name].get_time_taken_in_seconds(); } else { return 0; } } - template - static void start_fragmented_measurement() { - if constexpr (PROF_ACTIVE && cFragmentedMeasurementEnabled[enum_to_underlying_type(index)]) - { - (*m_fragmented_measurements)[enum_to_underlying_type(index)].start(); + static auto print_all_runtime_measurements() -> void { + for (auto const& [name, stopwatch] : m_runtime_measurements) { + SPDLOG_INFO("Measurement {}: {} s", name, get_runtime_measurement_in_seconds(name)); } } - template - static void stop_fragmented_measurement() { - if constexpr (PROF_ACTIVE && cFragmentedMeasurementEnabled[enum_to_underlying_type(index)]) + template + static void start_compile_time_measurement() { + if constexpr (PROF_ACTIVE && cMeasurementEnabled[enum_to_underlying_type(index)]) { - (*m_fragmented_measurements)[enum_to_underlying_type(index)].stop(); + (*m_compile_time_measurements)[enum_to_underlying_type(index)].start(); } } 
- template - static void reset_fragmented_measurement() { - if constexpr (PROF_ACTIVE && cFragmentedMeasurementEnabled[enum_to_underlying_type(index)]) + template + static void stop_compile_time_measurement() { + if constexpr (PROF_ACTIVE && cMeasurementEnabled[enum_to_underlying_type(index)]) { - (*m_fragmented_measurements)[enum_to_underlying_type(index)].reset(); + (*m_compile_time_measurements)[enum_to_underlying_type(index)].stop(); } } - template - static double get_fragmented_measurement_in_seconds() { - if constexpr (PROF_ACTIVE) { - return (*m_fragmented_measurements)[enum_to_underlying_type(index)] - .get_time_taken_in_seconds(); - } else { - return 0; + template + static void reset_compile_time_measurement() { + if constexpr (PROF_ACTIVE && cMeasurementEnabled[enum_to_underlying_type(index)]) + { + (*m_compile_time_measurements)[enum_to_underlying_type(index)].reset(); } } - static double get_fragmented_measurement_in_seconds_runtime(FragmentedMeasurementIndex index) { - if constexpr (PROF_ACTIVE) { - return (*m_fragmented_measurements)[enum_to_underlying_type(index)] + template + static double get_compile_time_measurement_in_seconds() { + if constexpr (PROF_ACTIVE && cMeasurementEnabled[enum_to_underlying_type(index)]) { + return (*m_compile_time_measurements)[enum_to_underlying_type(index)] .get_time_taken_in_seconds(); } else { return 0; @@ -156,8 +138,9 @@ class Profiler { } private: - static std::vector* m_fragmented_measurements; - static std::vector* m_continuous_measurements; + static std::unordered_map m_runtime_measurements; + static std::vector* m_compile_time_measurements; + static bool m_initialized; }; } // namespace clp #endif // CLP_PROFILER_HPP diff --git a/components/core/src/clp/ProfilerReporter.hpp b/components/core/src/clp/ProfilerReporter.hpp index 019335a917..d54d1463ff 100644 --- a/components/core/src/clp/ProfilerReporter.hpp +++ b/components/core/src/clp/ProfilerReporter.hpp @@ -2,29 +2,17 @@ #define CLP_PROFILER_REPORT_HPP 
#include "Profiler.hpp" -#include namespace clp { class ProfilerReporter { public: ProfilerReporter() { Profiler::init(); } - ~ProfilerReporter() { print_all_enabled_measurements(); } + ~ProfilerReporter() { Profiler::print_all_runtime_measurements(); } ProfilerReporter(const ProfilerReporter&) = delete; ProfilerReporter& operator=(const ProfilerReporter&) = delete; ProfilerReporter(ProfilerReporter&&) = delete; ProfilerReporter& operator=(ProfilerReporter&&) = delete; - - auto print_all_enabled_measurements() -> void { - auto length{enum_to_underlying_type(Profiler::FragmentedMeasurementIndex::Length)}; - for (size_t i{0}; i < length; ++i) { - if (Profiler::cFragmentedMeasurementEnabled[i]) { - auto index{static_cast(i)}; - auto runtime{Profiler::get_fragmented_measurement_in_seconds_runtime(index)}; - SPDLOG_INFO("Measurement {}: {} s", i, runtime); - } - } - } }; } // namespace clp diff --git a/components/core/src/clp/ScopedProfiler.hpp b/components/core/src/clp/ScopedProfiler.hpp index 5668aa4ba1..8e41e77c6b 100644 --- a/components/core/src/clp/ScopedProfiler.hpp +++ b/components/core/src/clp/ScopedProfiler.hpp @@ -1,20 +1,20 @@ #ifndef CLP_SCOPED_PROFILER_HPP #define CLP_SCOPED_PROFILER_HPP +#include + #include "Profiler.hpp" namespace clp { /** * RAII wrapper to measure the execution time of a code scope. * - * This class starts a fragmented measurement in its constructor and stops it in its destructor. It + * This class starts a runtime measurement in its constructor and stops it in its destructor. It * reports the measured time to the corresponding slot in the profiler. Use this class when you want * to measure a single logical phase of your program (e.g., a method) without calling start/stop. * * Usage for a logical phase: - * - Define a unique measurement `index` in `Profiler::FragmentedsMeasurementIndex`. Each `index` - * corresponds to a slot in the profiler that accumulates total time. 
- * - Use macro PROFILE_SCOPE(`index`) at the top of the logical phase, ideally this is always done + * - Use macro PROFILE_SCOPE(`name`) at the top of the logical phase, ideally this is always done * at the top of a method for organization and clarity. * - Set `DPROF_ENABLED=1` in `cmakelists`. * @@ -22,20 +22,22 @@ namespace clp { * - Safe with early returns and exceptions because stopping occurs in the destructor. * - All measurements respect `PROF_ENABLED`, so no code is generated when profiling is disabled. */ -template class ScopedProfiler { public: - ScopedProfiler() { + ScopedProfiler(std::string const& name) : m_name(name) { Profiler::init(); - Profiler::start_fragmented_measurement(); + Profiler::start_runtime_measurement(name); } - ~ScopedProfiler() { Profiler::stop_fragmented_measurement(); } + ~ScopedProfiler() { Profiler::stop_runtime_measurement(m_name); } ScopedProfiler(const ScopedProfiler&) = delete; ScopedProfiler& operator=(const ScopedProfiler&) = delete; ScopedProfiler(ScopedProfiler&&) = delete; ScopedProfiler& operator=(ScopedProfiler&&) = delete; + +private: + std::string m_name; }; } // namespace clp @@ -43,6 +45,6 @@ class ScopedProfiler { #define CLP_CONCAT(x, y) CLP_CONCAT_IMPL(x, y) -#define PROFILE_SCOPE(x) ::clp::ScopedProfiler CLP_CONCAT(__clp_profile_scope_, __LINE__) +#define PROFILE_SCOPE(x) ::clp::ScopedProfiler CLP_CONCAT(__clp_profile_scope_, __LINE__){x} #endif // CLP_SCOPED_PROFILER_HPP diff --git a/components/core/src/clp/clg/clg.cpp b/components/core/src/clp/clg/clg.cpp index 689070b9b2..604e131eb6 100644 --- a/components/core/src/clp/clg/clg.cpp +++ b/components/core/src/clp/clg/clg.cpp @@ -478,7 +478,7 @@ static void print_result_binary( } int main(int argc, char const* argv[]) { - PROFILE_SCOPE(Profiler::FragmentedMeasurementIndex::Search); + PROFILE_SCOPE("clg::main"); // Program-wide initialization try { diff --git a/components/core/src/clp/clp/FileCompressor.cpp b/components/core/src/clp/clp/FileCompressor.cpp index 
aabf27dada..41e6e8f1e0 100644 --- a/components/core/src/clp/clp/FileCompressor.cpp +++ b/components/core/src/clp/clp/FileCompressor.cpp @@ -125,7 +125,7 @@ bool FileCompressor::compress_file( streaming_archive::writer::Archive& archive_writer, bool use_heuristic ) { - PROFILE_SCOPE(Profiler::FragmentedMeasurementIndex::ParseLogFile); + PROFILE_SCOPE("FileCompressor::compress_file"); string file_name = std::filesystem::canonical(file_to_compress.get_path()).string(); diff --git a/components/core/src/clp/clp/run.cpp b/components/core/src/clp/clp/run.cpp index f612b924ec..62900d2f63 100644 --- a/components/core/src/clp/clp/run.cpp +++ b/components/core/src/clp/clp/run.cpp @@ -23,7 +23,8 @@ using std::vector; namespace clp::clp { int run(int argc, char const* argv[]) { - PROFILE_SCOPE(Profiler::FragmentedMeasurementIndex::Compression); + ProfilerReporter profiler_reporter; + PROFILE_SCOPE("clp::main"); // Program-wide initialization try { @@ -34,7 +35,6 @@ int run(int argc, char const* argv[]) { // NOTE: We can't log an exception if the logger couldn't be constructed return -1; } - ProfilerReporter profiler_reporter; TimestampPattern::init(); CommandLineArguments command_line_args("clp"); diff --git a/components/core/tests/test-ScopedProfiler.cpp b/components/core/tests/test-ScopedProfiler.cpp index db843364fe..311cdc14ce 100644 --- a/components/core/tests/test-ScopedProfiler.cpp +++ b/components/core/tests/test-ScopedProfiler.cpp @@ -11,61 +11,57 @@ using clp::Profiler; using clp::Profiler; using clp::ScopedProfiler; -constexpr auto cIndex{Profiler::FragmentedMeasurementIndex::Search}; +constexpr auto cName{"ProfileName"}; TEST_CASE("macro_is_set", "[profiler]") { REQUIRE(PROF_ACTIVE == 1); } -TEST_CASE("measurement_index_is_set", "[profiler]") { - REQUIRE(Profiler::cFragmentedMeasurementEnabled[clp::enum_to_underlying_type(cIndex)]); -} - TEST_CASE("scoped_profiler_starts_and_stops_timer_automatically", "[profiler]") { Profiler::init(); - 
Profiler::reset_fragmented_measurement(); + Profiler::reset_runtime_measurement(cName); { - ScopedProfiler profiler; + ScopedProfiler profiler(cName); std::this_thread::sleep_for(std::chrono::milliseconds(50)); } - auto measured{Profiler::get_fragmented_measurement_in_seconds()}; + auto const measured{Profiler::get_runtime_measurement_in_seconds(cName)}; REQUIRE(measured >= 0.05); REQUIRE(measured < 0.14); } TEST_CASE("scoped_profiler_accumulates_across_multiple_scopes", "[profiler]") { Profiler::init(); - Profiler::reset_fragmented_measurement(); + Profiler::reset_runtime_measurement(cName); { - ScopedProfiler profiler; + ScopedProfiler profiler(cName); std::this_thread::sleep_for(std::chrono::milliseconds(20)); } std::this_thread::sleep_for(std::chrono::milliseconds(200)); { - ScopedProfiler profiler; + ScopedProfiler profiler(cName); std::this_thread::sleep_for(std::chrono::milliseconds(30)); } - auto measured{Profiler::get_fragmented_measurement_in_seconds()}; + auto const measured{Profiler::get_runtime_measurement_in_seconds(cName)}; REQUIRE(measured >= 0.05); REQUIRE(measured < 0.14); } TEST_CASE("scoped_profiler_macro_works", "[profiler]") { Profiler::init(); - Profiler::reset_fragmented_measurement(); + Profiler::reset_runtime_measurement(cName); { - PROFILE_SCOPE(cIndex); + PROFILE_SCOPE(cName); std::this_thread::sleep_for(std::chrono::milliseconds(50)); } - auto measured{Profiler::get_fragmented_measurement_in_seconds()}; + auto const measured{Profiler::get_runtime_measurement_in_seconds(cName)}; REQUIRE(measured >= 0.05); REQUIRE(measured < 0.14); } From ab0da8a6fe862bb7b111fb869b07ee49935eca98 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Mar 2026 16:27:10 -0500 Subject: [PATCH 138/164] Add checks for init and timer existance. 
--- components/core/src/clp/Profiler.hpp | 62 +++++++++++++++++++++++++--- 1 file changed, 56 insertions(+), 6 deletions(-) diff --git a/components/core/src/clp/Profiler.hpp b/components/core/src/clp/Profiler.hpp index f0d6420d2e..aad69f560d 100644 --- a/components/core/src/clp/Profiler.hpp +++ b/components/core/src/clp/Profiler.hpp @@ -70,8 +70,32 @@ class Profiler { } } + static auto check_init() -> bool { + if constexpr (PROF_ACTIVE) { + if (false == m_initialized) { + SPDLOG_ERROR("Profiler used without calling Profiler::init()"); + } + return m_initialized; + } + return false; + } + + static auto check_runtime_timer_exists(std::string const& name) -> bool { + if constexpr (PROF_ACTIVE) { + if (false == m_runtime_measurements.contains(name)) { + SPDLOG_ERROR("Attempt to get runtime measurment of non existent timer {}", name); + return false;; + } + return true; + } + return false; + } + static auto start_runtime_measurement(std::string const& name) -> void { if constexpr (PROF_ACTIVE) { + if (false == check_init()) { + return; + } // implicitly creates the timer if it doesn't exist yet m_runtime_measurements[name].start(); } @@ -79,18 +103,27 @@ class Profiler { static auto stop_runtime_measurement(std::string const& name) -> void { if constexpr (PROF_ACTIVE) { + if (false == check_init() || false == check_runtime_timer_exists(name)) { + return; + } m_runtime_measurements[name].stop(); } } static auto reset_runtime_measurement(std::string const& name) -> void { if constexpr (PROF_ACTIVE) { + if (false == check_init() || false == check_runtime_timer_exists(name)) { + return; + } m_runtime_measurements[name].reset(); } } static auto get_runtime_measurement_in_seconds(std::string const& name) -> double { if constexpr (PROF_ACTIVE) { + if (false == check_init() || false == check_runtime_timer_exists(name)) { + return 0; + } return m_runtime_measurements[name].get_time_taken_in_seconds(); } else { return 0; @@ -98,38 +131,55 @@ class Profiler { } static auto 
print_all_runtime_measurements() -> void { - for (auto const& [name, stopwatch] : m_runtime_measurements) { - SPDLOG_INFO("Measurement {}: {} s", name, get_runtime_measurement_in_seconds(name)); + if constexpr (PROF_ACTIVE) { + if (false == check_init()) { + return; + } + for (auto const& [name, stopwatch] : m_runtime_measurements) { + SPDLOG_INFO("Measurement {}: {} s", name, get_runtime_measurement_in_seconds(name)); + } } } template - static void start_compile_time_measurement() { + static auto start_compile_time_measurement() -> void { if constexpr (PROF_ACTIVE && cMeasurementEnabled[enum_to_underlying_type(index)]) { + if (false == check_init()) { + return; + } (*m_compile_time_measurements)[enum_to_underlying_type(index)].start(); } } template - static void stop_compile_time_measurement() { + static auto stop_compile_time_measurement() -> void { if constexpr (PROF_ACTIVE && cMeasurementEnabled[enum_to_underlying_type(index)]) { + if (false == check_init()) { + return; + } (*m_compile_time_measurements)[enum_to_underlying_type(index)].stop(); } } template - static void reset_compile_time_measurement() { + static auto reset_compile_time_measurement() -> void { if constexpr (PROF_ACTIVE && cMeasurementEnabled[enum_to_underlying_type(index)]) { + if (false == check_init()) { + return; + } (*m_compile_time_measurements)[enum_to_underlying_type(index)].reset(); } } template - static double get_compile_time_measurement_in_seconds() { + static auto get_compile_time_measurement_in_seconds() -> double { if constexpr (PROF_ACTIVE && cMeasurementEnabled[enum_to_underlying_type(index)]) { + if (false == check_init()) { + return 0; + } return (*m_compile_time_measurements)[enum_to_underlying_type(index)] .get_time_taken_in_seconds(); } else { From f5b80b05317ff205ec4d71040ea9e701377148c0 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Mar 2026 16:35:29 -0500 Subject: [PATCH 139/164] Add ProfilerReported documentation. 
--- components/core/src/clp/ProfilerReporter.hpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/components/core/src/clp/ProfilerReporter.hpp b/components/core/src/clp/ProfilerReporter.hpp index d54d1463ff..ba878a7c7b 100644 --- a/components/core/src/clp/ProfilerReporter.hpp +++ b/components/core/src/clp/ProfilerReporter.hpp @@ -4,6 +4,23 @@ #include "Profiler.hpp" namespace clp { +/** + * RAII helper for automatically reporting all runtime measurements at scope exit. + * + * This class is designed to simplify the reporting of runtime measurements captured using the + * `Profiler` class. By creating an instance of `ProfilerReporter`, all runtime measurements will + * automatically be printed when the object goes out of scope. + * + * Usage: + * - Define a `ProfilerReporter` at any logical unit that encompasses all operations you want to + * profile. A common place is in the `main()` function of an executables. + * - Once the object is destructured (scope exit), the runtime measurmenets are reported. + * + * Notes: + * - Only runtime measurements (those tracked by string names) are reported. This class is + * primarily designed to work in tandem with the `ScopedProfiler` class. + * - Copy and move operations are removed to prevent accidentaly multiple reporting. + */ class ProfilerReporter { public: ProfilerReporter() { Profiler::init(); } From 4cddd715b1430a76cbee2a59d2b1463d95244400 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Mar 2026 17:13:49 -0500 Subject: [PATCH 140/164] Add unit tests; Make reset reset all runtime measurments. 
--- components/core/src/clp/Profiler.hpp | 7 +--- .../core/tests/test-ProfilerReporter.cpp | 41 ++++++++++++++++++- components/core/tests/test-ScopedProfiler.cpp | 6 +-- 3 files changed, 44 insertions(+), 10 deletions(-) diff --git a/components/core/src/clp/Profiler.hpp b/components/core/src/clp/Profiler.hpp index aad69f560d..0005a65271 100644 --- a/components/core/src/clp/Profiler.hpp +++ b/components/core/src/clp/Profiler.hpp @@ -110,12 +110,9 @@ class Profiler { } } - static auto reset_runtime_measurement(std::string const& name) -> void { + static auto reset_runtime_measurements() -> void { if constexpr (PROF_ACTIVE) { - if (false == check_init() || false == check_runtime_timer_exists(name)) { - return; - } - m_runtime_measurements[name].reset(); + m_runtime_measurements.clear(); } } diff --git a/components/core/tests/test-ProfilerReporter.cpp b/components/core/tests/test-ProfilerReporter.cpp index 1490dc3ee3..7a38101a0b 100644 --- a/components/core/tests/test-ProfilerReporter.cpp +++ b/components/core/tests/test-ProfilerReporter.cpp @@ -1,9 +1,46 @@ +#include +#include + #include +#include #include +#include +using clp::Profiler; using clp::ProfilerReporter; +using clp::ScopedProfiler; + +TEST_CASE("profiler_reporter_reports_runtime_measurements", "[profiler]") { + Profiler::reset_runtime_measurements(); + { + ProfilerReporter outter_profiler_reporter; + PROFILE_SCOPE("scope0"); + + { + PROFILE_SCOPE("scope1"); + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + ProfilerReporter inner_profiler_reporter; + } + + { + PROFILE_SCOPE("scope2"); + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + ProfilerReporter inner_profiler_reporter; + } + + { + PROFILE_SCOPE("scope3"); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + ProfilerReporter inner_profiler_reporter; + } + + { + PROFILE_SCOPE("scope3"); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + ProfilerReporter inner_profiler_reporter; + } 
-TEST_CASE("create_profiler_repoter", "[profiler]") { - ProfilerReporter profiler_reporter; + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } } \ No newline at end of file diff --git a/components/core/tests/test-ScopedProfiler.cpp b/components/core/tests/test-ScopedProfiler.cpp index 311cdc14ce..f45e4ded52 100644 --- a/components/core/tests/test-ScopedProfiler.cpp +++ b/components/core/tests/test-ScopedProfiler.cpp @@ -19,7 +19,7 @@ TEST_CASE("macro_is_set", "[profiler]") { TEST_CASE("scoped_profiler_starts_and_stops_timer_automatically", "[profiler]") { Profiler::init(); - Profiler::reset_runtime_measurement(cName); + Profiler::reset_runtime_measurements(); { ScopedProfiler profiler(cName); @@ -33,7 +33,7 @@ TEST_CASE("scoped_profiler_starts_and_stops_timer_automatically", "[profiler]") TEST_CASE("scoped_profiler_accumulates_across_multiple_scopes", "[profiler]") { Profiler::init(); - Profiler::reset_runtime_measurement(cName); + Profiler::reset_runtime_measurements(); { ScopedProfiler profiler(cName); @@ -54,7 +54,7 @@ TEST_CASE("scoped_profiler_accumulates_across_multiple_scopes", "[profiler]") { TEST_CASE("scoped_profiler_macro_works", "[profiler]") { Profiler::init(); - Profiler::reset_runtime_measurement(cName); + Profiler::reset_runtime_measurements(); { PROFILE_SCOPE(cName); From 76955b6393e92e4437f5aaabf53af452ed7d53b4 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Mar 2026 17:23:12 -0500 Subject: [PATCH 141/164] Make sure profiler reporter is called first. 
--- components/core/src/clp/ProfilerReporter.hpp | 2 ++ components/core/src/clp/clg/clg.cpp | 2 +- components/core/src/clp/clo/clo.cpp | 3 ++- components/core/tests/test-ProfilerReporter.cpp | 8 ++++---- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/components/core/src/clp/ProfilerReporter.hpp b/components/core/src/clp/ProfilerReporter.hpp index ba878a7c7b..f27b5eed5f 100644 --- a/components/core/src/clp/ProfilerReporter.hpp +++ b/components/core/src/clp/ProfilerReporter.hpp @@ -14,6 +14,8 @@ namespace clp { * Usage: * - Define a `ProfilerReporter` at any logical unit that encompasses all operations you want to * profile. A common place is in the `main()` function of an executables. + * - If `ScopedProfiler` or `Profiler` is used in the same scope as `ProfilerReporter`, + * `ProfilerReporter`must be declared first, such that its destructor is called last. * - Once the object is destructured (scope exit), the runtime measurmenets are reported. * * Notes: diff --git a/components/core/src/clp/clg/clg.cpp b/components/core/src/clp/clg/clg.cpp index 604e131eb6..a7a1f90e33 100644 --- a/components/core/src/clp/clg/clg.cpp +++ b/components/core/src/clp/clg/clg.cpp @@ -478,6 +478,7 @@ static void print_result_binary( } int main(int argc, char const* argv[]) { + ProfilerReporter profiler_reporter; PROFILE_SCOPE("clg::main"); // Program-wide initialization @@ -489,7 +490,6 @@ int main(int argc, char const* argv[]) { // NOTE: We can't log an exception if the logger couldn't be constructed return -1; } - ProfilerReporter profiler_reporter; clp::TimestampPattern::init(); CommandLineArguments command_line_args("clg"); diff --git a/components/core/src/clp/clo/clo.cpp b/components/core/src/clp/clo/clo.cpp index 3b07734211..a315275be0 100644 --- a/components/core/src/clp/clo/clo.cpp +++ b/components/core/src/clp/clo/clo.cpp @@ -572,6 +572,8 @@ static bool search_archive( } int main(int argc, char const* argv[]) { + ProfilerReporter profiler_reporter; + // 
Program-wide initialization try { auto stderr_logger = spdlog::stderr_logger_st("stderr"); @@ -581,7 +583,6 @@ int main(int argc, char const* argv[]) { // NOTE: We can't log an exception if the logger couldn't be constructed return -1; } - ProfilerReporter profiler_reporter; clp::TimestampPattern::init(); CommandLineArguments command_line_args("clo"); diff --git a/components/core/tests/test-ProfilerReporter.cpp b/components/core/tests/test-ProfilerReporter.cpp index 7a38101a0b..a9e1c87e03 100644 --- a/components/core/tests/test-ProfilerReporter.cpp +++ b/components/core/tests/test-ProfilerReporter.cpp @@ -18,27 +18,27 @@ TEST_CASE("profiler_reporter_reports_runtime_measurements", "[profiler]") { PROFILE_SCOPE("scope0"); { + ProfilerReporter inner_profiler_reporter; PROFILE_SCOPE("scope1"); std::this_thread::sleep_for(std::chrono::milliseconds(20)); - ProfilerReporter inner_profiler_reporter; } { + ProfilerReporter inner_profiler_reporter; PROFILE_SCOPE("scope2"); std::this_thread::sleep_for(std::chrono::milliseconds(20)); - ProfilerReporter inner_profiler_reporter; } { + ProfilerReporter inner_profiler_reporter; PROFILE_SCOPE("scope3"); std::this_thread::sleep_for(std::chrono::milliseconds(50)); - ProfilerReporter inner_profiler_reporter; } { + ProfilerReporter inner_profiler_reporter; PROFILE_SCOPE("scope3"); std::this_thread::sleep_for(std::chrono::milliseconds(100)); - ProfilerReporter inner_profiler_reporter; } std::this_thread::sleep_for(std::chrono::milliseconds(100)); From 71e780a6cba3d141d9eb270c4e4b3a4a107e44ad Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Thu, 5 Mar 2026 17:25:32 -0500 Subject: [PATCH 142/164] Remove init from runtime measurements. 
--- components/core/src/clp/Profiler.hpp | 10 ++-------- components/core/src/clp/ProfilerReporter.hpp | 2 +- components/core/src/clp/ScopedProfiler.hpp | 1 - 3 files changed, 3 insertions(+), 10 deletions(-) diff --git a/components/core/src/clp/Profiler.hpp b/components/core/src/clp/Profiler.hpp index 0005a65271..e5159518c8 100644 --- a/components/core/src/clp/Profiler.hpp +++ b/components/core/src/clp/Profiler.hpp @@ -93,9 +93,6 @@ class Profiler { static auto start_runtime_measurement(std::string const& name) -> void { if constexpr (PROF_ACTIVE) { - if (false == check_init()) { - return; - } // implicitly creates the timer if it doesn't exist yet m_runtime_measurements[name].start(); } @@ -103,7 +100,7 @@ class Profiler { static auto stop_runtime_measurement(std::string const& name) -> void { if constexpr (PROF_ACTIVE) { - if (false == check_init() || false == check_runtime_timer_exists(name)) { + if (false == check_runtime_timer_exists(name)) { return; } m_runtime_measurements[name].stop(); @@ -118,7 +115,7 @@ class Profiler { static auto get_runtime_measurement_in_seconds(std::string const& name) -> double { if constexpr (PROF_ACTIVE) { - if (false == check_init() || false == check_runtime_timer_exists(name)) { + if (false == check_runtime_timer_exists(name)) { return 0; } return m_runtime_measurements[name].get_time_taken_in_seconds(); @@ -129,9 +126,6 @@ class Profiler { static auto print_all_runtime_measurements() -> void { if constexpr (PROF_ACTIVE) { - if (false == check_init()) { - return; - } for (auto const& [name, stopwatch] : m_runtime_measurements) { SPDLOG_INFO("Measurement {}: {} s", name, get_runtime_measurement_in_seconds(name)); } diff --git a/components/core/src/clp/ProfilerReporter.hpp b/components/core/src/clp/ProfilerReporter.hpp index f27b5eed5f..7e3dfc9d10 100644 --- a/components/core/src/clp/ProfilerReporter.hpp +++ b/components/core/src/clp/ProfilerReporter.hpp @@ -25,7 +25,7 @@ namespace clp { */ class ProfilerReporter { public: - 
ProfilerReporter() { Profiler::init(); } + ProfilerReporter() = default; ~ProfilerReporter() { Profiler::print_all_runtime_measurements(); } ProfilerReporter(const ProfilerReporter&) = delete; diff --git a/components/core/src/clp/ScopedProfiler.hpp b/components/core/src/clp/ScopedProfiler.hpp index 8e41e77c6b..18a6cb9ddf 100644 --- a/components/core/src/clp/ScopedProfiler.hpp +++ b/components/core/src/clp/ScopedProfiler.hpp @@ -25,7 +25,6 @@ namespace clp { class ScopedProfiler { public: ScopedProfiler(std::string const& name) : m_name(name) { - Profiler::init(); Profiler::start_runtime_measurement(name); } From 0ffb467765ca329cd68d620b77b28bc79caf6a06 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Mar 2026 01:07:28 -0500 Subject: [PATCH 143/164] Finish unit-test for ProfilerReporter. --- components/core/src/clp/Profiler.hpp | 8 +-- components/core/src/clp/ProfilerReporter.hpp | 24 ++++++++- components/core/src/clp/Stopwatch.cpp | 2 +- components/core/src/clp/Stopwatch.hpp | 2 +- .../core/tests/test-ProfilerReporter.cpp | 50 ++++++++++++++----- 5 files changed, 64 insertions(+), 22 deletions(-) diff --git a/components/core/src/clp/Profiler.hpp b/components/core/src/clp/Profiler.hpp index e5159518c8..e42a65f30c 100644 --- a/components/core/src/clp/Profiler.hpp +++ b/components/core/src/clp/Profiler.hpp @@ -124,12 +124,8 @@ class Profiler { } } - static auto print_all_runtime_measurements() -> void { - if constexpr (PROF_ACTIVE) { - for (auto const& [name, stopwatch] : m_runtime_measurements) { - SPDLOG_INFO("Measurement {}: {} s", name, get_runtime_measurement_in_seconds(name)); - } - } + static auto get_runtime_measurements() -> std::unordered_map const& { + return m_runtime_measurements; } template diff --git a/components/core/src/clp/ProfilerReporter.hpp b/components/core/src/clp/ProfilerReporter.hpp index 7e3dfc9d10..925027161e 100644 --- a/components/core/src/clp/ProfilerReporter.hpp +++ b/components/core/src/clp/ProfilerReporter.hpp @@ -1,7 +1,13 @@ 
#ifndef CLP_PROFILER_REPORT_HPP #define CLP_PROFILER_REPORT_HPP +#include +#include + +#include + #include "Profiler.hpp" +#include "Stopwatch.hpp" namespace clp { /** @@ -26,12 +32,28 @@ namespace clp { class ProfilerReporter { public: ProfilerReporter() = default; - ~ProfilerReporter() { Profiler::print_all_runtime_measurements(); } + explicit ProfilerReporter(std::unordered_map& sink) : m_sink(&sink) {} + + ~ProfilerReporter() { + if (nullptr != m_sink) { + *m_sink = Profiler::get_runtime_measurements(); + return; + } + + SPDLOG_INFO("---MEASUREMENTS START---"); + for (auto const& [name, stopwatch] : Profiler::get_runtime_measurements()) { + SPDLOG_INFO("{}: {} s", name, Profiler::get_runtime_measurement_in_seconds(name)); + } + SPDLOG_INFO("----MEASUREMENTS END----"); + } ProfilerReporter(const ProfilerReporter&) = delete; ProfilerReporter& operator=(const ProfilerReporter&) = delete; ProfilerReporter(ProfilerReporter&&) = delete; ProfilerReporter& operator=(ProfilerReporter&&) = delete; + +private: + std::unordered_map* m_sink{nullptr}; }; } // namespace clp diff --git a/components/core/src/clp/Stopwatch.cpp b/components/core/src/clp/Stopwatch.cpp index 4c645b2025..1c53834fe5 100644 --- a/components/core/src/clp/Stopwatch.cpp +++ b/components/core/src/clp/Stopwatch.cpp @@ -20,7 +20,7 @@ void Stopwatch::reset() { m_time_taken = std::chrono::steady_clock::duration::zero(); } -double Stopwatch::get_time_taken_in_seconds() { +auto Stopwatch::get_time_taken_in_seconds() const -> double { std::chrono::duration time_taken_in_seconds = m_time_taken; return time_taken_in_seconds.count(); } diff --git a/components/core/src/clp/Stopwatch.hpp b/components/core/src/clp/Stopwatch.hpp index 0b87911ebb..31bc43255e 100644 --- a/components/core/src/clp/Stopwatch.hpp +++ b/components/core/src/clp/Stopwatch.hpp @@ -16,7 +16,7 @@ class Stopwatch { void stop(); void reset(); - double get_time_taken_in_seconds(); + auto get_time_taken_in_seconds() const -> double; private: // 
Variables diff --git a/components/core/tests/test-ProfilerReporter.cpp b/components/core/tests/test-ProfilerReporter.cpp index a9e1c87e03..f1bbcb3033 100644 --- a/components/core/tests/test-ProfilerReporter.cpp +++ b/components/core/tests/test-ProfilerReporter.cpp @@ -1,46 +1,70 @@ #include +#include #include +#include #include #include #include #include +#include using clp::Profiler; using clp::ProfilerReporter; using clp::ScopedProfiler; +using clp::Stopwatch; +using std::string; +using std::unordered_map; + +using Sink = unordered_map; +using ExpectedSink = unordered_map; + +constexpr double cTimerMarginOfError{0.1}; + +auto check_sink(Sink const& actual_sink, ExpectedSink const& expected_sink) { + for (auto const& [name, expected_time] : expected_sink) { + REQUIRE(actual_sink.contains(name)); + auto const time{actual_sink.at(name).get_time_taken_in_seconds()}; + REQUIRE(time >= expected_time); + REQUIRE(time < expected_time + cTimerMarginOfError); + } +} TEST_CASE("profiler_reporter_reports_runtime_measurements", "[profiler]") { + Sink sink0; + Sink sink1; + Sink sink2; + Sink sink3; + Profiler::reset_runtime_measurements(); { - ProfilerReporter outter_profiler_reporter; + ProfilerReporter profiler0(sink0); PROFILE_SCOPE("scope0"); { - ProfilerReporter inner_profiler_reporter; + ProfilerReporter profiler1(sink1); PROFILE_SCOPE("scope1"); - std::this_thread::sleep_for(std::chrono::milliseconds(20)); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); } { - ProfilerReporter inner_profiler_reporter; + ProfilerReporter profiler2(sink2); PROFILE_SCOPE("scope2"); std::this_thread::sleep_for(std::chrono::milliseconds(20)); } { - ProfilerReporter inner_profiler_reporter; - PROFILE_SCOPE("scope3"); + ProfilerReporter profiler3(sink3); + PROFILE_SCOPE("scope2"); std::this_thread::sleep_for(std::chrono::milliseconds(50)); } - { - ProfilerReporter inner_profiler_reporter; - PROFILE_SCOPE("scope3"); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - 
} - std::this_thread::sleep_for(std::chrono::milliseconds(100)); } -} \ No newline at end of file + + check_sink(sink1, {{"scope0",0}, {"scope1",0.01}}); + check_sink(sink2, {{"scope0",0}, {"scope1",0.01}, {"scope2",0.02}}); + check_sink(sink3, {{"scope0",0}, {"scope1",0.01}, {"scope2",0.07}}); + check_sink(sink0, {{"scope0",0.18}, {"scope1",0.01}, {"scope2",0.07}}); +} From 85e92fa52fecfb657c6635dc27f97e7deba858b7 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Mar 2026 01:28:36 -0500 Subject: [PATCH 144/164] Add stats to profiling. --- components/core/src/clp/ProfilerReporter.hpp | 6 +++++- components/core/src/clp/Stopwatch.cpp | 9 +++++++++ components/core/src/clp/Stopwatch.hpp | 8 +++++--- 3 files changed, 19 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp/ProfilerReporter.hpp b/components/core/src/clp/ProfilerReporter.hpp index 925027161e..a61a61a547 100644 --- a/components/core/src/clp/ProfilerReporter.hpp +++ b/components/core/src/clp/ProfilerReporter.hpp @@ -42,7 +42,11 @@ class ProfilerReporter { SPDLOG_INFO("---MEASUREMENTS START---"); for (auto const& [name, stopwatch] : Profiler::get_runtime_measurements()) { - SPDLOG_INFO("{}: {} s", name, Profiler::get_runtime_measurement_in_seconds(name)); + auto total{stopwatch.get_time_taken_in_seconds()}; + auto calls{stopwatch.get_call_count()}; + auto avg{calls > 0 ? 
total/calls : 0.0}; + + SPDLOG_INFO("{}: total {:.3f} s | calls {} | avg {:.3f} s", name, total, calls, avg); } SPDLOG_INFO("----MEASUREMENTS END----"); } diff --git a/components/core/src/clp/Stopwatch.cpp b/components/core/src/clp/Stopwatch.cpp index 1c53834fe5..aaf2cc1bb1 100644 --- a/components/core/src/clp/Stopwatch.cpp +++ b/components/core/src/clp/Stopwatch.cpp @@ -1,5 +1,8 @@ #include "Stopwatch.hpp" +#include +#include + namespace clp { Stopwatch::Stopwatch() { reset(); @@ -14,6 +17,8 @@ void Stopwatch::stop() { auto time_taken = end - m_begin; m_time_taken += time_taken; + + m_call_count++; } void Stopwatch::reset() { @@ -24,4 +29,8 @@ auto Stopwatch::get_time_taken_in_seconds() const -> double { std::chrono::duration time_taken_in_seconds = m_time_taken; return time_taken_in_seconds.count(); } + +auto Stopwatch::get_call_count() const -> uint32_t { + return m_call_count; +} } // namespace clp diff --git a/components/core/src/clp/Stopwatch.hpp b/components/core/src/clp/Stopwatch.hpp index 31bc43255e..4002e02e7f 100644 --- a/components/core/src/clp/Stopwatch.hpp +++ b/components/core/src/clp/Stopwatch.hpp @@ -2,8 +2,7 @@ #define CLP_STOPWATCH_HPP #include -#include -#include +#include namespace clp { class Stopwatch { @@ -16,12 +15,15 @@ class Stopwatch { void stop(); void reset(); - auto get_time_taken_in_seconds() const -> double; + [[nodiscard]] auto get_time_taken_in_seconds() const -> double; + + [[nodiscard]] auto get_call_count() const -> uint32_t; private: // Variables std::chrono::time_point m_begin; std::chrono::duration m_time_taken; + uint32_t m_call_count{0}; }; } // namespace clp From 12554b7198ebb58768546504ae754fb77a188c2d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Mar 2026 01:41:08 -0500 Subject: [PATCH 145/164] Update unit-test with stats. 
--- components/core/src/clp/Profiler.hpp | 11 +++++++++ .../core/tests/test-ProfilerReporter.cpp | 23 +++++++++++-------- components/core/tests/test-ScopedProfiler.cpp | 6 +++++ 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/components/core/src/clp/Profiler.hpp b/components/core/src/clp/Profiler.hpp index e42a65f30c..6df1e65932 100644 --- a/components/core/src/clp/Profiler.hpp +++ b/components/core/src/clp/Profiler.hpp @@ -124,6 +124,17 @@ class Profiler { } } + static auto get_runtime_call_count(std::string const& name) -> uint32_t { + if constexpr (PROF_ACTIVE) { + if (false == check_runtime_timer_exists(name)) { + return 0; + } + return m_runtime_measurements[name].get_call_count(); + } else { + return 0; + } + } + static auto get_runtime_measurements() -> std::unordered_map const& { return m_runtime_measurements; } diff --git a/components/core/tests/test-ProfilerReporter.cpp b/components/core/tests/test-ProfilerReporter.cpp index f1bbcb3033..443898fdde 100644 --- a/components/core/tests/test-ProfilerReporter.cpp +++ b/components/core/tests/test-ProfilerReporter.cpp @@ -18,16 +18,21 @@ using std::string; using std::unordered_map; using Sink = unordered_map; -using ExpectedSink = unordered_map; +using ExpectedSink = unordered_map>; constexpr double cTimerMarginOfError{0.1}; auto check_sink(Sink const& actual_sink, ExpectedSink const& expected_sink) { - for (auto const& [name, expected_time] : expected_sink) { + for (auto const& [name, expected] : expected_sink) { + auto expected_time{expected.first}; + auto expected_calls{expected.second}; + REQUIRE(actual_sink.contains(name)); - auto const time{actual_sink.at(name).get_time_taken_in_seconds()}; - REQUIRE(time >= expected_time); - REQUIRE(time < expected_time + cTimerMarginOfError); + auto const actual_time{actual_sink.at(name).get_time_taken_in_seconds()}; + auto const actual_calls{actual_sink.at(name).get_call_count()}; + REQUIRE(actual_time >= expected_time); + REQUIRE(actual_time < 
expected_time + cTimerMarginOfError); + REQUIRE(actual_calls == expected_calls); } } @@ -63,8 +68,8 @@ TEST_CASE("profiler_reporter_reports_runtime_measurements", "[profiler]") { std::this_thread::sleep_for(std::chrono::milliseconds(100)); } - check_sink(sink1, {{"scope0",0}, {"scope1",0.01}}); - check_sink(sink2, {{"scope0",0}, {"scope1",0.01}, {"scope2",0.02}}); - check_sink(sink3, {{"scope0",0}, {"scope1",0.01}, {"scope2",0.07}}); - check_sink(sink0, {{"scope0",0.18}, {"scope1",0.01}, {"scope2",0.07}}); + check_sink(sink1, {{"scope0",{0,0}}, {"scope1",{0.01,1}}}); + check_sink(sink2, {{"scope0",{0,0}}, {"scope1",{0.01,1}}, {"scope2",{0.02,1}}}); + check_sink(sink3, {{"scope0",{0,0}}, {"scope1",{0.01,1}}, {"scope2",{0.07,2}}}); + check_sink(sink0, {{"scope0",{0.18,1}}, {"scope1",{0.01,1}}, {"scope2",{0.07,2}}}); } diff --git a/components/core/tests/test-ScopedProfiler.cpp b/components/core/tests/test-ScopedProfiler.cpp index f45e4ded52..9c0a07d309 100644 --- a/components/core/tests/test-ScopedProfiler.cpp +++ b/components/core/tests/test-ScopedProfiler.cpp @@ -27,8 +27,10 @@ TEST_CASE("scoped_profiler_starts_and_stops_timer_automatically", "[profiler]") } auto const measured{Profiler::get_runtime_measurement_in_seconds(cName)}; + auto const calls{Profiler::get_runtime_call_count(cName)}; REQUIRE(measured >= 0.05); REQUIRE(measured < 0.14); + REQUIRE(calls == 1); } TEST_CASE("scoped_profiler_accumulates_across_multiple_scopes", "[profiler]") { @@ -48,8 +50,10 @@ TEST_CASE("scoped_profiler_accumulates_across_multiple_scopes", "[profiler]") { } auto const measured{Profiler::get_runtime_measurement_in_seconds(cName)}; + auto const calls{Profiler::get_runtime_call_count(cName)}; REQUIRE(measured >= 0.05); REQUIRE(measured < 0.14); + REQUIRE(calls == 2); } TEST_CASE("scoped_profiler_macro_works", "[profiler]") { @@ -62,6 +66,8 @@ TEST_CASE("scoped_profiler_macro_works", "[profiler]") { } auto const measured{Profiler::get_runtime_measurement_in_seconds(cName)}; + auto 
const calls{Profiler::get_runtime_call_count(cName)}; REQUIRE(measured >= 0.05); REQUIRE(measured < 0.14); + REQUIRE(calls == 1); } From 96fbe97795904903c69c6ad43228a61fe3b6a52d Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Mar 2026 21:49:48 -0500 Subject: [PATCH 146/164] Format. --- components/core/src/clp/Profiler.hpp | 11 ++++------- components/core/src/clp/ProfilerReporter.hpp | 6 +++--- components/core/src/clp/ScopedProfiler.hpp | 7 ++++--- components/core/tests/test-ProfilerReporter.cpp | 8 ++++---- components/core/tests/test-ScopedProfiler.cpp | 1 - 5 files changed, 15 insertions(+), 18 deletions(-) diff --git a/components/core/src/clp/Profiler.hpp b/components/core/src/clp/Profiler.hpp index 6df1e65932..9c89517faf 100644 --- a/components/core/src/clp/Profiler.hpp +++ b/components/core/src/clp/Profiler.hpp @@ -84,7 +84,7 @@ class Profiler { if constexpr (PROF_ACTIVE) { if (false == m_runtime_measurements.contains(name)) { SPDLOG_ERROR("Attempt to get runtime measurment of non existent timer {}", name); - return false;; + return false; } return true; } @@ -141,8 +141,7 @@ class Profiler { template static auto start_compile_time_measurement() -> void { - if constexpr (PROF_ACTIVE && cMeasurementEnabled[enum_to_underlying_type(index)]) - { + if constexpr (PROF_ACTIVE && cMeasurementEnabled[enum_to_underlying_type(index)]) { if (false == check_init()) { return; } @@ -152,8 +151,7 @@ class Profiler { template static auto stop_compile_time_measurement() -> void { - if constexpr (PROF_ACTIVE && cMeasurementEnabled[enum_to_underlying_type(index)]) - { + if constexpr (PROF_ACTIVE && cMeasurementEnabled[enum_to_underlying_type(index)]) { if (false == check_init()) { return; } @@ -163,8 +161,7 @@ class Profiler { template static auto reset_compile_time_measurement() -> void { - if constexpr (PROF_ACTIVE && cMeasurementEnabled[enum_to_underlying_type(index)]) - { + if constexpr (PROF_ACTIVE && cMeasurementEnabled[enum_to_underlying_type(index)]) { if (false 
== check_init()) { return; } diff --git a/components/core/src/clp/ProfilerReporter.hpp b/components/core/src/clp/ProfilerReporter.hpp index a61a61a547..46bcf6f5c8 100644 --- a/components/core/src/clp/ProfilerReporter.hpp +++ b/components/core/src/clp/ProfilerReporter.hpp @@ -44,15 +44,15 @@ class ProfilerReporter { for (auto const& [name, stopwatch] : Profiler::get_runtime_measurements()) { auto total{stopwatch.get_time_taken_in_seconds()}; auto calls{stopwatch.get_call_count()}; - auto avg{calls > 0 ? total/calls : 0.0}; + auto avg{calls > 0 ? total / calls : 0.0}; SPDLOG_INFO("{}: total {:.3f} s | calls {} | avg {:.3f} s", name, total, calls, avg); } SPDLOG_INFO("----MEASUREMENTS END----"); } - ProfilerReporter(const ProfilerReporter&) = delete; - ProfilerReporter& operator=(const ProfilerReporter&) = delete; + ProfilerReporter(ProfilerReporter const&) = delete; + ProfilerReporter& operator=(ProfilerReporter const&) = delete; ProfilerReporter(ProfilerReporter&&) = delete; ProfilerReporter& operator=(ProfilerReporter&&) = delete; diff --git a/components/core/src/clp/ScopedProfiler.hpp b/components/core/src/clp/ScopedProfiler.hpp index 18a6cb9ddf..0d347c7c80 100644 --- a/components/core/src/clp/ScopedProfiler.hpp +++ b/components/core/src/clp/ScopedProfiler.hpp @@ -30,8 +30,8 @@ class ScopedProfiler { ~ScopedProfiler() { Profiler::stop_runtime_measurement(m_name); } - ScopedProfiler(const ScopedProfiler&) = delete; - ScopedProfiler& operator=(const ScopedProfiler&) = delete; + ScopedProfiler(ScopedProfiler const&) = delete; + ScopedProfiler& operator=(ScopedProfiler const&) = delete; ScopedProfiler(ScopedProfiler&&) = delete; ScopedProfiler& operator=(ScopedProfiler&&) = delete; @@ -44,6 +44,7 @@ class ScopedProfiler { #define CLP_CONCAT(x, y) CLP_CONCAT_IMPL(x, y) -#define PROFILE_SCOPE(x) ::clp::ScopedProfiler CLP_CONCAT(__clp_profile_scope_, __LINE__){x} +#define PROFILE_SCOPE(x) \ + ::clp::ScopedProfiler CLP_CONCAT(__clp_profile_scope_, __LINE__){x} #endif // 
CLP_SCOPED_PROFILER_HPP diff --git a/components/core/tests/test-ProfilerReporter.cpp b/components/core/tests/test-ProfilerReporter.cpp index 443898fdde..4ee0c69041 100644 --- a/components/core/tests/test-ProfilerReporter.cpp +++ b/components/core/tests/test-ProfilerReporter.cpp @@ -68,8 +68,8 @@ TEST_CASE("profiler_reporter_reports_runtime_measurements", "[profiler]") { std::this_thread::sleep_for(std::chrono::milliseconds(100)); } - check_sink(sink1, {{"scope0",{0,0}}, {"scope1",{0.01,1}}}); - check_sink(sink2, {{"scope0",{0,0}}, {"scope1",{0.01,1}}, {"scope2",{0.02,1}}}); - check_sink(sink3, {{"scope0",{0,0}}, {"scope1",{0.01,1}}, {"scope2",{0.07,2}}}); - check_sink(sink0, {{"scope0",{0.18,1}}, {"scope1",{0.01,1}}, {"scope2",{0.07,2}}}); + check_sink(sink1, {{"scope0", {0, 0}}, {"scope1", {0.01, 1}}}); + check_sink(sink2, {{"scope0", {0, 0}}, {"scope1", {0.01, 1}}, {"scope2", {0.02, 1}}}); + check_sink(sink3, {{"scope0", {0, 0}}, {"scope1", {0.01, 1}}, {"scope2", {0.07, 2}}}); + check_sink(sink0, {{"scope0", {0.18, 1}}, {"scope1", {0.01, 1}}, {"scope2", {0.07, 2}}}); } diff --git a/components/core/tests/test-ScopedProfiler.cpp b/components/core/tests/test-ScopedProfiler.cpp index 9c0a07d309..7b6cb1ce14 100644 --- a/components/core/tests/test-ScopedProfiler.cpp +++ b/components/core/tests/test-ScopedProfiler.cpp @@ -7,7 +7,6 @@ #include #include -using clp::Profiler; using clp::Profiler; using clp::ScopedProfiler; From e5b37e0d95d8c23c7db8381c1e076c83304ecf61 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Mar 2026 21:55:35 -0500 Subject: [PATCH 147/164] Remove unused using. 
--- components/core/src/clp/clp/FileCompressor.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/components/core/src/clp/clp/FileCompressor.cpp b/components/core/src/clp/clp/FileCompressor.cpp index 41e6e8f1e0..ba27238b92 100644 --- a/components/core/src/clp/clp/FileCompressor.cpp +++ b/components/core/src/clp/clp/FileCompressor.cpp @@ -28,8 +28,6 @@ using clp::ir::four_byte_encoded_variable_t; using clp::ir::has_ir_stream_magic_number; using clp::ir::LogEventDeserializer; using clp::ParsedMessage; -using clp::Profiler; -using clp::ScopedProfiler; using clp::streaming_archive::writer::split_archive; using clp::streaming_archive::writer::split_file; using clp::streaming_archive::writer::split_file_and_archive; From 0131b76d0c6bf2a5da1334c481a4111dac918205 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Fri, 6 Mar 2026 21:56:31 -0500 Subject: [PATCH 148/164] Remove more unused usings. --- components/core/src/clp/clp/run.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/components/core/src/clp/clp/run.cpp b/components/core/src/clp/clp/run.cpp index 62900d2f63..b0f5dac153 100644 --- a/components/core/src/clp/clp/run.cpp +++ b/components/core/src/clp/clp/run.cpp @@ -5,7 +5,6 @@ #include #include -#include "../Profiler.hpp" #include "../ProfilerReporter.hpp" #include "../ScopedProfiler.hpp" #include "../spdlog_with_specializations.hpp" @@ -15,7 +14,6 @@ #include "decompression.hpp" #include "utils.hpp" -using clp::Profiler; using clp::ProfilerReporter; using std::string; using std::unordered_set; From e97089312347b4d9f067526475104867d2a8e678 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 7 Mar 2026 04:47:52 -0500 Subject: [PATCH 149/164] Format. 
--- components/core/src/clp/ProfilerReporter.hpp | 1 + components/core/src/clp/ScopedProfiler.hpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/components/core/src/clp/ProfilerReporter.hpp b/components/core/src/clp/ProfilerReporter.hpp index 46bcf6f5c8..6b257ad420 100644 --- a/components/core/src/clp/ProfilerReporter.hpp +++ b/components/core/src/clp/ProfilerReporter.hpp @@ -32,6 +32,7 @@ namespace clp { class ProfilerReporter { public: ProfilerReporter() = default; + explicit ProfilerReporter(std::unordered_map& sink) : m_sink(&sink) {} ~ProfilerReporter() { diff --git a/components/core/src/clp/ScopedProfiler.hpp b/components/core/src/clp/ScopedProfiler.hpp index 0d347c7c80..d58886f497 100644 --- a/components/core/src/clp/ScopedProfiler.hpp +++ b/components/core/src/clp/ScopedProfiler.hpp @@ -45,6 +45,6 @@ class ScopedProfiler { #define CLP_CONCAT(x, y) CLP_CONCAT_IMPL(x, y) #define PROFILE_SCOPE(x) \ - ::clp::ScopedProfiler CLP_CONCAT(__clp_profile_scope_, __LINE__){x} + ::clp::ScopedProfiler CLP_CONCAT(__clp_profile_scope_, __LINE__)(x) #endif // CLP_SCOPED_PROFILER_HPP From 027838eacb693045a480a6568dcb8da19d82358e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 7 Mar 2026 04:48:44 -0500 Subject: [PATCH 150/164] Fix typo. 
--- components/core/src/clp/Profiler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/Profiler.hpp b/components/core/src/clp/Profiler.hpp index 9c89517faf..7af0d33e09 100644 --- a/components/core/src/clp/Profiler.hpp +++ b/components/core/src/clp/Profiler.hpp @@ -83,7 +83,7 @@ class Profiler { static auto check_runtime_timer_exists(std::string const& name) -> bool { if constexpr (PROF_ACTIVE) { if (false == m_runtime_measurements.contains(name)) { - SPDLOG_ERROR("Attempt to get runtime measurment of non existent timer {}", name); + SPDLOG_ERROR("Attempt to get runtime measurement of non existent timer {}", name); return false; } return true; From f91b94b42e6ed34e6770306f62ad01c722c36ba3 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 7 Mar 2026 04:50:08 -0500 Subject: [PATCH 151/164] Fix header guard. --- components/core/src/clp/ProfilerReporter.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/components/core/src/clp/ProfilerReporter.hpp b/components/core/src/clp/ProfilerReporter.hpp index 6b257ad420..76148bd8c5 100644 --- a/components/core/src/clp/ProfilerReporter.hpp +++ b/components/core/src/clp/ProfilerReporter.hpp @@ -1,5 +1,5 @@ -#ifndef CLP_PROFILER_REPORT_HPP -#define CLP_PROFILER_REPORT_HPP +#ifndef CLP_PROFILER_REPORTER_HPP +#define CLP_PROFILER_REPORTER_HPP #include #include @@ -62,4 +62,4 @@ class ProfilerReporter { }; } // namespace clp -#endif // CLP_PROFILER_REPORT_HPP +#endif // CLP_PROFILER_REPORTER_HPP From c2c2f41b948909d7c1b086dca493d929cb12ba62 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 7 Mar 2026 04:51:28 -0500 Subject: [PATCH 152/164] Fix typo. 
--- components/core/src/clp/ProfilerReporter.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/ProfilerReporter.hpp b/components/core/src/clp/ProfilerReporter.hpp index 76148bd8c5..35d89e445d 100644 --- a/components/core/src/clp/ProfilerReporter.hpp +++ b/components/core/src/clp/ProfilerReporter.hpp @@ -22,12 +22,12 @@ namespace clp { * profile. A common place is in the `main()` function of an executables. * - If `ScopedProfiler` or `Profiler` is used in the same scope as `ProfilerReporter`, * `ProfilerReporter`must be declared first, such that its destructor is called last. - * - Once the object is destructured (scope exit), the runtime measurmenets are reported. + * - Once the object is destructed (scope exit), the runtime measurmenets are reported. * * Notes: * - Only runtime measurements (those tracked by string names) are reported. This class is * primarily designed to work in tandem with the `ScopedProfiler` class. - * - Copy and move operations are removed to prevent accidentaly multiple reporting. + * - Copy and move operations are removed to prevent accidental multiple reporting. */ class ProfilerReporter { public: From a64478bf56dd422735ad21356a71e71cd97cc87f Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 7 Mar 2026 04:54:47 -0500 Subject: [PATCH 153/164] make stopwatch reset call count. 
--- components/core/src/clp/Stopwatch.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/components/core/src/clp/Stopwatch.cpp b/components/core/src/clp/Stopwatch.cpp index aaf2cc1bb1..b97deb71d6 100644 --- a/components/core/src/clp/Stopwatch.cpp +++ b/components/core/src/clp/Stopwatch.cpp @@ -23,6 +23,7 @@ void Stopwatch::stop() { void Stopwatch::reset() { m_time_taken = std::chrono::steady_clock::duration::zero(); + m_call_count = 0; } auto Stopwatch::get_time_taken_in_seconds() const -> double { From d3abf6b2af4557da7cedb1db85fdcb1ab557df5b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 7 Mar 2026 04:56:02 -0500 Subject: [PATCH 154/164] Typo fix. --- components/core/src/clp/Profiler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/Profiler.hpp b/components/core/src/clp/Profiler.hpp index 7af0d33e09..61bd5cef7c 100644 --- a/components/core/src/clp/Profiler.hpp +++ b/components/core/src/clp/Profiler.hpp @@ -21,7 +21,7 @@ namespace clp { * Class to time code. * * A Measurement can be taken over a single continuous operation, or called multiple times to - * accumulate fragemented measurements into a single total run time. + * accumulate fragmented measurements into a single total run time. * * There are two ways to add a measurement: * 1. For measurements that are taken a small number of times use a runtime measurement. From 08f7675ae549bf29ec3c664826b3b4c5eb545c5a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 7 Mar 2026 04:58:37 -0500 Subject: [PATCH 155/164] Document unfreed vector. --- components/core/src/clp/Profiler.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/components/core/src/clp/Profiler.hpp b/components/core/src/clp/Profiler.hpp index 61bd5cef7c..37b9b102bd 100644 --- a/components/core/src/clp/Profiler.hpp +++ b/components/core/src/clp/Profiler.hpp @@ -56,7 +56,8 @@ class Profiler { // Methods /** - * Static initializer for class. 
This must be called before using the class. + * Static initializer for class. This must be called before using the class. This is meant to be + * process lifetime storage, so the vector is never freed. */ static void init() { if constexpr (PROF_ACTIVE) { From a8074dca4c21221f7cb7de0b6d1933f26eabf1d9 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 7 Mar 2026 04:59:23 -0500 Subject: [PATCH 156/164] Remove unused variable. --- components/core/src/clp/clp/FileCompressor.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/components/core/src/clp/clp/FileCompressor.cpp b/components/core/src/clp/clp/FileCompressor.cpp index ba27238b92..631aadf577 100644 --- a/components/core/src/clp/clp/FileCompressor.cpp +++ b/components/core/src/clp/clp/FileCompressor.cpp @@ -125,8 +125,6 @@ bool FileCompressor::compress_file( ) { PROFILE_SCOPE("FileCompressor::compress_file"); - string file_name = std::filesystem::canonical(file_to_compress.get_path()).string(); - BufferedReader buffered_file_reader{make_unique(file_to_compress.get_path())}; // Check that file is UTF-8 encoded From c7afef7e632fdf8957f4bea6e33e2d5d246076f1 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 7 Mar 2026 05:43:31 -0500 Subject: [PATCH 157/164] Remove ScopedProfiling if profiler disabled. 
--- components/core/src/clp/ScopedProfiler.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/ScopedProfiler.hpp b/components/core/src/clp/ScopedProfiler.hpp index d58886f497..2fb7ee55e8 100644 --- a/components/core/src/clp/ScopedProfiler.hpp +++ b/components/core/src/clp/ScopedProfiler.hpp @@ -44,7 +44,10 @@ class ScopedProfiler { #define CLP_CONCAT(x, y) CLP_CONCAT_IMPL(x, y) -#define PROFILE_SCOPE(x) \ - ::clp::ScopedProfiler CLP_CONCAT(__clp_profile_scope_, __LINE__)(x) +#if PROF_ACTIVE +#define PROFILE_SCOPE(x) ::clp::ScopedProfiler CLP_CONCAT(__clp_profile_scope_, __LINE__)(x) +#else +#define PROFILE_SCOPE(x) ((void)0) +#endif #endif // CLP_SCOPED_PROFILER_HPP From 22cf1cc1e57299daeb8b3acc9c49322c551febdf Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 7 Mar 2026 05:45:00 -0500 Subject: [PATCH 158/164] Update doc to specify init is only need for compile-time measurements. --- components/core/src/clp/Profiler.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/Profiler.hpp b/components/core/src/clp/Profiler.hpp index 37b9b102bd..4732b73f11 100644 --- a/components/core/src/clp/Profiler.hpp +++ b/components/core/src/clp/Profiler.hpp @@ -56,8 +56,8 @@ class Profiler { // Methods /** - * Static initializer for class. This must be called before using the class. This is meant to be - * process lifetime storage, so the vector is never freed. + * Static initializer for class. This must be called before using compile-time measurements. + * This is meant to be process lifetime storage, so the vector is never freed. */ static void init() { if constexpr (PROF_ACTIVE) { From 449d3c0696003e8c4c50c3532e8c0732fb2f9f1a Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 7 Mar 2026 05:54:35 -0500 Subject: [PATCH 159/164] Prevent error message spam. 
--- components/core/src/clp/Profiler.hpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/Profiler.hpp b/components/core/src/clp/Profiler.hpp index 4732b73f11..c2d2e35347 100644 --- a/components/core/src/clp/Profiler.hpp +++ b/components/core/src/clp/Profiler.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -74,7 +75,11 @@ class Profiler { static auto check_init() -> bool { if constexpr (PROF_ACTIVE) { if (false == m_initialized) { - SPDLOG_ERROR("Profiler used without calling Profiler::init()"); + static bool s_logged{false}; + if (false == s_logged) { + s_logged = true; + SPDLOG_ERROR("Profiler used without calling Profiler::init()"); + } } return m_initialized; } @@ -84,7 +89,11 @@ class Profiler { static auto check_runtime_timer_exists(std::string const& name) -> bool { if constexpr (PROF_ACTIVE) { if (false == m_runtime_measurements.contains(name)) { - SPDLOG_ERROR("Attempt to get runtime measurement of non existent timer {}", name); + static std::unordered_set s_logged_names; + if (false == s_logged_names.contains(name)) { + s_logged_names.insert(name); + SPDLOG_ERROR("Attempt to get runtime measurement of non existent timer {}", name); + } return false; } return true; From 5ea5af16b0718619a06f76a0ecd1dabf0f9855f0 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 7 Mar 2026 05:56:07 -0500 Subject: [PATCH 160/164] Indent. 
--- components/core/src/clp/ScopedProfiler.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/ScopedProfiler.hpp b/components/core/src/clp/ScopedProfiler.hpp index 2fb7ee55e8..e663531aba 100644 --- a/components/core/src/clp/ScopedProfiler.hpp +++ b/components/core/src/clp/ScopedProfiler.hpp @@ -45,9 +45,9 @@ class ScopedProfiler { #define CLP_CONCAT(x, y) CLP_CONCAT_IMPL(x, y) #if PROF_ACTIVE -#define PROFILE_SCOPE(x) ::clp::ScopedProfiler CLP_CONCAT(__clp_profile_scope_, __LINE__)(x) + #define PROFILE_SCOPE(x) ::clp::ScopedProfiler CLP_CONCAT(__clp_profile_scope_, __LINE__)(x) #else -#define PROFILE_SCOPE(x) ((void)0) + #define PROFILE_SCOPE(x) ((void)0) #endif #endif // CLP_SCOPED_PROFILER_HPP From 25bff02e2f019ed476f4ad0f03384de93c4f5c5b Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 7 Mar 2026 06:03:17 -0500 Subject: [PATCH 161/164] Reword error. --- components/core/src/clp/Profiler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/src/clp/Profiler.hpp b/components/core/src/clp/Profiler.hpp index c2d2e35347..67e0f07e85 100644 --- a/components/core/src/clp/Profiler.hpp +++ b/components/core/src/clp/Profiler.hpp @@ -92,7 +92,7 @@ class Profiler { static std::unordered_set s_logged_names; if (false == s_logged_names.contains(name)) { s_logged_names.insert(name); - SPDLOG_ERROR("Attempt to get runtime measurement of non existent timer {}", name); + SPDLOG_ERROR("Attempt to get non-existent runtime measurement: {}", name); } return false; } From ca945de42695ec71ca145979c7d804aeac9f5759 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Sat, 7 Mar 2026 06:27:51 -0500 Subject: [PATCH 162/164] Update unit-test tags to fix CI. 
--- components/core/tests/test-ProfilerReporter.cpp | 2 +- components/core/tests/test-ScopedProfiler.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/components/core/tests/test-ProfilerReporter.cpp b/components/core/tests/test-ProfilerReporter.cpp index 4ee0c69041..b3d012df07 100644 --- a/components/core/tests/test-ProfilerReporter.cpp +++ b/components/core/tests/test-ProfilerReporter.cpp @@ -36,7 +36,7 @@ auto check_sink(Sink const& actual_sink, ExpectedSink const& expected_sink) { } } -TEST_CASE("profiler_reporter_reports_runtime_measurements", "[profiler]") { +TEST_CASE("profiler_reporter_reports_runtime_measurements", "[ProfilerReporter][Stopwatch]") { Sink sink0; Sink sink1; Sink sink2; diff --git a/components/core/tests/test-ScopedProfiler.cpp b/components/core/tests/test-ScopedProfiler.cpp index 7b6cb1ce14..c528dd8166 100644 --- a/components/core/tests/test-ScopedProfiler.cpp +++ b/components/core/tests/test-ScopedProfiler.cpp @@ -12,11 +12,11 @@ using clp::ScopedProfiler; constexpr auto cName{"ProfileName"}; -TEST_CASE("macro_is_set", "[profiler]") { +TEST_CASE("macro_is_set", "[ScopedProfiler]") { REQUIRE(PROF_ACTIVE == 1); } -TEST_CASE("scoped_profiler_starts_and_stops_timer_automatically", "[profiler]") { +TEST_CASE("scoped_profiler_starts_and_stops_timer_automatically", "[ScopedProfiler][Stopwatch]") { Profiler::init(); Profiler::reset_runtime_measurements(); @@ -32,7 +32,7 @@ TEST_CASE("scoped_profiler_starts_and_stops_timer_automatically", "[profiler]") REQUIRE(calls == 1); } -TEST_CASE("scoped_profiler_accumulates_across_multiple_scopes", "[profiler]") { +TEST_CASE("scoped_profiler_accumulates_across_multiple_scopes", "[ScopedProfiler][Stopwatch]") { Profiler::init(); Profiler::reset_runtime_measurements(); @@ -55,7 +55,7 @@ TEST_CASE("scoped_profiler_accumulates_across_multiple_scopes", "[profiler]") { REQUIRE(calls == 2); } -TEST_CASE("scoped_profiler_macro_works", "[profiler]") { 
+TEST_CASE("scoped_profiler_macro_works", "[ScopedProfiler][Stopwatch]") { Profiler::init(); Profiler::reset_runtime_measurements(); From 5765cfa6e4bbb57d8bc3996172789e5dc8896a83 Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 11 Mar 2026 05:32:53 -0400 Subject: [PATCH 163/164] Allow for disabling scopes in reporting. --- components/core/src/clp/ProfilerReporter.hpp | 45 ++++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/components/core/src/clp/ProfilerReporter.hpp b/components/core/src/clp/ProfilerReporter.hpp index 35d89e445d..c4c5b28112 100644 --- a/components/core/src/clp/ProfilerReporter.hpp +++ b/components/core/src/clp/ProfilerReporter.hpp @@ -3,6 +3,7 @@ #include #include +#include #include @@ -14,15 +15,22 @@ namespace clp { * RAII helper for automatically reporting all runtime measurements at scope exit. * * This class is designed to simplify the reporting of runtime measurements captured using the - * `Profiler` class. By creating an instance of `ProfilerReporter`, all runtime measurements will - * automatically be printed when the object goes out of scope. + * `Profiler` class. + * + * Features: + * - Prints runtime measurements (total time, call count, average time) when the object is + * destructed. This is the default behavior if no sink is provided in the constructor. + * - Can write measurements to a user-provided sink when the object is destructed. + * - The sink will be cleared before reporting. + * - Printing will not occur if writing to a sink. + * - Supports disabling certain scopes so they are neither printed nor written to the sink. * * Usage: * - Define a `ProfilerReporter` at any logical unit that encompasses all operations you want to - * profile. A common place is in the `main()` function of an executables. + * profile. A common place is in the `main()` function of an executable.
* - If `ScopedProfiler` or `Profiler` is used in the same scope as `ProfilerReporter`, - * `ProfilerReporter`must be declared first, such that its destructor is called last. - * - Once the object is destructed (scope exit), the runtime measurmenets are reported. + * `ProfilerReporter` must be declared first, such that its destructor is called last. + * - Once the object is destructed (scope exit), the runtime measurements are reported. * * Notes: * - Only runtime measurements (those tracked by string names) are reported. This class is @@ -31,18 +39,34 @@ namespace clp { */ class ProfilerReporter { public: - ProfilerReporter() = default; + explicit ProfilerReporter(std::unordered_set disabled_scopes = {}) { + set_disabled_scopes(std::move(disabled_scopes)); + } - explicit ProfilerReporter(std::unordered_map& sink) : m_sink(&sink) {} + explicit ProfilerReporter( + std::unordered_map& sink, + std::unordered_set disabled_scopes = {} + ) + : m_sink(&sink) { + set_disabled_scopes(std::move(disabled_scopes)); + } ~ProfilerReporter() { if (nullptr != m_sink) { - *m_sink = Profiler::get_runtime_measurements(); + m_sink->clear(); + for (auto const& [name, stopwatch] : Profiler::get_runtime_measurements()) { + if (false == m_disabled_scopes.contains(name)) { + m_sink->insert({name, stopwatch}); + } + } return; } SPDLOG_INFO("---MEASUREMENTS START---"); for (auto const& [name, stopwatch] : Profiler::get_runtime_measurements()) { + if (m_disabled_scopes.contains(name)) { + continue; + } auto total{stopwatch.get_time_taken_in_seconds()}; auto calls{stopwatch.get_call_count()}; auto avg{calls > 0 ?
total / calls : 0.0}; @@ -58,7 +82,12 @@ class ProfilerReporter { ProfilerReporter& operator=(ProfilerReporter&&) = delete; private: + auto set_disabled_scopes(std::unordered_set disabled_scopes) -> void { + m_disabled_scopes = std::move(disabled_scopes); + } + std::unordered_map* m_sink{nullptr}; + std::unordered_set m_disabled_scopes; }; } // namespace clp From 2b700ad0f1c759e686e6de07f846ceb6ef13262e Mon Sep 17 00:00:00 2001 From: SharafMohamed Date: Wed, 11 Mar 2026 05:40:06 -0400 Subject: [PATCH 164/164] Update unit-test. --- .../core/tests/test-ProfilerReporter.cpp | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/components/core/tests/test-ProfilerReporter.cpp b/components/core/tests/test-ProfilerReporter.cpp index b3d012df07..c8735d9fd3 100644 --- a/components/core/tests/test-ProfilerReporter.cpp +++ b/components/core/tests/test-ProfilerReporter.cpp @@ -73,3 +73,24 @@ TEST_CASE("profiler_reporter_reports_runtime_measurements", "[ProfilerReporter][ check_sink(sink3, {{"scope0", {0, 0}}, {"scope1", {0.01, 1}}, {"scope2", {0.07, 2}}}); check_sink(sink0, {{"scope0", {0.18, 1}}, {"scope1", {0.01, 1}}, {"scope2", {0.07, 2}}}); } + +TEST_CASE("profiler_reporter_respects_disable_scopes", "[ProfilerReporter][Stopwatch]") { + Sink sink; + + Profiler::reset_runtime_measurements(); + { + ProfilerReporter profiler(sink, {"scope1"}); + + PROFILE_SCOPE("scope0"); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + PROFILE_SCOPE("scope1"); + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + + PROFILE_SCOPE("scope2"); + std::this_thread::sleep_for(std::chrono::milliseconds(30)); + } + + REQUIRE_FALSE(sink.contains("scope1")); + check_sink(sink, {{"scope0", {0.01, 1}}, {"scope2", {0.03, 1}}}); +}