From 4931f6b17cf863e7cfdf59e38df28d48dc92fed2 Mon Sep 17 00:00:00 2001 From: davidlion Date: Thu, 28 Aug 2025 11:20:07 -0400 Subject: [PATCH 01/14] feat(log-surgeon)!: Add support for a single capture group in a schema rule to have parity with the heuristic parser. --- components/core/src/clp/Utils.cpp | 11 +++ .../clp/streaming_archive/writer/Archive.cpp | 92 ++++++++++++++----- .../core/tests/test-ParserWithUserSchema.cpp | 23 +++++ .../multiple_capture_groups.txt | 1 + .../single_capture_group.txt | 1 + 5 files changed, 105 insertions(+), 23 deletions(-) create mode 100644 components/core/tests/test_schema_files/multiple_capture_groups.txt create mode 100644 components/core/tests/test_schema_files/single_capture_group.txt diff --git a/components/core/src/clp/Utils.cpp b/components/core/src/clp/Utils.cpp index f52cb87cdf..8fcf17ff26 100644 --- a/components/core/src/clp/Utils.cpp +++ b/components/core/src/clp/Utils.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -188,6 +189,16 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B for (std::unique_ptr const& parser_ast : schema_ast->m_schema_vars) { auto* rule = dynamic_cast(parser_ast.get()); + // Currently, we only support at most a single capture group in each variable. If a capture + // group is present its match will be treated as the variable rather than the full match. + if (1 < rule->m_regex_ptr->get_subtree_positive_captures().size()) { + throw std::runtime_error( + schema_file_path + ":" + std::to_string(rule->m_line_num + 1) + + ": error: the schema rule '" + rule->m_name + + "' has a regex pattern containing > 1 capture group.\n" + ); + } + if ("timestamp" == rule->m_name) { continue; } diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index a5fe8a29f9..b1edd0c9ce 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -2,14 +2,17 @@ #include +#include #include -#include -#include +#include +#include +#include #include #include #include #include +#include #include #include #include @@ -23,11 +26,7 @@ using clp::ir::eight_byte_encoded_variable_t; using clp::ir::four_byte_encoded_variable_t; -using log_surgeon::LogEventView; -using std::list; -using std::make_unique; using std::string; -using std::unordered_set; using std::vector; namespace clp::streaming_archive::writer { @@ -315,13 +314,13 @@ Archive::write_msg(epochtime_t timestamp, string const& message, size_t num_unco update_segment_indices(logtype_id, var_ids); } -void Archive::write_msg_using_schema(LogEventView const& log_view) { +void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) { epochtime_t timestamp = 0; TimestampPattern* timestamp_pattern = nullptr; auto const& log_output_buffer = log_view.get_log_output_buffer(); if (log_output_buffer->has_timestamp()) { - size_t start; - size_t end; + size_t start{}; + size_t end{}; timestamp_pattern = (TimestampPattern*)TimestampPattern::search_known_ts_patterns( log_output_buffer->get_mutable_token(0).to_string(), timestamp, @@ -360,7 +359,7 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) { if (timestamp_pattern == nullptr) { start_pos = log_output_buffer->get_token(1).m_start_pos; } - uint32_t end_pos = log_output_buffer->get_token(log_output_buffer->pos() - 1).m_end_pos; + uint32_t const end_pos = log_output_buffer->get_token(log_output_buffer->pos() - 1).m_end_pos; if (start_pos <= end_pos) { num_uncompressed_bytes = end_pos - start_pos; } else { @@ -369,7 +368,7 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) { } for (uint32_t i = 1; i < log_output_buffer->pos(); i++) { log_surgeon::Token& token = log_output_buffer->get_mutable_token(i); - int token_type = token.m_type_ids_ptr->at(0); + auto const token_type{token.m_type_ids_ptr->at(0)}; if (log_output_buffer->has_delimiters() && (timestamp_pattern != nullptr || i > 1) && token_type != static_cast(log_surgeon::SymbolId::TokenUncaughtString) && token_type != static_cast(log_surgeon::SymbolId::TokenNewline)) @@ -388,13 +387,13 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) { break; } case static_cast(log_surgeon::SymbolId::TokenInt): { - encoded_variable_t encoded_var; + encoded_variable_t encoded_var{}; if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var( token.to_string(), encoded_var )) { - variable_dictionary_id_t id; + variable_dictionary_id_t id{}; m_var_dict.add_entry(token.to_string(), id); encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id); m_logtype_dict_entry.add_dictionary_var(); @@ -405,13 +404,13 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) { break; } case static_cast(log_surgeon::SymbolId::TokenFloat): { - encoded_variable_t encoded_var; + encoded_variable_t encoded_var{}; if (!EncodedVariableInterpreter::convert_string_to_representable_float_var( token.to_string(), encoded_var )) { - variable_dictionary_id_t id; + variable_dictionary_id_t id{}; m_var_dict.add_entry(token.to_string(), id); encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id); m_logtype_dict_entry.add_dictionary_var(); @@ -422,21 +421,68 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) { break; } default: { - // Variable string looks like a dictionary variable, so encode it as so - encoded_variable_t encoded_var; - variable_dictionary_id_t id; - m_var_dict.add_entry(token.to_string(), id); - encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id); - m_var_ids.push_back(id); + auto const& lexer{log_view.get_log_parser().m_lexer}; + auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)}; + + // If the variable token contains capture groups, we break the token up by storing + // each capture as a variable and any substrings surrounding the capture as part of + // the logtype. If there are no capture groups the entire variable token is stored + // as a variable. + if (false == capture_ids.has_value()) { + variable_dictionary_id_t id{}; + m_var_dict.add_entry(token.to_string(), id); + m_var_ids.push_back(id); + m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id)); + m_logtype_dict_entry.add_dictionary_var(); + + break; + } + auto const register_ids{lexer.get_reg_ids_from_capture_id(capture_ids.value().at(0))}; + if (false == register_ids.has_value()) { + throw(std::runtime_error( + "No register IDs found for variable's capture group. Full token: " + + token.to_string() + )); + } + auto const [start_reg_id, end_reg_id]{register_ids.value()}; + auto const capture_start{token.get_reversed_reg_positions(start_reg_id).front()}; + auto const capture_end{token.get_reversed_reg_positions(end_reg_id).back()}; + auto const token_view{token.to_string_view()}; + size_t token_pos{0}; + + auto const before_capture{token_view.substr(token_pos, capture_start)}; + m_logtype_dict_entry + .add_constant(before_capture, 0, before_capture.length()); + token_pos += before_capture.length(); + + // If a capture has repetition we store all instances as a single variable. + auto const capture_len = [&]() -> size_t { + if (capture_start <= capture_end) { + return capture_end - capture_start; + } + return token_view.length() - capture_start + capture_end; + }(); + auto const capture{token_view.substr(token_pos, capture_len)}; + + variable_dictionary_id_t id{}; + m_var_dict.add_entry(capture, id); + m_var_ids.push_back(id); + m_encoded_vars.push_back( + EncodedVariableInterpreter::encode_var_dict_id(id) + ); m_logtype_dict_entry.add_dictionary_var(); - m_encoded_vars.push_back(encoded_var); + token_pos += capture.length(); + + m_logtype_dict_entry + .add_constant(token_view, token_pos, token_view.length() - token_pos); + break; } } } if (!m_logtype_dict_entry.get_value().empty()) { - logtype_dictionary_id_t logtype_id; + logtype_dictionary_id_t logtype_id{}; m_logtype_dict.add_entry(m_logtype_dict_entry, logtype_id); m_file->write_encoded_msg( timestamp, diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp index 61ca7ffdbe..9cc99efb1f 100644 --- a/components/core/tests/test-ParserWithUserSchema.cpp +++ b/components/core/tests/test-ParserWithUserSchema.cpp @@ -179,3 +179,26 @@ TEST_CASE("Test lexer", "[Search]") { token = opt_token.value(); } } + +TEST_CASE("Test schema with single capture group", "[load_lexer]") { + std::string const schema_path{"../tests/test_schema_files/single_capture_group.txt"}; + ByteLexer lexer; + load_lexer_from_file(schema_path, lexer); + + auto const rule_id{lexer.m_symbol_id.at("capture")}; + auto const capture_ids{lexer.get_capture_ids_from_rule_id(rule_id)}; + REQUIRE(capture_ids.has_value()); + REQUIRE(1 == capture_ids.value().size()); + REQUIRE("group" == lexer.m_id_symbol.at(capture_ids->at(0))); +} + +TEST_CASE("Test error for schema rule with multiple capture groups", "[load_lexer]") { + std::string const schema_path{"../tests/test_schema_files/multiple_capture_groups.txt"}; + ByteLexer lexer; + REQUIRE_THROWS_WITH( + load_lexer_from_file(schema_path, lexer), + schema_path + + ":1: error: the schema rule 'multicapture' has a regex pattern containing > " + "1 capture group.\n" + ); +} diff --git a/components/core/tests/test_schema_files/multiple_capture_groups.txt b/components/core/tests/test_schema_files/multiple_capture_groups.txt new file mode 100644 index 0000000000..e55c56a9be --- /dev/null +++ b/components/core/tests/test_schema_files/multiple_capture_groups.txt @@ -0,0 +1 @@ +multicapture:text(?var0)text(?var1)text diff --git a/components/core/tests/test_schema_files/single_capture_group.txt b/components/core/tests/test_schema_files/single_capture_group.txt new file mode 100644 index 0000000000..6984421a10 --- /dev/null +++ b/components/core/tests/test_schema_files/single_capture_group.txt @@ -0,0 +1 @@ +capture:text(?var)text From 61d5715f41dc26fee19c1c060804de922a12a20f Mon Sep 17 00:00:00 2001 From: davidlion Date: Thu, 28 Aug 2025 11:43:28 -0400 Subject: [PATCH 02/14] Add schema equals rule capture. --- components/core/config/schemas.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/core/config/schemas.txt b/components/core/config/schemas.txt index e0b777859d..9c09fb5d45 100644 --- a/components/core/config/schemas.txt +++ b/components/core/config/schemas.txt @@ -16,4 +16,4 @@ float:\-{0,1}[0-9]+\.[0-9]+ // Dictionary variables hex:[a-fA-F]+ hasNumber:.*\d.* -equals:.*=.*[a-zA-Z0-9].* +equals:.*=(?.*[a-zA-Z0-9].*) From 20b26f9956fe37a30309583c3b77755ee4ec5a75 Mon Sep 17 00:00:00 2001 From: davidlion Date: Thu, 18 Sep 2025 13:22:10 -0400 Subject: [PATCH 03/14] Address some review comments. --- .../clp/streaming_archive/writer/Archive.cpp | 27 +++++++++---------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index b1edd0c9ce..f14c6df87a 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -421,13 +421,13 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) break; } default: { - auto const& lexer{log_view.get_log_parser().m_lexer}; - auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)}; - + // If there are no capture groups the entire variable token is stored as a variable. // If the variable token contains capture groups, we break the token up by storing // each capture as a variable and any substrings surrounding the capture as part of - // the logtype. If there are no capture groups the entire variable token is stored - // as a variable. + // the logtype. + + auto const& lexer{log_view.get_log_parser().m_lexer}; + auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)}; if (false == capture_ids.has_value()) { variable_dictionary_id_t id{}; m_var_dict.add_entry(token.to_string(), id); @@ -438,7 +438,9 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) break; } - auto const register_ids{lexer.get_reg_ids_from_capture_id(capture_ids.value().at(0))}; + auto const register_ids{ + lexer.get_reg_ids_from_capture_id(capture_ids.value().at(0)) + }; if (false == register_ids.has_value()) { throw(std::runtime_error( "No register IDs found for variable's capture group. Full token: " @@ -446,14 +448,13 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) )); } auto const [start_reg_id, end_reg_id]{register_ids.value()}; - auto const capture_start{token.get_reversed_reg_positions(start_reg_id).front()}; - auto const capture_end{token.get_reversed_reg_positions(end_reg_id).back()}; + auto const capture_start{token.get_reversed_reg_positions(start_reg_id).back()}; + auto const capture_end{token.get_reversed_reg_positions(end_reg_id).front()}; auto const token_view{token.to_string_view()}; size_t token_pos{0}; auto const before_capture{token_view.substr(token_pos, capture_start)}; - m_logtype_dict_entry - .add_constant(before_capture, 0, before_capture.length()); + m_logtype_dict_entry.add_constant(before_capture, 0, before_capture.length()); token_pos += before_capture.length(); // If a capture has repetition we store all instances as a single variable. @@ -468,9 +469,7 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) variable_dictionary_id_t id{}; m_var_dict.add_entry(capture, id); m_var_ids.push_back(id); - m_encoded_vars.push_back( - EncodedVariableInterpreter::encode_var_dict_id(id) - ); + m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id)); m_logtype_dict_entry.add_dictionary_var(); token_pos += capture.length(); @@ -481,7 +480,7 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) } } } - if (!m_logtype_dict_entry.get_value().empty()) { + if (false == m_logtype_dict_entry.get_value().empty()) { logtype_dictionary_id_t logtype_id{}; m_logtype_dict.add_entry(m_logtype_dict_entry, logtype_id); m_file->write_encoded_msg( From b8089f107617b2627bc272d5f4672119c3890d1f Mon Sep 17 00:00:00 2001 From: davidlion Date: Thu, 18 Sep 2025 13:22:54 -0400 Subject: [PATCH 04/14] Tweak tests. --- components/core/src/clp/Utils.cpp | 10 ++++++---- components/core/tests/test-ParserWithUserSchema.cpp | 6 +++--- .../test_schema_files/multiple_capture_groups.txt | 2 ++ .../tests/test_schema_files/single_capture_group.txt | 2 ++ 4 files changed, 13 insertions(+), 7 deletions(-) diff --git a/components/core/src/clp/Utils.cpp b/components/core/src/clp/Utils.cpp index 8fcf17ff26..8f22bda569 100644 --- a/components/core/src/clp/Utils.cpp +++ b/components/core/src/clp/Utils.cpp @@ -191,13 +191,15 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B // Currently, we only support at most a single capture group in each variable. If a capture // group is present its match will be treated as the variable rather than the full match. - if (1 < rule->m_regex_ptr->get_subtree_positive_captures().size()) { + auto const num_captures = rule->m_regex_ptr->get_subtree_positive_captures().size(); + if (1 < num_captures) { throw std::runtime_error( schema_file_path + ":" + std::to_string(rule->m_line_num + 1) + ": error: the schema rule '" + rule->m_name - + "' has a regex pattern containing > 1 capture group.\n" - ); - } + + "' has a regex pattern containing > 1 capture groups (found " + + std::to_string(num_captures) + ").\n" + ); + } if ("timestamp" == rule->m_name) { continue; diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp index 9cc99efb1f..d2f5adfd74 100644 --- a/components/core/tests/test-ParserWithUserSchema.cpp +++ b/components/core/tests/test-ParserWithUserSchema.cpp @@ -188,7 +188,7 @@ TEST_CASE("Test schema with single capture group", "[load_lexer]") { auto const rule_id{lexer.m_symbol_id.at("capture")}; auto const capture_ids{lexer.get_capture_ids_from_rule_id(rule_id)}; REQUIRE(capture_ids.has_value()); - REQUIRE(1 == capture_ids.value().size()); + REQUIRE(1 == capture_ids->size()); REQUIRE("group" == lexer.m_id_symbol.at(capture_ids->at(0))); } @@ -198,7 +198,7 @@ TEST_CASE("Test error for schema rule with multiple capture groups", "[load_lexe REQUIRE_THROWS_WITH( load_lexer_from_file(schema_path, lexer), schema_path - + ":1: error: the schema rule 'multicapture' has a regex pattern containing > " - "1 capture group.\n" + + ":3: error: the schema rule 'multicapture' has a regex pattern containing > " + "1 capture groups (found 2).\n" ); } diff --git a/components/core/tests/test_schema_files/multiple_capture_groups.txt b/components/core/tests/test_schema_files/multiple_capture_groups.txt index e55c56a9be..09998bc19d 100644 --- a/components/core/tests/test_schema_files/multiple_capture_groups.txt +++ b/components/core/tests/test_schema_files/multiple_capture_groups.txt @@ -1 +1,3 @@ +delimiters: \r\n + multicapture:text(?var0)text(?var1)text diff --git a/components/core/tests/test_schema_files/single_capture_group.txt b/components/core/tests/test_schema_files/single_capture_group.txt index 6984421a10..7c4bf17688 100644 --- a/components/core/tests/test_schema_files/single_capture_group.txt +++ b/components/core/tests/test_schema_files/single_capture_group.txt @@ -1 +1,3 @@ +delimiters: \r\n + capture:text(?var)text From 98d10885ab8bf3fa92cde98473fa0ef1484013e6 Mon Sep 17 00:00:00 2001 From: davidlion Date: Fri, 19 Sep 2025 10:01:31 -0400 Subject: [PATCH 05/14] Fix wrapped token logic. --- .../clp/streaming_archive/writer/Archive.cpp | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index f14c6df87a..041b41157d 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -424,7 +424,8 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) // If there are no capture groups the entire variable token is stored as a variable. // If the variable token contains capture groups, we break the token up by storing // each capture as a variable and any substrings surrounding the capture as part of - // the logtype. + // the logtype. If a capture has repetition we store all instances as a single + // variable. auto const& lexer{log_view.get_log_parser().m_lexer}; auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)}; @@ -450,31 +451,32 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) auto const [start_reg_id, end_reg_id]{register_ids.value()}; auto const capture_start{token.get_reversed_reg_positions(start_reg_id).back()}; auto const capture_end{token.get_reversed_reg_positions(end_reg_id).front()}; - auto const token_view{token.to_string_view()}; - size_t token_pos{0}; - - auto const before_capture{token_view.substr(token_pos, capture_start)}; - m_logtype_dict_entry.add_constant(before_capture, 0, before_capture.length()); - token_pos += before_capture.length(); - // If a capture has repetition we store all instances as a single variable. - auto const capture_len = [&]() -> size_t { - if (capture_start <= capture_end) { - return capture_end - capture_start; + auto const get_wrapped_size = [&](size_t start, size_t end) -> size_t { + if (start <= end) { + return end - start; } - return token_view.length() - capture_start + capture_end; - }(); - auto const capture{token_view.substr(token_pos, capture_len)}; + return token.m_buffer_size - start + end; + }; + + auto const token_view{token.to_string_view()}; + auto const before_capture_size{get_wrapped_size(token.m_start_pos, capture_start)}; + m_logtype_dict_entry.add_constant(token_view, 0, before_capture_size); + auto const capture_size{get_wrapped_size(capture_start, capture_end)}; + auto const capture{token_view.substr(before_capture_size, capture_size)}; variable_dictionary_id_t id{}; m_var_dict.add_entry(capture, id); m_var_ids.push_back(id); m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id)); m_logtype_dict_entry.add_dictionary_var(); - token_pos += capture.length(); - m_logtype_dict_entry - .add_constant(token_view, token_pos, token_view.length() - token_pos); + auto const capture_end_pos{before_capture_size + capture_size}; + m_logtype_dict_entry.add_constant( + token_view, + capture_end_pos, + token_view.length() - capture_end_pos + ); break; } From dec9f6e97d9319b6ade25ee8ebcafb6cb96685e2 Mon Sep 17 00:00:00 2001 From: davidlion Date: Fri, 19 Sep 2025 10:28:46 -0400 Subject: [PATCH 06/14] Fix formatting. --- components/core/src/clp/Utils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/Utils.cpp b/components/core/src/clp/Utils.cpp index 8f22bda569..f8449bc4ca 100644 --- a/components/core/src/clp/Utils.cpp +++ b/components/core/src/clp/Utils.cpp @@ -198,8 +198,8 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B + ": error: the schema rule '" + rule->m_name + "' has a regex pattern containing > 1 capture groups (found " + std::to_string(num_captures) + ").\n" - ); - } + ); + } if ("timestamp" == rule->m_name) { continue; From 03a1f029a7808bb2273684dce38e8d817cfa0c46 Mon Sep 17 00:00:00 2001 From: davidlion Date: Fri, 19 Sep 2025 10:29:13 -0400 Subject: [PATCH 07/14] Remove wrapping logic by manipulating the token. --- .../clp/streaming_archive/writer/Archive.cpp | 32 ++++++++----------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index 041b41157d..7d125847ea 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -451,32 +451,26 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) auto const [start_reg_id, end_reg_id]{register_ids.value()}; auto const capture_start{token.get_reversed_reg_positions(start_reg_id).back()}; auto const capture_end{token.get_reversed_reg_positions(end_reg_id).front()}; + auto token_view{token}; + auto const token_end_pos{token_view.m_end_pos}; - auto const get_wrapped_size = [&](size_t start, size_t end) -> size_t { - if (start <= end) { - return end - start; - } - return token.m_buffer_size - start + end; - }; - - auto const token_view{token.to_string_view()}; - auto const before_capture_size{get_wrapped_size(token.m_start_pos, capture_start)}; - m_logtype_dict_entry.add_constant(token_view, 0, before_capture_size); + token_view.m_end_pos = capture_start; + auto const text_before_capture{token_view.to_string_view()}; + m_logtype_dict_entry + .add_constant(text_before_capture, 0, text_before_capture.size()); - auto const capture_size{get_wrapped_size(capture_start, capture_end)}; - auto const capture{token_view.substr(before_capture_size, capture_size)}; + token_view.m_start_pos = capture_start; + token_view.m_end_pos = capture_end; variable_dictionary_id_t id{}; - m_var_dict.add_entry(capture, id); + m_var_dict.add_entry(token_view.to_string_view(), id); m_var_ids.push_back(id); m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id)); m_logtype_dict_entry.add_dictionary_var(); - auto const capture_end_pos{before_capture_size + capture_size}; - m_logtype_dict_entry.add_constant( - token_view, - capture_end_pos, - token_view.length() - capture_end_pos - ); + token_view.m_start_pos = capture_end; + token_view.m_end_pos = token_end_pos; + auto const text_after_capture{token_view.to_string_view()}; + m_logtype_dict_entry.add_constant(text_after_capture, 0, text_after_capture.size()); break; } From 65541fbe09a1078418a0d70deb6301d12d416f3e Mon Sep 17 00:00:00 2001 From: davidlion Date: Tue, 18 Nov 2025 21:41:07 -0500 Subject: [PATCH 08/14] Update log-surgeon. --- taskfiles/deps/main.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taskfiles/deps/main.yaml b/taskfiles/deps/main.yaml index 927cedb2a1..a5a4785b26 100644 --- a/taskfiles/deps/main.yaml +++ b/taskfiles/deps/main.yaml @@ -367,8 +367,8 @@ tasks: - "-DCMAKE_INSTALL_MESSAGE=LAZY" - "-Dlog_surgeon_BUILD_TESTING=OFF" LIB_NAME: "log_surgeon" - TARBALL_SHA256: "4551ea50cd22e8423770fd66a167e1c86053b1f4957f72c582a2da93e7820210" - TARBALL_URL: "https://github.com/y-scope/log-surgeon/archive/840f262.tar.gz" + TARBALL_SHA256: "396ef8822e687b0bb8ca433a360e9e1d39a51620df89cac606db5e0f49d1b265" + TARBALL_URL: "https://github.com/y-scope/log-surgeon/archive/1135c2e.tar.gz" lz4: internal: true From 56bb7a6526d29e10782d27af8379787e47936dc4 Mon Sep 17 00:00:00 2001 From: davidlion Date: Tue, 18 Nov 2025 22:42:07 -0500 Subject: [PATCH 09/14] Update GrepCore. --- components/core/src/clp/GrepCore.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp index 1a4bf499e2..b8abf5a980 100644 --- a/components/core/src/clp/GrepCore.cpp +++ b/components/core/src/clp/GrepCore.cpp @@ -257,9 +257,9 @@ bool GrepCore::get_bounds_of_next_potential_var( return false; } search_token = SearchToken{token.value()}; - search_token.m_type_ids_set.insert(search_token.m_type_ids_ptr->at(0)); + search_token.m_type_ids_set.insert(search_token.get_type_ids()->at(0)); } - auto const& type = search_token.m_type_ids_ptr->at(0); + auto const& type = search_token.get_type_ids()->at(0); if (type != static_cast(log_surgeon::SymbolId::TokenUncaughtString) && type != static_cast(log_surgeon::SymbolId::TokenEnd)) { From 610e76f218201262773d8f2e12ac2b6703517cb1 Mon Sep 17 00:00:00 2001 From: davidlion Date: Wed, 19 Nov 2025 23:03:12 -0500 Subject: [PATCH 10/14] Update log surgeon code + tests with new version. --- .../clp/streaming_archive/writer/Archive.cpp | 109 +++++++++--------- .../core/tests/test-ParserWithUserSchema.cpp | 15 +-- .../tests/test_schema_files/search_schema.txt | 2 +- 3 files changed, 65 insertions(+), 61 deletions(-) diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index 9934683f5f..ce71ee7a91 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -22,6 +22,7 @@ #include "../../spdlog_with_specializations.hpp" #include "../../Utils.hpp" #include "../Constants.hpp" +#include "TimestampPattern.hpp" #include "utils.hpp" using clp::ir::eight_byte_encoded_variable_t; @@ -315,14 +316,14 @@ Archive::write_msg(epochtime_t timestamp, string const& message, size_t num_unco } void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) { - epochtime_t timestamp = 0; - TimestampPattern* timestamp_pattern = nullptr; - auto const& log_output_buffer = log_view.get_log_output_buffer(); - if (log_output_buffer->has_timestamp()) { + epochtime_t timestamp{0}; + TimestampPattern const* timestamp_pattern{nullptr}; + auto const& log_buf = log_view.get_log_output_buffer(); + if (log_buf->has_timestamp()) { size_t start{}; size_t end{}; - timestamp_pattern = (TimestampPattern*)TimestampPattern::search_known_ts_patterns( - log_output_buffer->get_mutable_token(0).to_string(), + timestamp_pattern = TimestampPattern::search_known_ts_patterns( + log_buf->get_mutable_token(0).to_string(), timestamp, start, end @@ -330,14 +331,16 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) if (nullptr == timestamp_pattern) { throw(std::runtime_error( "Schema contains a timestamp regex that matches " - + log_output_buffer->get_mutable_token(0).to_string() + + log_buf->get_mutable_token(0).to_string() + " which does not match any known timestamp pattern." )); } if (m_old_ts_pattern != timestamp_pattern) { change_ts_pattern(timestamp_pattern); - m_old_ts_pattern = timestamp_pattern; + m_old_ts_pattern = const_cast(timestamp_pattern); } + } else { + timestamp_pattern = nullptr; } if (get_data_size_of_dictionaries() >= m_target_data_size_of_dicts) { split_file_and_archive( @@ -353,48 +356,45 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) m_encoded_vars.clear(); m_var_ids.clear(); m_logtype_dict_entry.clear(); - size_t num_uncompressed_bytes = 0; + + size_t num_uncompressed_bytes{0}; // Timestamp is included in the uncompressed message size - uint32_t start_pos = log_output_buffer->get_token(0).m_start_pos; + auto start_pos{log_buf->get_token(0).get_start_pos()}; if (timestamp_pattern == nullptr) { - start_pos = log_output_buffer->get_token(1).m_start_pos; + start_pos = log_buf->get_token(1).get_start_pos(); } - uint32_t const end_pos = log_output_buffer->get_token(log_output_buffer->pos() - 1).m_end_pos; + auto const end_pos{log_buf->get_token(log_buf->pos() - 1).get_end_pos()}; if (start_pos <= end_pos) { num_uncompressed_bytes = end_pos - start_pos; } else { - num_uncompressed_bytes - = log_output_buffer->get_token(0).m_buffer_size - start_pos + end_pos; + num_uncompressed_bytes = log_buf->get_token(0).get_buffer_size() - start_pos + end_pos; } - for (uint32_t i = 1; i < log_output_buffer->pos(); i++) { - log_surgeon::Token& token = log_output_buffer->get_mutable_token(i); - auto const token_type{token.m_type_ids_ptr->at(0)}; - if (log_output_buffer->has_delimiters() && (timestamp_pattern != nullptr || i > 1) + for (auto token_idx{1}; token_idx < log_buf->pos(); token_idx++) { + auto token_view{log_buf->get_token(token_idx)}; + auto const token_type{token_view.get_type_ids()->at(0)}; + if (log_buf->has_delimiters() && (timestamp_pattern != nullptr || token_idx > 1) && token_type != static_cast(log_surgeon::SymbolId::TokenUncaughtString) && token_type != static_cast(log_surgeon::SymbolId::TokenNewline)) { - m_logtype_dict_entry.add_constant(token.get_delimiter(), 0, 1); - if (token.m_start_pos == token.m_buffer_size - 1) { - token.m_start_pos = 0; - } else { - token.m_start_pos++; - } + m_logtype_dict_entry.add_constant(token_view.get_delimiter(), 0, 1); + token_view.increment_start_pos(); } switch (token_type) { case static_cast(log_surgeon::SymbolId::TokenNewline): case static_cast(log_surgeon::SymbolId::TokenUncaughtString): { - m_logtype_dict_entry.add_constant(token.to_string(), 0, token.get_length()); + m_logtype_dict_entry + .add_constant(token_view.to_string(), 0, token_view.get_length()); break; } case static_cast(log_surgeon::SymbolId::TokenInt): { encoded_variable_t encoded_var{}; if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var( - token.to_string(), + token_view.to_string(), encoded_var )) { variable_dictionary_id_t id{}; - m_var_dict.add_entry(token.to_string(), id); + m_var_dict.add_entry(token_view.to_string(), id); encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id); m_logtype_dict_entry.add_dictionary_var(); } else { @@ -406,12 +406,12 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) case static_cast(log_surgeon::SymbolId::TokenFloat): { encoded_variable_t encoded_var{}; if (!EncodedVariableInterpreter::convert_string_to_representable_float_var( - token.to_string(), + token_view.to_string(), encoded_var )) { variable_dictionary_id_t id{}; - m_var_dict.add_entry(token.to_string(), id); + m_var_dict.add_entry(token_view.to_string(), id); encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id); m_logtype_dict_entry.add_dictionary_var(); } else { @@ -424,18 +424,17 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) // If there are no capture groups the entire variable token is stored as a variable. // If the variable token contains capture groups, we break the token up by storing // each capture as a variable and any substrings surrounding the capture as part of - // the logtype. If a capture has repetition we store all instances as a single - // variable. + // the logtype. Capture repetition currently does not work so we explicitly only + // store the first capture. auto const& lexer{log_view.get_log_parser().m_lexer}; auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)}; if (false == capture_ids.has_value()) { variable_dictionary_id_t id{}; - m_var_dict.add_entry(token.to_string(), id); + m_var_dict.add_entry(token_view.to_string(), id); m_var_ids.push_back(id); m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id)); m_logtype_dict_entry.add_dictionary_var(); - break; } @@ -445,33 +444,37 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) if (false == register_ids.has_value()) { throw(std::runtime_error( "No register IDs found for variable's capture group. Full token: " - + token.to_string() + + token_view.to_string() )); } + auto const [start_reg_id, end_reg_id]{register_ids.value()}; - auto const capture_start{token.get_reversed_reg_positions(start_reg_id).back()}; - auto const capture_end{token.get_reversed_reg_positions(end_reg_id).front()}; - auto token_view{token}; - auto const token_end_pos{token_view.m_end_pos}; + auto const start_positions{token_view.get_reversed_reg_positions(start_reg_id)}; + auto const end_positions{token_view.get_reversed_reg_positions(end_reg_id)}; - token_view.m_end_pos = capture_start; - auto const text_before_capture{token_view.to_string_view()}; - m_logtype_dict_entry - .add_constant(text_before_capture, 0, text_before_capture.size()); + if (false == start_positions.empty() && -1 < start_positions[0] + && false == end_positions.empty() && -1 < end_positions[0]) + { + auto token_end{token_view.get_end_pos()}; - token_view.m_start_pos = capture_start; - token_view.m_end_pos = capture_end; - variable_dictionary_id_t id{}; - m_var_dict.add_entry(token_view.to_string_view(), id); - m_var_ids.push_back(id); - m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id)); - m_logtype_dict_entry.add_dictionary_var(); + token_view.set_end_pos(start_positions[0]); + auto const before_capture{token_view.to_string_view()}; + m_logtype_dict_entry.add_constant(before_capture, 0, before_capture.size()); - token_view.m_start_pos = capture_end; - token_view.m_end_pos = token_end_pos; - auto const text_after_capture{token_view.to_string_view()}; - m_logtype_dict_entry.add_constant(text_after_capture, 0, text_after_capture.size()); + token_view.set_start_pos(start_positions[0]); + token_view.set_end_pos(end_positions[0]); + variable_dictionary_id_t id{}; + m_var_dict.add_entry(token_view.to_string_view(), id); + m_var_ids.push_back(id); + m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id)); + m_logtype_dict_entry.add_dictionary_var(); + + token_view.set_start_pos(end_positions[0]); + token_view.set_end_pos(token_end); + auto const after_capture{token_view.to_string_view()}; + m_logtype_dict_entry.add_constant(after_capture, 0, after_capture.size()); + } break; } } diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp index 491b4b1c79..a18ff00f28 100644 --- a/components/core/tests/test-ParserWithUserSchema.cpp +++ b/components/core/tests/test-ParserWithUserSchema.cpp @@ -192,10 +192,11 @@ TEST_CASE("Test lexer", "[Search]") { auto [error_code, opt_token] = lexer.scan(parser_input_buffer); REQUIRE(error_code == log_surgeon::ErrorCode::Success); Token token{opt_token.value()}; - while (token.m_type_ids_ptr->at(0) != static_cast(log_surgeon::SymbolId::TokenEnd)) { + while (token.get_type_ids()->at(0) != static_cast(log_surgeon::SymbolId::TokenEnd)) { SPDLOG_INFO("token:" + token.to_string() + "\n"); SPDLOG_INFO( - "token.m_type_ids->back():" + lexer.m_id_symbol[token.m_type_ids_ptr->back()] + "\n" + "token.get_type_ids()->back():" + lexer.m_id_symbol[token.get_type_ids()->back()] + + "\n" ); auto [error_code, opt_token] = lexer.scan(parser_input_buffer); REQUIRE(error_code == log_surgeon::ErrorCode::Success); @@ -204,9 +205,9 @@ TEST_CASE("Test lexer", "[Search]") { } TEST_CASE("Test schema with single capture group", "[load_lexer]") { - std::string const schema_path{"../tests/test_schema_files/single_capture_group.txt"}; + auto const schema_file_path{get_test_schema_files_dir() / "single_capture_group.txt"}; ByteLexer lexer; - load_lexer_from_file(schema_path, lexer); + load_lexer_from_file(schema_file_path, lexer); auto const rule_id{lexer.m_symbol_id.at("capture")}; auto const capture_ids{lexer.get_capture_ids_from_rule_id(rule_id)}; @@ -216,11 +217,11 @@ TEST_CASE("Test schema with single capture group", "[load_lexer]") { } TEST_CASE("Test error for schema rule with multiple capture groups", "[load_lexer]") { - std::string const schema_path{"../tests/test_schema_files/multiple_capture_groups.txt"}; + auto const schema_file_path{get_test_schema_files_dir() / "multiple_capture_groups.txt"}; ByteLexer lexer; REQUIRE_THROWS_WITH( - load_lexer_from_file(schema_path, lexer), - schema_path + load_lexer_from_file(schema_file_path, lexer), + schema_file_path.string() + ":3: error: the schema rule 'multicapture' has a regex pattern containing > " "1 capture groups (found 2).\n" ); diff --git a/components/core/tests/test_schema_files/search_schema.txt b/components/core/tests/test_schema_files/search_schema.txt index 60d0c12f00..f49a6dbfa4 100644 --- a/components/core/tests/test_schema_files/search_schema.txt +++ b/components/core/tests/test_schema_files/search_schema.txt @@ -1,5 +1,5 @@ // Delimiters -delimiters: \r\n:,=!;%\? +delimiters: \r\n:,=!;%? // First set of variables timestamp:[0-9]{4}\-[0-9]{2}\-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]{3}){0,1} From 578601a356e3eb029683996d5d28a1022c54985e Mon Sep 17 00:00:00 2001 From: davidlion Date: Mon, 24 Nov 2025 11:43:45 -0500 Subject: [PATCH 11/14] Refactor writing a token to dictionaries into a private helper. --- .../clp/streaming_archive/writer/Archive.cpp | 204 +++++++++--------- .../clp/streaming_archive/writer/Archive.hpp | 12 +- 2 files changed, 115 insertions(+), 101 deletions(-) diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index ce71ee7a91..19d6223be7 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "../../EncodedVariableInterpreter.hpp" @@ -315,6 +316,109 @@ Archive::write_msg(epochtime_t timestamp, string const& message, size_t num_unco update_segment_indices(logtype_id, var_ids); } +auto Archive::add_token_to_dicts( + log_surgeon::LogEventView const& log_view, + log_surgeon::Token token_view +) -> void { + switch (token_view.get_type_ids()->at(0)) { + case static_cast(log_surgeon::SymbolId::TokenNewline): + case static_cast(log_surgeon::SymbolId::TokenUncaughtString): { + m_logtype_dict_entry.add_constant(token_view.to_string(), 0, token_view.get_length()); + break; + } + case static_cast(log_surgeon::SymbolId::TokenInt): { + encoded_variable_t encoded_var{}; + if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var( + token_view.to_string(), + encoded_var + )) + { + variable_dictionary_id_t id{}; + m_var_dict.add_entry(token_view.to_string(), id); + encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id); + m_logtype_dict_entry.add_dictionary_var(); + } else { + m_logtype_dict_entry.add_int_var(); + } + m_encoded_vars.push_back(encoded_var); + break; + } + case static_cast(log_surgeon::SymbolId::TokenFloat): { + encoded_variable_t encoded_var{}; + if (!EncodedVariableInterpreter::convert_string_to_representable_float_var( + token_view.to_string(), + encoded_var + )) + { + variable_dictionary_id_t id{}; + m_var_dict.add_entry(token_view.to_string(), id); + encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id); + m_logtype_dict_entry.add_dictionary_var(); + } else { + m_logtype_dict_entry.add_float_var(); + } + m_encoded_vars.push_back(encoded_var); + break; + } + default: { + // If there are no capture groups the entire variable token is stored as a variable. + // If the variable token contains capture groups, we break the token up by storing + // each capture as a variable and any substrings surrounding the capture as part of + // the logtype. Capture repetition currently does not work so we explicitly only + // store the first capture. + + auto const token_type{token_view.get_type_ids()->at(0)}; + auto const& lexer{log_view.get_log_parser().m_lexer}; + auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)}; + if (false == capture_ids.has_value()) { + variable_dictionary_id_t id{}; + m_var_dict.add_entry(token_view.to_string(), id); + m_var_ids.push_back(id); + m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id)); + m_logtype_dict_entry.add_dictionary_var(); + break; + } + + auto const register_ids{lexer.get_reg_ids_from_capture_id(capture_ids.value().at(0))}; + if (false == register_ids.has_value()) { + throw(std::runtime_error( + "No register IDs found for variable's capture group. Full token: " + + token_view.to_string() + )); + } + + auto const [start_reg_id, end_reg_id]{register_ids.value()}; + auto const start_positions{token_view.get_reversed_reg_positions(start_reg_id)}; + auto const end_positions{token_view.get_reversed_reg_positions(end_reg_id)}; + + if (false == start_positions.empty() && -1 < start_positions[0] + && false == end_positions.empty() && -1 < end_positions[0]) + { + auto token_end{token_view.get_end_pos()}; + + token_view.set_end_pos(start_positions[0]); + auto const before_capture{token_view.to_string_view()}; + m_logtype_dict_entry.add_constant(before_capture, 0, before_capture.size()); + + token_view.set_start_pos(start_positions[0]); + token_view.set_end_pos(end_positions[0]); + + variable_dictionary_id_t id{}; + m_var_dict.add_entry(token_view.to_string_view(), id); + m_var_ids.push_back(id); + m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id)); + m_logtype_dict_entry.add_dictionary_var(); + + token_view.set_start_pos(end_positions[0]); + token_view.set_end_pos(token_end); + auto const after_capture{token_view.to_string_view()}; + m_logtype_dict_entry.add_constant(after_capture, 0, after_capture.size()); + } + break; + } + } +} + void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) { epochtime_t timestamp{0}; TimestampPattern const* timestamp_pattern{nullptr}; @@ -379,105 +483,7 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) m_logtype_dict_entry.add_constant(token_view.get_delimiter(), 0, 1); token_view.increment_start_pos(); } - switch (token_type) { - case static_cast(log_surgeon::SymbolId::TokenNewline): - case static_cast(log_surgeon::SymbolId::TokenUncaughtString): { - m_logtype_dict_entry - .add_constant(token_view.to_string(), 0, token_view.get_length()); - break; - } - case static_cast(log_surgeon::SymbolId::TokenInt): { - encoded_variable_t encoded_var{}; - if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var( - token_view.to_string(), - encoded_var - )) - { - variable_dictionary_id_t id{}; - m_var_dict.add_entry(token_view.to_string(), id); - encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id); - m_logtype_dict_entry.add_dictionary_var(); - } else { - m_logtype_dict_entry.add_int_var(); - } - m_encoded_vars.push_back(encoded_var); - break; - } - case static_cast(log_surgeon::SymbolId::TokenFloat): { - encoded_variable_t encoded_var{}; - if (!EncodedVariableInterpreter::convert_string_to_representable_float_var( - token_view.to_string(), - encoded_var - )) - { - variable_dictionary_id_t id{}; - m_var_dict.add_entry(token_view.to_string(), id); - encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id); - m_logtype_dict_entry.add_dictionary_var(); - } else { - m_logtype_dict_entry.add_float_var(); - } - m_encoded_vars.push_back(encoded_var); - break; - } - default: { - // If there are no capture groups the entire variable token is stored as a variable. - // If the variable token contains capture groups, we break the token up by storing - // each capture as a variable and any substrings surrounding the capture as part of - // the logtype. Capture repetition currently does not work so we explicitly only - // store the first capture. - - auto const& lexer{log_view.get_log_parser().m_lexer}; - auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)}; - if (false == capture_ids.has_value()) { - variable_dictionary_id_t id{}; - m_var_dict.add_entry(token_view.to_string(), id); - m_var_ids.push_back(id); - m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id)); - m_logtype_dict_entry.add_dictionary_var(); - break; - } - - auto const register_ids{ - lexer.get_reg_ids_from_capture_id(capture_ids.value().at(0)) - }; - if (false == register_ids.has_value()) { - throw(std::runtime_error( - "No register IDs found for variable's capture group. Full token: " - + token_view.to_string() - )); - } - - auto const [start_reg_id, end_reg_id]{register_ids.value()}; - auto const start_positions{token_view.get_reversed_reg_positions(start_reg_id)}; - auto const end_positions{token_view.get_reversed_reg_positions(end_reg_id)}; - - if (false == start_positions.empty() && -1 < start_positions[0] - && false == end_positions.empty() && -1 < end_positions[0]) - { - auto token_end{token_view.get_end_pos()}; - - token_view.set_end_pos(start_positions[0]); - auto const before_capture{token_view.to_string_view()}; - m_logtype_dict_entry.add_constant(before_capture, 0, before_capture.size()); - - token_view.set_start_pos(start_positions[0]); - token_view.set_end_pos(end_positions[0]); - - variable_dictionary_id_t id{}; - m_var_dict.add_entry(token_view.to_string_view(), id); - m_var_ids.push_back(id); - m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id)); - m_logtype_dict_entry.add_dictionary_var(); - - token_view.set_start_pos(end_positions[0]); - token_view.set_end_pos(token_end); - auto const after_capture{token_view.to_string_view()}; - m_logtype_dict_entry.add_constant(after_capture, 0, after_capture.size()); - } - break; - } - } + add_token_to_dicts(log_view, token_view); } if (false == m_logtype_dict_entry.get_value().empty()) { logtype_dictionary_id_t logtype_id{}; diff --git a/components/core/src/clp/streaming_archive/writer/Archive.hpp b/components/core/src/clp/streaming_archive/writer/Archive.hpp index 2b589881a5..2fa1d4661f 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.hpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.hpp @@ -12,7 +12,6 @@ #include #include #include -#include #include "../../ArrayBackedPosIntSet.hpp" #include "../../ErrorCode.hpp" @@ -150,7 +149,7 @@ class Archive { * @param log_event_view * @throw FileWriter::OperationFailed if any write fails */ - void write_msg_using_schema(log_surgeon::LogEventView const& log_event_view); + void write_msg_using_schema(log_surgeon::LogEventView const& log_view); /** * Writes an IR log event to the current encoded file @@ -290,6 +289,15 @@ class Archive { */ auto update_global_metadata() -> void; + /** + * Inspect a log surgeon token and add its information to the logtype and variable dictionaries. + * @param log_view The log event containing the token. + * @param token_view The token to add to the dictionaries. + */ + auto + add_token_to_dicts(log_surgeon::LogEventView const& log_view, log_surgeon::Token token_view) + -> void; + // Variables boost::uuids::uuid m_id; std::string m_id_as_string; From 834fdb780e83ef23f472acb407cc5dcac595908b Mon Sep 17 00:00:00 2001 From: davidlion Date: Wed, 26 Nov 2025 23:19:48 -0500 Subject: [PATCH 12/14] Add unit test; refactor test-ParserWithUserSchema.cpp. --- .../core/tests/test-ParserWithUserSchema.cpp | 164 ++++++++++-------- .../tests/test_log_files/log_with_capture.txt | 1 + .../single_capture_group.txt | 2 +- 3 files changed, 90 insertions(+), 77 deletions(-) create mode 100644 components/core/tests/test_log_files/log_with_capture.txt diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp index a18ff00f28..b51d212f9a 100644 --- a/components/core/tests/test-ParserWithUserSchema.cpp +++ b/components/core/tests/test-ParserWithUserSchema.cpp @@ -1,37 +1,49 @@ // TODO: move this test to log_surgeon // TODO: move load_lexer_from_file into SearchParser in log_surgeon -#include - #include #include +#include #include +#include #include #include +#include +#include #include #include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include -#include "../src/clp/clp/run.hpp" -#include "../src/clp/GlobalMySQLMetadataDB.hpp" -#include "../src/clp/LogSurgeonReader.hpp" -#include "../src/clp/Utils.hpp" +#include "TestOutputCleaner.hpp" -using clp::FileReader; using clp::load_lexer_from_file; -using clp::LogSurgeonReader; -using log_surgeon::DelimiterStringAST; using log_surgeon::lexers::ByteLexer; using log_surgeon::LogParser; -using log_surgeon::ParserAST; using log_surgeon::SchemaAST; -using log_surgeon::SchemaVarAST; -using log_surgeon::Token; namespace { +constexpr std::string_view cTestArchiveDirectory{"test-parser-with-user-schema-archive"}; + +auto run_clp_compress( + std::filesystem::path const& schema_path, + std::filesystem::path const& output_path, + std::filesystem::path const& input_path +) -> int; [[nodiscard]] auto get_tests_dir() -> std::filesystem::path; [[nodiscard]] auto get_test_schema_files_dir() -> std::filesystem::path; [[nodiscard]] auto get_test_queries_dir() -> std::filesystem::path; +[[nodiscard]] auto get_test_log_dir() -> std::filesystem::path; auto get_tests_dir() -> std::filesystem::path { std::filesystem::path const current_file_path{__FILE__}; @@ -45,6 +57,30 @@ auto get_test_schema_files_dir() -> std::filesystem::path { auto get_test_queries_dir() -> std::filesystem::path { return get_tests_dir() / "test_search_queries"; } + +auto get_test_log_dir() -> std::filesystem::path { + return get_tests_dir() / "test_log_files"; +} + +auto run_clp_compress( + std::filesystem::path const& schema_path, + std::filesystem::path const& output_path, + std::filesystem::path const& input_path +) -> int { + auto const schema_path_str{schema_path.string()}; + auto const output_path_str{output_path.string()}; + auto const input_path_str{input_path.string()}; + std::vector argv{ + "clp", + "c", + "--schema-path", + schema_path_str.data(), + output_path_str.data(), + input_path_str.data(), + nullptr + }; + return clp::clp::run(static_cast(argv.size() - 1), argv.data()); +} } // namespace std::unique_ptr generate_schema_ast(std::string const& schema_file) { @@ -60,43 +96,6 @@ std::unique_ptr generate_log_parser(std::string const& schema_file) { return log_parser; } -void compress( - std::string const& output_dir, - std::string const& file_to_compress, - std::string schema_file, - bool old = false -) { - std::vector arguments; - if (old) { - arguments = {"main.cpp", "c", output_dir, file_to_compress}; - } else { - arguments - = {"main.cpp", - "c", - output_dir, - file_to_compress, - "--schema-path", - std::move(schema_file)}; - } - std::vector argv; - for (auto const& arg : arguments) { - argv.push_back(arg.data()); - } - argv.push_back(nullptr); - clp::clp::run(argv.size() - 1, argv.data()); -} - -void decompress(std::string archive_dir, std::string output_dir) { - std::vector arguments - = {"main.cpp", "x", std::move(archive_dir), std::move(output_dir)}; - std::vector argv; - for (auto const& arg : arguments) { - argv.push_back(arg.data()); - } - argv.push_back(nullptr); - clp::clp::run(argv.size() - 1, argv.data()); -} - TEST_CASE("Test error for missing schema file", "[LALR1Parser][SchemaParser]") { auto const file_path = get_test_schema_files_dir() / "missing_schema.txt"; auto const file_path_string = file_path.string(); @@ -156,42 +155,19 @@ TEST_CASE("Test creating log parser without delimiters", "[LALR1Parser][LogParse ); } -// TODO: This test doesn't currently work because delimiters are allowed in -// schema files, and there is no option to disable this yet -// TEST_CASE("Test error for creating log file with delimiter in regex pattern", -// "[LALR1Parser]SchemaParser]") { -// std::string file_path = "../tests/test_schema_files/schema_with_delimiter_in_regex_error.txt"; -// std::string file_name = boost::filesystem::canonical(file_path).string(); -// REQUIRE_THROWS_WITH(generate_log_parser(file_path), -// file_name + -// ":2: error: 'equals' has regex pattern which contains delimiter '='.\n" -// + " equals:.*=.*\n" -// + " ^^^^^\n"); -//} - -// TODO: This error check is performed correctly by CLP, but it is handled by -// something different now so this test will fail as is -// TEST_CASE("Test error for missing log file", "[LALR1Parser][LogParser]") { -// std::string file_name = "../tests/test_log_files/missing_log.txt"; -// std::string file_path = boost::filesystem::weakly_canonical(file_name).string(); -// REQUIRE_THROWS(compress("../tests/test_archives", file_name, -// "../tests/test_schema_files/schema_that_does_not_exist.txt"), -// "Specified schema file does not exist."); -//} - TEST_CASE("Test lexer", "[Search]") { ByteLexer lexer; auto const schema_file_path = get_test_schema_files_dir() / "search_schema.txt"; load_lexer_from_file(schema_file_path.string(), lexer); auto const query_file_path = get_test_queries_dir() / "easy.txt"; - FileReader file_reader{query_file_path.string()}; - LogSurgeonReader reader_wrapper(file_reader); + clp::FileReader file_reader{query_file_path.string()}; + clp::LogSurgeonReader reader_wrapper(file_reader); log_surgeon::ParserInputBuffer parser_input_buffer; parser_input_buffer.read_if_safe(reader_wrapper); lexer.reset(); auto [error_code, opt_token] = lexer.scan(parser_input_buffer); REQUIRE(error_code == log_surgeon::ErrorCode::Success); - Token token{opt_token.value()}; + auto token{opt_token.value()}; while (token.get_type_ids()->at(0) != static_cast(log_surgeon::SymbolId::TokenEnd)) { SPDLOG_INFO("token:" + token.to_string() + "\n"); SPDLOG_INFO( @@ -216,7 +192,7 @@ TEST_CASE("Test schema with single capture group", "[load_lexer]") { REQUIRE("group" == lexer.m_id_symbol.at(capture_ids->at(0))); } -TEST_CASE("Test error for schema rule with multiple capture groups", "[load_lexer]") { +TEST_CASE("Error on schema rule with multiple capture groups", "[load_lexer]") { auto const schema_file_path{get_test_schema_files_dir() / "multiple_capture_groups.txt"}; ByteLexer lexer; REQUIRE_THROWS_WITH( @@ -226,3 +202,39 @@ TEST_CASE("Test error for schema rule with multiple capture groups", "[load_lexe "1 capture groups (found 2).\n" ); } + +TEST_CASE("Verify dictionary contents", "[Compression]") { + auto const log_file_path{get_test_log_dir() / "log_with_capture.txt"}; + auto const schema_file_path{get_test_schema_files_dir() / "single_capture_group.txt"}; + TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}}; + std::filesystem::create_directory(cTestArchiveDirectory); + + REQUIRE(0 == run_clp_compress(schema_file_path, cTestArchiveDirectory, log_file_path)); + + std::vector archive_paths; + for (auto const& entry : std::filesystem::directory_iterator{cTestArchiveDirectory}) { + auto const& path{entry.path()}; + if (false == path.string().ends_with(clp::streaming_archive::cMetadataDBFileName)) { + archive_paths.emplace_back(path); + } + } + REQUIRE(1 == archive_paths.size()); + + clp::streaming_archive::reader::Archive archive_reader; + archive_reader.open(archive_paths.at(0)); + archive_reader.refresh_dictionaries(); + + auto const& logtype_dict{archive_reader.get_logtype_dictionary()}; + REQUIRE(1 == logtype_dict.get_entries().size()); + REQUIRE(fmt::format( + "2016-05-08 07:34:05.251 MyDog{} APet{} test.txt\n", + clp::enum_to_underlying_type(clp::ir::VariablePlaceholder::Dictionary), + clp::enum_to_underlying_type(clp::ir::VariablePlaceholder::Dictionary) + ) + == logtype_dict.get_value(0)); + + auto const& var_dict{archive_reader.get_var_dictionary()}; + REQUIRE(2 == var_dict.get_entries().size()); + REQUIRE("123" == var_dict.get_value(0)); + REQUIRE("4123" == var_dict.get_value(1)); +} diff --git a/components/core/tests/test_log_files/log_with_capture.txt b/components/core/tests/test_log_files/log_with_capture.txt new file mode 100644 index 0000000000..6210ea8bdc --- /dev/null +++ b/components/core/tests/test_log_files/log_with_capture.txt @@ -0,0 +1 @@ +2016-05-08 07:34:05.251 MyDog123 APet4123 test.txt diff --git a/components/core/tests/test_schema_files/single_capture_group.txt b/components/core/tests/test_schema_files/single_capture_group.txt index 7c4bf17688..b9543fbdcc 100644 --- a/components/core/tests/test_schema_files/single_capture_group.txt +++ b/components/core/tests/test_schema_files/single_capture_group.txt @@ -1,3 +1,3 @@ delimiters: \r\n -capture:text(?var)text +capture:[A-Za-z]+(?\d+) From ea1ad3b63b22a1ddb5418b3bd0b03429cca9b6e0 Mon Sep 17 00:00:00 2001 From: davidlion Date: Thu, 27 Nov 2025 09:14:31 -0500 Subject: [PATCH 13/14] Add null + empty check for token type ids. --- .../src/clp/streaming_archive/writer/Archive.cpp | 12 ++++++++++-- .../src/clp/streaming_archive/writer/Archive.hpp | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index 19d6223be7..194c55488e 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -320,7 +320,11 @@ auto Archive::add_token_to_dicts( log_surgeon::LogEventView const& log_view, log_surgeon::Token token_view ) -> void { - switch (token_view.get_type_ids()->at(0)) { + auto const* type_ids{token_view.get_type_ids()}; + if (nullptr == type_ids || type_ids->empty()) { + throw std::runtime_error("Token has no type IDs: " + token_view.to_string()); + } + switch (type_ids->at(0)) { case static_cast(log_surgeon::SymbolId::TokenNewline): case static_cast(log_surgeon::SymbolId::TokenUncaughtString): { m_logtype_dict_entry.add_constant(token_view.to_string(), 0, token_view.get_length()); @@ -475,7 +479,11 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) } for (auto token_idx{1}; token_idx < log_buf->pos(); token_idx++) { auto token_view{log_buf->get_token(token_idx)}; - auto const token_type{token_view.get_type_ids()->at(0)}; + auto const* type_ids{token_view.get_type_ids()}; + if (nullptr == type_ids || type_ids->empty()) { + throw std::runtime_error("Token has no type IDs: " + token_view.to_string()); + } + auto const token_type{type_ids->at(0)}; if (log_buf->has_delimiters() && (timestamp_pattern != nullptr || token_idx > 1) && token_type != static_cast(log_surgeon::SymbolId::TokenUncaughtString) && token_type != static_cast(log_surgeon::SymbolId::TokenNewline)) diff --git a/components/core/src/clp/streaming_archive/writer/Archive.hpp b/components/core/src/clp/streaming_archive/writer/Archive.hpp index 2fa1d4661f..b1351199a2 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.hpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.hpp @@ -149,7 +149,7 @@ class Archive { * @param log_event_view * @throw FileWriter::OperationFailed if any write fails */ - void write_msg_using_schema(log_surgeon::LogEventView const& log_view); + auto write_msg_using_schema(log_surgeon::LogEventView const& log_view) -> void; /** * Writes an IR log event to the current encoded file From 3b36328baba7de064484e6af5bf839d63ee0e43a Mon Sep 17 00:00:00 2001 From: davidlion Date: Thu, 27 Nov 2025 09:25:27 -0500 Subject: [PATCH 14/14] Fix coderabbit nits. --- .../core/src/clp/streaming_archive/writer/Archive.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp index 194c55488e..dcc191f5bd 100644 --- a/components/core/src/clp/streaming_archive/writer/Archive.cpp +++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp @@ -324,7 +324,8 @@ auto Archive::add_token_to_dicts( if (nullptr == type_ids || type_ids->empty()) { throw std::runtime_error("Token has no type IDs: " + token_view.to_string()); } - switch (type_ids->at(0)) { + auto const token_type{type_ids->at(0)}; + switch (token_type) { case static_cast(log_surgeon::SymbolId::TokenNewline): case static_cast(log_surgeon::SymbolId::TokenUncaughtString): { m_logtype_dict_entry.add_constant(token_view.to_string(), 0, token_view.get_length()); @@ -332,7 +333,8 @@ auto Archive::add_token_to_dicts( } case static_cast(log_surgeon::SymbolId::TokenInt): { encoded_variable_t encoded_var{}; - if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var( + if (false + == EncodedVariableInterpreter::convert_string_to_representable_integer_var( token_view.to_string(), encoded_var )) @@ -349,7 +351,8 @@ auto Archive::add_token_to_dicts( } case static_cast(log_surgeon::SymbolId::TokenFloat): { encoded_variable_t encoded_var{}; - if (!EncodedVariableInterpreter::convert_string_to_representable_float_var( + if (false + == EncodedVariableInterpreter::convert_string_to_representable_float_var( token_view.to_string(), encoded_var )) @@ -371,7 +374,6 @@ auto Archive::add_token_to_dicts( // the logtype. Capture repetition currently does not work so we explicitly only // store the first capture. - auto const token_type{token_view.get_type_ids()->at(0)}; auto const& lexer{log_view.get_log_parser().m_lexer}; auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)}; if (false == capture_ids.has_value()) {