Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
4931f6b
feat(log-surgeon)!: Add support for a single capture group in a schem…
davidlion Aug 28, 2025
a7a0be1
Merge remote-tracking branch 'upstream/main' into capture-support
davidlion Aug 28, 2025
61d5715
Add schema equals rule capture.
davidlion Aug 28, 2025
daf18ad
Merge remote-tracking branch 'upstream/main' into capture-support
davidlion Sep 17, 2025
20b26f9
Address some review comments.
davidlion Sep 18, 2025
b8089f1
Tweak tests.
davidlion Sep 18, 2025
98d1088
Fix wrapped token logic.
davidlion Sep 19, 2025
dc591cd
Merge remote-tracking branch 'upstream/main' into capture-support
davidlion Sep 19, 2025
dec9f6e
Fix formatting.
davidlion Sep 19, 2025
03a1f02
Remove wrapping logic by manipulating the token.
davidlion Sep 19, 2025
dee547a
Merge remote-tracking branch 'upstream/main' into capture-support
davidlion Nov 19, 2025
65541fb
Update log-surgeon.
davidlion Nov 19, 2025
56bb7a6
Update GrepCore.
davidlion Nov 19, 2025
610e76f
Update log surgeon code + tests with new version.
davidlion Nov 20, 2025
d52a8c3
Merge branch 'main' into capture-support
davidlion Nov 21, 2025
ec2c8f2
Merge remote-tracking branch 'upstream/main' into capture-support
davidlion Nov 24, 2025
578601a
Refactor writing a token to dictionaries into a private helper.
davidlion Nov 24, 2025
08d5c0c
Merge remote-tracking branch 'upstream/main' into capture-support
davidlion Nov 26, 2025
834fdb7
Add unit test; refactor test-ParserWithUserSchema.cpp.
davidlion Nov 27, 2025
ea1ad3b
Add null + empty check for token type ids.
davidlion Nov 27, 2025
8ce06ba
Merge remote-tracking branch 'upstream/main' into capture-support
davidlion Nov 27, 2025
3b36328
Fix coderabbit nits.
davidlion Nov 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion components/core/config/schemas.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ float:\-{0,1}[0-9]+\.[0-9]+
// Dictionary variables
hex:[a-fA-F]+
hasNumber:.*\d.*
equals:.*=.*[a-zA-Z0-9].*
equals:.*=(?<var>.*[a-zA-Z0-9].*)
11 changes: 11 additions & 0 deletions components/core/src/clp/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <iostream>
#include <memory>
#include <set>
#include <string>

#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
Expand Down Expand Up @@ -188,6 +189,16 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B
for (std::unique_ptr<log_surgeon::ParserAST> const& parser_ast : schema_ast->m_schema_vars) {
auto* rule = dynamic_cast<log_surgeon::SchemaVarAST*>(parser_ast.get());

// Currently, we only support at most a single capture group in each variable. If a capture
// group is present its match will be treated as the variable rather than the full match.
if (1 < rule->m_regex_ptr->get_subtree_positive_captures().size()) {
throw std::runtime_error(
schema_file_path + ":" + std::to_string(rule->m_line_num + 1)
+ ": error: the schema rule '" + rule->m_name
+ "' has a regex pattern containing > 1 capture group.\n"
);
}

if ("timestamp" == rule->m_name) {
continue;
}
Expand Down
92 changes: 69 additions & 23 deletions components/core/src/clp/streaming_archive/writer/Archive.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@

#include <sys/stat.h>

#include <cstdint>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include <boost/asio.hpp>
#include <boost/uuid/uuid.hpp>
#include <boost/uuid/uuid_generators.hpp>
#include <boost/uuid/uuid_io.hpp>
#include <log_surgeon/Constants.hpp>
#include <log_surgeon/LogEvent.hpp>
#include <log_surgeon/LogParser.hpp>
#include <nlohmann/json.hpp>
Expand All @@ -23,11 +26,7 @@

using clp::ir::eight_byte_encoded_variable_t;
using clp::ir::four_byte_encoded_variable_t;
using log_surgeon::LogEventView;
using std::list;
using std::make_unique;
using std::string;
using std::unordered_set;
using std::vector;

namespace clp::streaming_archive::writer {
Expand Down Expand Up @@ -315,13 +314,13 @@ Archive::write_msg(epochtime_t timestamp, string const& message, size_t num_unco
update_segment_indices(logtype_id, var_ids);
}

void Archive::write_msg_using_schema(LogEventView const& log_view) {
void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) {
epochtime_t timestamp = 0;
TimestampPattern* timestamp_pattern = nullptr;
auto const& log_output_buffer = log_view.get_log_output_buffer();
if (log_output_buffer->has_timestamp()) {
size_t start;
size_t end;
size_t start{};
size_t end{};
timestamp_pattern = (TimestampPattern*)TimestampPattern::search_known_ts_patterns(
log_output_buffer->get_mutable_token(0).to_string(),
timestamp,
Expand Down Expand Up @@ -360,7 +359,7 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
if (timestamp_pattern == nullptr) {
start_pos = log_output_buffer->get_token(1).m_start_pos;
}
uint32_t end_pos = log_output_buffer->get_token(log_output_buffer->pos() - 1).m_end_pos;
uint32_t const end_pos = log_output_buffer->get_token(log_output_buffer->pos() - 1).m_end_pos;
if (start_pos <= end_pos) {
num_uncompressed_bytes = end_pos - start_pos;
} else {
Expand All @@ -369,7 +368,7 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
}
for (uint32_t i = 1; i < log_output_buffer->pos(); i++) {
log_surgeon::Token& token = log_output_buffer->get_mutable_token(i);
int token_type = token.m_type_ids_ptr->at(0);
auto const token_type{token.m_type_ids_ptr->at(0)};
if (log_output_buffer->has_delimiters() && (timestamp_pattern != nullptr || i > 1)
&& token_type != static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString)
&& token_type != static_cast<int>(log_surgeon::SymbolId::TokenNewline))
Expand All @@ -388,13 +387,13 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
break;
}
case static_cast<int>(log_surgeon::SymbolId::TokenInt): {
encoded_variable_t encoded_var;
encoded_variable_t encoded_var{};
if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var(
token.to_string(),
encoded_var
))
{
variable_dictionary_id_t id;
variable_dictionary_id_t id{};
m_var_dict.add_entry(token.to_string(), id);
encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
m_logtype_dict_entry.add_dictionary_var();
Expand All @@ -405,13 +404,13 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
break;
}
case static_cast<int>(log_surgeon::SymbolId::TokenFloat): {
encoded_variable_t encoded_var;
encoded_variable_t encoded_var{};
if (!EncodedVariableInterpreter::convert_string_to_representable_float_var(
token.to_string(),
encoded_var
))
{
variable_dictionary_id_t id;
variable_dictionary_id_t id{};
m_var_dict.add_entry(token.to_string(), id);
encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
m_logtype_dict_entry.add_dictionary_var();
Expand All @@ -422,21 +421,68 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
break;
}
default: {
// Variable string looks like a dictionary variable, so encode it as so
encoded_variable_t encoded_var;
variable_dictionary_id_t id;
m_var_dict.add_entry(token.to_string(), id);
encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
m_var_ids.push_back(id);
auto const& lexer{log_view.get_log_parser().m_lexer};
auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)};

// If the variable token contains capture groups, we break the token up by storing
// each capture as a variable and any substrings surrounding the capture as part of
// the logtype. If there are no capture groups the entire variable token is stored
// as a variable.
if (false == capture_ids.has_value()) {
variable_dictionary_id_t id{};
m_var_dict.add_entry(token.to_string(), id);
m_var_ids.push_back(id);
m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
m_logtype_dict_entry.add_dictionary_var();

break;
}

auto const register_ids{lexer.get_reg_ids_from_capture_id(capture_ids.value().at(0))};
if (false == register_ids.has_value()) {
throw(std::runtime_error(
"No register IDs found for variable's capture group. Full token: "
+ token.to_string()
));
}
auto const [start_reg_id, end_reg_id]{register_ids.value()};
auto const capture_start{token.get_reversed_reg_positions(start_reg_id).front()};
auto const capture_end{token.get_reversed_reg_positions(end_reg_id).back()};
auto const token_view{token.to_string_view()};
size_t token_pos{0};

auto const before_capture{token_view.substr(token_pos, capture_start)};
m_logtype_dict_entry
.add_constant(before_capture, 0, before_capture.length());
token_pos += before_capture.length();

// If a capture has repetition we store all instances as a single variable.
auto const capture_len = [&]() -> size_t {
if (capture_start <= capture_end) {
return capture_end - capture_start;
}
return token_view.length() - capture_start + capture_end;
}();
auto const capture{token_view.substr(token_pos, capture_len)};

variable_dictionary_id_t id{};
m_var_dict.add_entry(capture, id);
m_var_ids.push_back(id);
m_encoded_vars.push_back(
EncodedVariableInterpreter::encode_var_dict_id(id)
);
m_logtype_dict_entry.add_dictionary_var();
m_encoded_vars.push_back(encoded_var);
token_pos += capture.length();

m_logtype_dict_entry
.add_constant(token_view, token_pos, token_view.length() - token_pos);

break;
}
}
}
if (!m_logtype_dict_entry.get_value().empty()) {
logtype_dictionary_id_t logtype_id;
logtype_dictionary_id_t logtype_id{};
m_logtype_dict.add_entry(m_logtype_dict_entry, logtype_id);
m_file->write_encoded_msg(
timestamp,
Expand Down
23 changes: 23 additions & 0 deletions components/core/tests/test-ParserWithUserSchema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,26 @@ TEST_CASE("Test lexer", "[Search]") {
token = opt_token.value();
}
}

// Loads a schema file whose single rule ("capture") contains exactly one named
// capture group, then verifies the lexer maps the rule's ID to that capture and
// resolves the capture's ID back to its symbol name ("group").
TEST_CASE("Test schema with single capture group", "[load_lexer]") {
    std::string const schema_path{"../tests/test_schema_files/single_capture_group.txt"};
    ByteLexer lexer;
    load_lexer_from_file(schema_path, lexer);

    auto const var_rule_id{lexer.m_symbol_id.at("capture")};
    auto const opt_capture_ids{lexer.get_capture_ids_from_rule_id(var_rule_id)};
    REQUIRE(opt_capture_ids.has_value());
    REQUIRE(1 == opt_capture_ids->size());
    REQUIRE("group" == lexer.m_id_symbol.at(opt_capture_ids->at(0)));
}

// Verifies that loading a schema whose rule declares more than one capture
// group is rejected, and that the thrown error message pinpoints the offending
// file, line, and rule name.
TEST_CASE("Test error for schema rule with multiple capture groups", "[load_lexer]") {
    std::string const schema_path{"../tests/test_schema_files/multiple_capture_groups.txt"};
    std::string const expected_error{
            schema_path
            + ":1: error: the schema rule 'multicapture' has a regex pattern containing > "
              "1 capture group.\n"
    };
    ByteLexer lexer;
    REQUIRE_THROWS_WITH(load_lexer_from_file(schema_path, lexer), expected_error);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
multicapture:text(?<group0>var0)text(?<group1>var1)text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
capture:text(?<group>var)text
Loading