Skip to content
Open
10 changes: 5 additions & 5 deletions components/core/config/schemas.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
// Delimiters
delimiters: \t\r\n!"#$%&'()*,:;<>?@[]^_`{}|~

// Timestamps (using the `timestamp` keyword)
// Headers (using the `timestamp` capture keyword)
// E.g. 2015-01-31 15:50:45,392
// E.g. 2015-01-31 15:50:45.392
// E.g. 2015-01-31 15:50:45
timestamp:\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}(\.\d{3}){0,1}
header:(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d{3}){0,1})
// E.g. [20150131-15:50:45]
timestamp:\[\d{8}\-\d{2}:\d{2}:\d{2}\]
header:(?<timestamp>\[\d{8}-\d{2}:\d{2}:\d{2}\])

// Specially-encoded variables (using the `int` and `float` keywords)
int:\-{0,1}[0-9]+
float:\-{0,1}[0-9]+\.[0-9]+
int:-?\d+
float:-?\d+\.\d+

// Dictionary variables
hex:[a-fA-F]+
Expand Down
79 changes: 14 additions & 65 deletions components/core/src/clp/Utils.cpp
Original file line number Diff line number Diff line change
@@ -1,31 +1,26 @@
#include "Utils.hpp"

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>

#include <algorithm>
#include <iostream>
#include <memory>
#include <set>
#include <string>

#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include <log_surgeon/Constants.hpp>
#include <log_surgeon/SchemaParser.hpp>
#include <spdlog/spdlog.h>
#include <string_utils/string_utils.hpp>

#include "spdlog_with_specializations.hpp"

using std::list;
namespace clp {
using log_surgeon::finite_automata::ByteNfaState;
using log_surgeon::finite_automata::RegexASTLiteral;
using std::make_unique;
using std::string;
using std::unique_ptr;
using std::vector;

namespace clp {
ErrorCode create_directory(string const& path, mode_t mode, bool exist_ok) {
int retval = mkdir(path.c_str(), mode);
if (0 != retval) {
Expand Down Expand Up @@ -135,9 +130,8 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B
lexer.m_symbol_id[log_surgeon::cTokenEnd] = static_cast<int>(log_surgeon::SymbolId::TokenEnd);
lexer.m_symbol_id[log_surgeon::cTokenUncaughtString]
= static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString);
// cTokenInt, cTokenFloat, cTokenFirstTimestamp, and cTokenNewlineTimestamp each have unknown
// rule(s) until specified by the user so can't be explicitly added and are done by looping over
// schema_vars (user schema)
// cTokenInt, cTokenFloat, and cTokenHeader each have unknown rule(s) until specified by the
// user so can't be explicitly added and are done by looping over schema_vars (user schema)
lexer.m_symbol_id[log_surgeon::cTokenInt] = static_cast<int>(log_surgeon::SymbolId::TokenInt);
lexer.m_symbol_id[log_surgeon::cTokenFloat]
= static_cast<int>(log_surgeon::SymbolId::TokenFloat);
Expand All @@ -159,15 +153,7 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolId::TokenNewline)]
= log_surgeon::cTokenNewline;

lexer.add_rule(
lexer.m_symbol_id["newLine"],
std::move(
std::make_unique<log_surgeon::finite_automata::RegexASTLiteral<
log_surgeon::finite_automata::ByteNfaState
>>(log_surgeon::finite_automata::
RegexASTLiteral<log_surgeon::finite_automata::ByteNfaState>('\n'))
)
);
lexer.add_rule(lexer.m_symbol_id["newLine"], make_unique<RegexASTLiteral<ByteNfaState>>('\n'));

for (auto const& delimiters_ast : schema_ast->m_delimiters) {
auto* delimiters_ptr = dynamic_cast<log_surgeon::DelimiterStringAST*>(delimiters_ast.get());
Expand All @@ -185,7 +171,12 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B
auto* rule = dynamic_cast<log_surgeon::SchemaVarAST*>(parser_ast.get());

// Capture groups are temporarily disabled, until NFA intersection supports search.
auto const num_captures{rule->m_regex_ptr->get_subtree_positive_captures().size()};
auto const& captures{rule->m_regex_ptr->get_subtree_positive_captures()};
auto const num_captures{captures.size()};
if ("header" == rule->m_name && 1 == num_captures && "timestamp" == captures[0]->get_name())
{
continue;
}
if (0 < num_captures) {
throw std::runtime_error(
schema_file_path + ":" + std::to_string(rule->m_line_num + 1)
Expand All @@ -195,55 +186,13 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B
);
}

if ("timestamp" == rule->m_name) {
continue;
}
// transform '.' from any-character into any non-delimiter character
rule->m_regex_ptr->remove_delimiters_from_wildcard(delimiters);

if (lexer.m_symbol_id.find(rule->m_name) == lexer.m_symbol_id.end()) {
lexer.m_symbol_id[rule->m_name] = lexer.m_symbol_id.size();
lexer.m_id_symbol[lexer.m_symbol_id[rule->m_name]] = rule->m_name;
}

// transform '.' from any-character into any non-delimiter character
rule->m_regex_ptr->remove_delimiters_from_wildcard(delimiters);

std::array<bool, log_surgeon::cSizeOfUnicode> is_possible_input{};
rule->m_regex_ptr->set_possible_inputs_to_true(is_possible_input);
bool contains_delimiter = false;
uint32_t delimiter_name;
for (uint32_t delimiter : delimiters) {
if (is_possible_input[delimiter]) {
contains_delimiter = true;
delimiter_name = delimiter;
break;
}
}

if (contains_delimiter) {
FileReader schema_reader{schema_ast->m_file_path};
// more detailed debugging based on looking at the file
string line;
for (uint32_t i = 0; i <= rule->m_line_num; i++) {
schema_reader.read_to_delimiter('\n', false, false, line);
}
int colon_pos = 0;
for (char i : line) {
colon_pos++;
if (i == ':') {
break;
}
}
string indent(10, ' ');
string spaces(colon_pos, ' ');
string arrows(line.size() - colon_pos, '^');

throw std::runtime_error(
schema_file_path + ":" + std::to_string(rule->m_line_num + 1) + ": error: '"
+ rule->m_name + "' has regex pattern which contains delimiter '"
+ char(delimiter_name) + "'.\n" + indent + line + "\n" + indent + spaces
+ arrows + "\n"
);
}
lexer.add_rule(lexer.m_symbol_id[rule->m_name], std::move(rule->m_regex_ptr));
}
lexer.generate();
Expand Down
24 changes: 19 additions & 5 deletions components/core/src/clp/clp/run.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,26 @@ int run(int argc, char const* argv[]) {
// Capture groups are temporarily disabled, until NFA intersection supports search.
auto const& lexer{reader_parser->get_log_parser().m_lexer};
for (auto const& [rule_id, rule_name] : lexer.m_id_symbol) {
if (lexer.get_captures_from_rule_id(rule_id).has_value()) {
throw std::runtime_error(
schema_file_path + ": error: the schema rule '" + rule_name
+ "' has a regex pattern containing capture groups.\n"
);
auto optional_captures{lexer.get_captures_from_rule_id(rule_id)};
if (false == optional_captures.has_value()) {
continue;
}

auto const& captures{optional_captures.value()};
if (captures.empty()) {
continue;
}

if ("header" == rule_name && 1 == captures.size()
&& "timestamp" == captures[0]->get_name())
{
continue;
}

throw std::runtime_error(
schema_file_path + ": error: the schema rule '" + rule_name
+ "' has a regex pattern containing capture groups.\n"
);
}
}

Expand Down
90 changes: 85 additions & 5 deletions components/core/tests/test-ParserWithUserSchema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@
#include <clp/FileReader.hpp>
#include <clp/ir/types.hpp>
#include <clp/LogSurgeonReader.hpp>
#include <clp/streaming_archive/Constants.hpp>
#include <clp/streaming_archive/reader/Archive.hpp>
#include <clp/type_utils.hpp>
#include <clp/Utils.hpp>

#include "TestOutputCleaner.hpp"
Expand Down Expand Up @@ -85,6 +83,7 @@ auto run_clp_compress(
input_path_str.data(),
nullptr
};
spdlog::drop_all();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

spdlog::drop_all() prevents logger re-registration conflicts across test runs — makes sense.

Consider adding a brief inline comment explaining why the drop is needed (e.g., clp::clp::run registers loggers that persist across Catch2 sections), so future maintainers don't remove it thinking it's unnecessary.

🤖 Prompt for AI Agents
In `@components/core/tests/test-ParserWithUserSchema.cpp` at line 86, Add a short
inline comment explaining why spdlog::drop_all() is called here to avoid future
removal: note that clp::clp::run registers spdlog loggers which persist across
Catch2 sections and test runs, so dropping all loggers prevents re-registration
conflicts and test flakiness; place the comment on the line with
spdlog::drop_all() near its current usage in test-ParserWithUserSchema.cpp.

return clp::clp::run(static_cast<int>(argv.size() - 1), argv.data());
}
} // namespace
Expand Down Expand Up @@ -191,7 +190,7 @@ TEST_CASE("Test lexer", "[Search]") {
}
}

TEST_CASE("Error on schema rule with a single capture group", "[load_lexer]") {
TEST_CASE("Error on schema rule with a single non-header capture group", "[load_lexer]") {
auto const schema_file_path{get_test_schema_files_dir() / "single_capture_group.txt"};
ByteLexer lexer;
REQUIRE_THROWS_WITH(
Expand All @@ -202,7 +201,7 @@ TEST_CASE("Error on schema rule with a single capture group", "[load_lexer]") {
);
}

TEST_CASE("Error on schema rule with multiple capture groups", "[load_lexer]") {
TEST_CASE("Error on schema rule with multiple non-header capture groups", "[load_lexer]") {
auto const schema_file_path{get_test_schema_files_dir() / "multiple_capture_groups.txt"};
ByteLexer lexer;
REQUIRE_THROWS_WITH(
Expand All @@ -213,7 +212,7 @@ TEST_CASE("Error on schema rule with multiple capture groups", "[load_lexer]") {
);
}

TEST_CASE("Verify CLP compression fails with capture groups", "[Compression]") {
TEST_CASE("Verify CLP compression fails with non-header capture groups", "[Compression]") {
auto const log_file_path{get_test_log_dir() / "log_with_capture.txt"};
auto const schema_file_path{get_test_schema_files_dir() / "single_capture_group.txt"};
TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}};
Expand All @@ -226,3 +225,84 @@ TEST_CASE("Verify CLP compression fails with capture groups", "[Compression]") {
"groups.\n"
);
}

TEST_CASE("Succeed on header rule with no capture", "[load_lexer]") {
    // A `header` rule containing no capture group must load without throwing.
    ByteLexer byte_lexer;
    auto const schema_path = get_test_schema_files_dir() / "header_with_no_capture.txt";
    REQUIRE_NOTHROW(load_lexer_from_file(schema_path, byte_lexer));
}

TEST_CASE("Succeed on header rule with a single timestamp capture", "[load_lexer]") {
    // The allowed special case: a `header` rule whose only capture is named `timestamp`.
    ByteLexer byte_lexer;
    auto const schema_path = get_test_schema_files_dir() / "header_with_timestamp.txt";
    REQUIRE_NOTHROW(load_lexer_from_file(schema_path, byte_lexer));
}

TEST_CASE("Error on header rule with a single non-timestamp capture", "[load_lexer]") {
    // A `header` rule whose sole capture is not named `timestamp` must be rejected.
    auto const schema_path = get_test_schema_files_dir() / "header_with_int.txt";
    ByteLexer byte_lexer;
    std::string const expected_error{
            schema_path.string()
            + ":3: error: the schema rule 'header' has a regex pattern containing capture "
              "groups (found 1).\n"
    };
    REQUIRE_THROWS_WITH(load_lexer_from_file(schema_path, byte_lexer), expected_error);
}


TEST_CASE("Error on header rule with a timestamp and non-timestamp capture", "[load_lexer]") {
    // Even with a `timestamp` capture present, any additional capture invalidates the rule.
    auto const schema_path = get_test_schema_files_dir() / "header_with_timestamp_and_int.txt";
    ByteLexer byte_lexer;
    std::string const expected_error{
            schema_path.string()
            + ":3: error: the schema rule 'header' has a regex pattern containing capture "
              "groups (found 2).\n"
    };
    REQUIRE_THROWS_WITH(load_lexer_from_file(schema_path, byte_lexer), expected_error);
}

TEST_CASE("Verify CLP compression succeeds with non-capture header", "[Compression]") {
    // End-to-end: compression must succeed when the `header` rule has no capture groups.
    auto const schema_path = get_test_schema_files_dir() / "header_with_no_capture.txt";
    auto const log_path = get_test_log_dir() / "log_with_capture.txt";
    TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}};
    std::filesystem::create_directory(cTestArchiveDirectory);

    REQUIRE(0 == run_clp_compress(schema_path, cTestArchiveDirectory, log_path));
}

TEST_CASE("Verify CLP compression succeeds with timestamp capture header", "[Compression]") {
    // End-to-end: a `header` rule capturing only `timestamp` must not block compression.
    auto const schema_path = get_test_schema_files_dir() / "header_with_timestamp.txt";
    auto const log_path = get_test_log_dir() / "log_with_capture.txt";
    TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}};
    std::filesystem::create_directory(cTestArchiveDirectory);

    REQUIRE(0 == run_clp_compress(schema_path, cTestArchiveDirectory, log_path));
}

TEST_CASE("Verify CLP compression fails with non-timestamp capture header", "[Compression]") {
    // End-to-end: a `header` capture that isn't `timestamp` must abort compression.
    auto const schema_path = get_test_schema_files_dir() / "header_with_int.txt";
    auto const log_path = get_test_log_dir() / "log_with_capture.txt";
    TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}};
    std::filesystem::create_directory(cTestArchiveDirectory);

    std::string const expected_error{
            schema_path.string()
            + ": error: the schema rule 'header' has a regex pattern containing capture "
              "groups.\n"
    };
    REQUIRE_THROWS_WITH(
            run_clp_compress(schema_path, cTestArchiveDirectory, log_path),
            expected_error
    );
}

TEST_CASE("Verify CLP compression fails with multi-capture header", "[Compression]") {
    // End-to-end: a `header` rule with more than one capture must abort compression,
    // even when one of the captures is `timestamp`.
    auto const schema_path = get_test_schema_files_dir() / "header_with_timestamp_and_int.txt";
    auto const log_path = get_test_log_dir() / "log_with_capture.txt";
    TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}};
    std::filesystem::create_directory(cTestArchiveDirectory);

    std::string const expected_error{
            schema_path.string()
            + ": error: the schema rule 'header' has a regex pattern containing capture "
              "groups.\n"
    };
    REQUIRE_THROWS_WITH(
            run_clp_compress(schema_path, cTestArchiveDirectory, log_path),
            expected_error
    );
}
3 changes: 3 additions & 0 deletions components/core/tests/test_schema_files/header_with_int.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
delimiters: \r\n

header:(?<int>\d+)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
delimiters: \r\n

header:\d{2}-\d{2}-\d{4}:\d{2}:\d{2}:\d{2}\.\d{3}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
delimiters: \r\n

header:(?<timestamp>\d{2}-\d{2}-\d{4}:\d{2}:\d{2}:\d{2}\.\d{3})
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
delimiters: \r\n

header:(?<timestamp>\d{2}-\d{2}-\d{4}:\d{2}:\d{2}:\d{2}\.\d{3}) and (?<int>\d+)
Loading