Skip to content
Open
10 changes: 5 additions & 5 deletions components/core/config/schemas.txt
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
// Delimiters
delimiters: \t\r\n!"#$%&'()*,:;<>?@[]^_`{}|~

// Timestamps (using the `timestamp` keyword)
// Headers (using the `timestamp` capture keyword)
// E.g. 2015-01-31 15:50:45,392
// E.g. 2015-01-31 15:50:45.392
// E.g. 2015-01-31 15:50:45
timestamp:\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}(\.\d{3}){0,1}
header:(?<timestamp>\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d{3}){0,1})
// E.g. [20150131-15:50:45]
timestamp:\[\d{8}\-\d{2}:\d{2}:\d{2}\]
header:(?<timestamp>\[\d{8}-\d{2}:\d{2}:\d{2}\])

// Specially-encoded variables (using the `int` and `float` keywords)
int:\-{0,1}[0-9]+
float:\-{0,1}[0-9]+\.[0-9]+
int:-?\d+
float:-?\d+\.\d+

// Dictionary variables
hex:[a-fA-F]+
Expand Down
79 changes: 14 additions & 65 deletions components/core/src/clp/Utils.cpp
Original file line number Diff line number Diff line change
@@ -1,31 +1,26 @@
#include "Utils.hpp"

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>

#include <algorithm>
#include <iostream>
#include <memory>
#include <set>
#include <string>

#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
#include <log_surgeon/Constants.hpp>
#include <log_surgeon/SchemaParser.hpp>
#include <spdlog/spdlog.h>
#include <string_utils/string_utils.hpp>

#include "spdlog_with_specializations.hpp"

using std::list;
namespace clp {
using log_surgeon::finite_automata::ByteNfaState;
using log_surgeon::finite_automata::RegexASTLiteral;
using std::make_unique;
using std::string;
using std::unique_ptr;
using std::vector;

namespace clp {
ErrorCode create_directory(string const& path, mode_t mode, bool exist_ok) {
int retval = mkdir(path.c_str(), mode);
if (0 != retval) {
Expand Down Expand Up @@ -135,9 +130,8 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B
lexer.m_symbol_id[log_surgeon::cTokenEnd] = static_cast<int>(log_surgeon::SymbolId::TokenEnd);
lexer.m_symbol_id[log_surgeon::cTokenUncaughtString]
= static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString);
// cTokenInt, cTokenFloat, cTokenFirstTimestamp, and cTokenNewlineTimestamp each have unknown
// rule(s) until specified by the user so can't be explicitly added and are done by looping over
// schema_vars (user schema)
// cTokenInt, cTokenFloat, and cTokenHeader each have unknown rule(s) until specified by the
// user so can't be explicitly added and are done by looping over schema_vars (user schema)
lexer.m_symbol_id[log_surgeon::cTokenInt] = static_cast<int>(log_surgeon::SymbolId::TokenInt);
lexer.m_symbol_id[log_surgeon::cTokenFloat]
= static_cast<int>(log_surgeon::SymbolId::TokenFloat);
Expand All @@ -159,15 +153,7 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B
lexer.m_id_symbol[static_cast<int>(log_surgeon::SymbolId::TokenNewline)]
= log_surgeon::cTokenNewline;

lexer.add_rule(
lexer.m_symbol_id["newLine"],
std::move(
std::make_unique<log_surgeon::finite_automata::RegexASTLiteral<
log_surgeon::finite_automata::ByteNfaState
>>(log_surgeon::finite_automata::
RegexASTLiteral<log_surgeon::finite_automata::ByteNfaState>('\n'))
)
);
lexer.add_rule(lexer.m_symbol_id["newLine"], make_unique<RegexASTLiteral<ByteNfaState>>('\n'));

for (auto const& delimiters_ast : schema_ast->m_delimiters) {
auto* delimiters_ptr = dynamic_cast<log_surgeon::DelimiterStringAST*>(delimiters_ast.get());
Expand All @@ -185,7 +171,12 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B
auto* rule = dynamic_cast<log_surgeon::SchemaVarAST*>(parser_ast.get());

// Capture groups are temporarily disabled, until NFA intersection supports search.
auto const num_captures{rule->m_regex_ptr->get_subtree_positive_captures().size()};
auto const& captures{rule->m_regex_ptr->get_subtree_positive_captures()};
auto const num_captures{captures.size()};
if ("header" == rule->m_name && 1 == num_captures && "timestamp" == captures[0]->get_name())
{
continue;
}
if (0 < num_captures) {
throw std::runtime_error(
schema_file_path + ":" + std::to_string(rule->m_line_num + 1)
Expand All @@ -195,55 +186,13 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B
);
}

if ("timestamp" == rule->m_name) {
continue;
}
// transform '.' from any-character into any non-delimiter character
rule->m_regex_ptr->remove_delimiters_from_wildcard(delimiters);

if (lexer.m_symbol_id.find(rule->m_name) == lexer.m_symbol_id.end()) {
lexer.m_symbol_id[rule->m_name] = lexer.m_symbol_id.size();
lexer.m_id_symbol[lexer.m_symbol_id[rule->m_name]] = rule->m_name;
}

// transform '.' from any-character into any non-delimiter character
rule->m_regex_ptr->remove_delimiters_from_wildcard(delimiters);

std::array<bool, log_surgeon::cSizeOfUnicode> is_possible_input{};
rule->m_regex_ptr->set_possible_inputs_to_true(is_possible_input);
bool contains_delimiter = false;
uint32_t delimiter_name;
for (uint32_t delimiter : delimiters) {
if (is_possible_input[delimiter]) {
contains_delimiter = true;
delimiter_name = delimiter;
break;
}
}

if (contains_delimiter) {
FileReader schema_reader{schema_ast->m_file_path};
// more detailed debugging based on looking at the file
string line;
for (uint32_t i = 0; i <= rule->m_line_num; i++) {
schema_reader.read_to_delimiter('\n', false, false, line);
}
int colon_pos = 0;
for (char i : line) {
colon_pos++;
if (i == ':') {
break;
}
}
string indent(10, ' ');
string spaces(colon_pos, ' ');
string arrows(line.size() - colon_pos, '^');

throw std::runtime_error(
schema_file_path + ":" + std::to_string(rule->m_line_num + 1) + ": error: '"
+ rule->m_name + "' has regex pattern which contains delimiter '"
+ char(delimiter_name) + "'.\n" + indent + line + "\n" + indent + spaces
+ arrows + "\n"
);
}
lexer.add_rule(lexer.m_symbol_id[rule->m_name], std::move(rule->m_regex_ptr));
}
lexer.generate();
Expand Down
24 changes: 19 additions & 5 deletions components/core/src/clp/clp/run.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,12 +64,26 @@ int run(int argc, char const* argv[]) {
// Capture groups are temporarily disabled, until NFA intersection supports search.
auto const& lexer{reader_parser->get_log_parser().m_lexer};
for (auto const& [rule_id, rule_name] : lexer.m_id_symbol) {
if (lexer.get_captures_from_rule_id(rule_id).has_value()) {
throw std::runtime_error(
schema_file_path + ": error: the schema rule '" + rule_name
+ "' has a regex pattern containing capture groups.\n"
);
auto optional_captures{lexer.get_captures_from_rule_id(rule_id)};
if (false == optional_captures.has_value()) {
continue;
}

auto const& captures{optional_captures.value()};
if (captures.empty()) {
continue;
}

if ("header" == rule_name && 1 == captures.size()
&& "timestamp" == captures[0]->get_name())
{
continue;
}

throw std::runtime_error(
schema_file_path + ": error: the schema rule '" + rule_name
+ "' has a regex pattern containing capture groups.\n"
);
}
}

Expand Down
90 changes: 85 additions & 5 deletions components/core/tests/test-ParserWithUserSchema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@
#include <clp/FileReader.hpp>
#include <clp/ir/types.hpp>
#include <clp/LogSurgeonReader.hpp>
#include <clp/streaming_archive/Constants.hpp>
#include <clp/streaming_archive/reader/Archive.hpp>
#include <clp/type_utils.hpp>
#include <clp/Utils.hpp>

#include "TestOutputCleaner.hpp"
Expand Down Expand Up @@ -85,6 +83,7 @@ auto run_clp_compress(
input_path_str.data(),
nullptr
};
spdlog::drop_all();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧹 Nitpick | 🔵 Trivial

spdlog::drop_all() prevents logger re-registration conflicts across test runs — makes sense.

Consider adding a brief inline comment explaining why the drop is needed (e.g., clp::clp::run registers loggers that persist across Catch2 sections), so future maintainers don't remove it thinking it's unnecessary.

🤖 Prompt for AI Agents
In `@components/core/tests/test-ParserWithUserSchema.cpp` at line 86, Add a short
inline comment explaining why spdlog::drop_all() is called here to avoid future
removal: note that clp::clp::run registers spdlog loggers which persist across
Catch2 sections and test runs, so dropping all loggers prevents re-registration
conflicts and test flakiness; place the comment on the line with
spdlog::drop_all() near its current usage in test-ParserWithUserSchema.cpp.

return clp::clp::run(static_cast<int>(argv.size() - 1), argv.data());
}
} // namespace
Expand Down Expand Up @@ -191,7 +190,7 @@ TEST_CASE("Test lexer", "[Search]") {
}
}

TEST_CASE("Error on schema rule with a single capture group", "[load_lexer]") {
TEST_CASE("Error on schema rule with a single non-header capture group", "[load_lexer]") {
auto const schema_file_path{get_test_schema_files_dir() / "single_capture_group.txt"};
ByteLexer lexer;
REQUIRE_THROWS_WITH(
Expand All @@ -202,7 +201,7 @@ TEST_CASE("Error on schema rule with a single capture group", "[load_lexer]") {
);
}

TEST_CASE("Error on schema rule with multiple capture groups", "[load_lexer]") {
TEST_CASE("Error on schema rule with multiple non-header capture groups", "[load_lexer]") {
auto const schema_file_path{get_test_schema_files_dir() / "multiple_capture_groups.txt"};
ByteLexer lexer;
REQUIRE_THROWS_WITH(
Expand All @@ -213,7 +212,7 @@ TEST_CASE("Error on schema rule with multiple capture groups", "[load_lexer]") {
);
}

TEST_CASE("Verify CLP compression fails with capture groups", "[Compression]") {
TEST_CASE("Verify CLP compression fails with non-header capture groups", "[Compression]") {
auto const log_file_path{get_test_log_dir() / "log_with_capture.txt"};
auto const schema_file_path{get_test_schema_files_dir() / "single_capture_group.txt"};
TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}};
Expand All @@ -226,3 +225,84 @@ TEST_CASE("Verify CLP compression fails with capture groups", "[Compression]") {
"groups.\n"
);
}

TEST_CASE("Succeed on header rule with no capture", "[load_lexer]") {
    // A `header` rule containing no capture group must load without throwing.
    ByteLexer byte_lexer;
    auto const schema_path = get_test_schema_files_dir() / "header_with_no_capture.txt";
    REQUIRE_NOTHROW(load_lexer_from_file(schema_path, byte_lexer));
}

TEST_CASE("Succeed on header rule with a single timestamp capture", "[load_lexer]") {
    // The allowed special case: a `header` rule whose only capture is named `timestamp`.
    ByteLexer byte_lexer;
    auto const schema_path = get_test_schema_files_dir() / "header_with_timestamp.txt";
    REQUIRE_NOTHROW(load_lexer_from_file(schema_path, byte_lexer));
}

TEST_CASE("Error on header rule with a single non-timestamp capture", "[load_lexer]") {
    // A `header` rule whose sole capture is not named `timestamp` must be rejected.
    auto const schema_path = get_test_schema_files_dir() / "header_with_int.txt";
    ByteLexer byte_lexer;
    std::string const expected_error{
            schema_path.string()
            + ":3: error: the schema rule 'header' has a regex pattern containing capture "
              "groups (found 1).\n"
    };
    REQUIRE_THROWS_WITH(load_lexer_from_file(schema_path, byte_lexer), expected_error);
}


TEST_CASE("Error on header rule with a timestamp and non-timestamp capture", "[load_lexer]") {
    // Even with a `timestamp` capture present, any additional capture invalidates the rule.
    auto const schema_path = get_test_schema_files_dir() / "header_with_timestamp_and_int.txt";
    ByteLexer byte_lexer;
    std::string const expected_error{
            schema_path.string()
            + ":3: error: the schema rule 'header' has a regex pattern containing capture "
              "groups (found 2).\n"
    };
    REQUIRE_THROWS_WITH(load_lexer_from_file(schema_path, byte_lexer), expected_error);
}

TEST_CASE("Verify CLP compression succeeds with non-capture header", "[Compression]") {
    // End-to-end: compression must succeed when the `header` rule has no capture groups.
    auto const schema_path = get_test_schema_files_dir() / "header_with_no_capture.txt";
    auto const log_path = get_test_log_dir() / "log_with_capture.txt";
    TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}};
    std::filesystem::create_directory(cTestArchiveDirectory);

    REQUIRE(0 == run_clp_compress(schema_path, cTestArchiveDirectory, log_path));
}

TEST_CASE("Verify CLP compression succeeds with timestamp capture header", "[Compression]") {
    // End-to-end: a `header` rule capturing only `timestamp` must not block compression.
    auto const schema_path = get_test_schema_files_dir() / "header_with_timestamp.txt";
    auto const log_path = get_test_log_dir() / "log_with_capture.txt";
    TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}};
    std::filesystem::create_directory(cTestArchiveDirectory);

    REQUIRE(0 == run_clp_compress(schema_path, cTestArchiveDirectory, log_path));
}

TEST_CASE("Verify CLP compression fails with non-timestamp capture header", "[Compression]") {
    // End-to-end: a `header` capture that isn't `timestamp` must abort compression.
    auto const schema_path = get_test_schema_files_dir() / "header_with_int.txt";
    auto const log_path = get_test_log_dir() / "log_with_capture.txt";
    TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}};
    std::filesystem::create_directory(cTestArchiveDirectory);

    std::string const expected_error{
            schema_path.string()
            + ": error: the schema rule 'header' has a regex pattern containing capture "
              "groups.\n"
    };
    REQUIRE_THROWS_WITH(
            run_clp_compress(schema_path, cTestArchiveDirectory, log_path),
            expected_error
    );
}

TEST_CASE("Verify CLP compression fails with multi-capture header", "[Compression]") {
    // End-to-end: a `header` rule with more than one capture must abort compression,
    // even when one of the captures is `timestamp`.
    auto const schema_path = get_test_schema_files_dir() / "header_with_timestamp_and_int.txt";
    auto const log_path = get_test_log_dir() / "log_with_capture.txt";
    TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}};
    std::filesystem::create_directory(cTestArchiveDirectory);

    std::string const expected_error{
            schema_path.string()
            + ": error: the schema rule 'header' has a regex pattern containing capture "
              "groups.\n"
    };
    REQUIRE_THROWS_WITH(
            run_clp_compress(schema_path, cTestArchiveDirectory, log_path),
            expected_error
    );
}
3 changes: 3 additions & 0 deletions components/core/tests/test_schema_files/header_with_int.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
delimiters: \r\n

header:(?<int>\d+)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
delimiters: \r\n

header:\d{2}-\d{2}-\d{4}:\d{2}:\d{2}:\d{2}\.\d{3}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
delimiters: \r\n

header:(?<timestamp>\d{2}-\d{2}-\d{4}:\d{2}:\d{2}:\d{2}\.\d{3})
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
delimiters: \r\n

header:(?<timestamp>\d{2}-\d{2}-\d{4}:\d{2}:\d{2}:\d{2}\.\d{3}) and (?<int>\d+)
Loading