Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
4931f6b
feat(log-surgeon)!: Add support for a single capture group in a schem…
davidlion Aug 28, 2025
a7a0be1
Merge remote-tracking branch 'upstream/main' into capture-support
davidlion Aug 28, 2025
61d5715
Add schema equals rule capture.
davidlion Aug 28, 2025
daf18ad
Merge remote-tracking branch 'upstream/main' into capture-support
davidlion Sep 17, 2025
20b26f9
Address some review comments.
davidlion Sep 18, 2025
b8089f1
Tweak tests.
davidlion Sep 18, 2025
98d1088
Fix wrapped token logic.
davidlion Sep 19, 2025
dc591cd
Merge remote-tracking branch 'upstream/main' into capture-support
davidlion Sep 19, 2025
dec9f6e
Fix formatting.
davidlion Sep 19, 2025
03a1f02
Remove wrapping logic by manipulating the token.
davidlion Sep 19, 2025
dee547a
Merge remote-tracking branch 'upstream/main' into capture-support
davidlion Nov 19, 2025
65541fb
Update log-surgeon.
davidlion Nov 19, 2025
56bb7a6
Update GrepCore.
davidlion Nov 19, 2025
610e76f
Update log surgeon code + tests with new version.
davidlion Nov 20, 2025
d52a8c3
Merge branch 'main' into capture-support
davidlion Nov 21, 2025
ec2c8f2
Merge remote-tracking branch 'upstream/main' into capture-support
davidlion Nov 24, 2025
578601a
Refactor writing a token to dictionaries into a private helper.
davidlion Nov 24, 2025
08d5c0c
Merge remote-tracking branch 'upstream/main' into capture-support
davidlion Nov 26, 2025
834fdb7
Add unit test; refactor test-ParserWithUserSchema.cpp.
davidlion Nov 27, 2025
ea1ad3b
Add null + empty check for token type ids.
davidlion Nov 27, 2025
8ce06ba
Merge remote-tracking branch 'upstream/main' into capture-support
davidlion Nov 27, 2025
3b36328
Fix coderabbit nits.
davidlion Nov 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion components/core/config/schemas.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,4 @@ float:\-{0,1}[0-9]+\.[0-9]+
// Dictionary variables
hex:[a-fA-F]+
hasNumber:.*\d.*
equals:.*=.*[a-zA-Z0-9].*
equals:.*=(?<var>.*[a-zA-Z0-9].*)
11 changes: 11 additions & 0 deletions components/core/src/clp/Utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <iostream>
#include <memory>
#include <set>
#include <string>

#include <boost/algorithm/string.hpp>
#include <boost/lexical_cast.hpp>
Expand Down Expand Up @@ -188,6 +189,16 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B
for (std::unique_ptr<log_surgeon::ParserAST> const& parser_ast : schema_ast->m_schema_vars) {
auto* rule = dynamic_cast<log_surgeon::SchemaVarAST*>(parser_ast.get());

// Currently, we only support at most a single capture group in each variable. If a capture
// group is present its match will be treated as the variable rather than the full match.
if (1 < rule->m_regex_ptr->get_subtree_positive_captures().size()) {
throw std::runtime_error(
schema_file_path + ":" + std::to_string(rule->m_line_num + 1)
+ ": error: the schema rule '" + rule->m_name
+ "' has a regex pattern containing > 1 capture group.\n"
);
}

if ("timestamp" == rule->m_name) {
continue;
}
Expand Down
92 changes: 69 additions & 23 deletions components/core/src/clp/streaming_archive/writer/Archive.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@

#include <sys/stat.h>

#include <cstdint>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

#include <boost/asio.hpp>
#include <boost/uuid/uuid.hpp>
#include <boost/uuid/uuid_generators.hpp>
#include <boost/uuid/uuid_io.hpp>
#include <log_surgeon/Constants.hpp>
#include <log_surgeon/LogEvent.hpp>
#include <log_surgeon/LogParser.hpp>
#include <nlohmann/json.hpp>
Expand All @@ -23,11 +26,7 @@

using clp::ir::eight_byte_encoded_variable_t;
using clp::ir::four_byte_encoded_variable_t;
using log_surgeon::LogEventView;
using std::list;
using std::make_unique;
using std::string;
using std::unordered_set;
using std::vector;

namespace clp::streaming_archive::writer {
Expand Down Expand Up @@ -315,13 +314,13 @@ Archive::write_msg(epochtime_t timestamp, string const& message, size_t num_unco
update_segment_indices(logtype_id, var_ids);
}

void Archive::write_msg_using_schema(LogEventView const& log_view) {
void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) {
epochtime_t timestamp = 0;
TimestampPattern* timestamp_pattern = nullptr;
auto const& log_output_buffer = log_view.get_log_output_buffer();
if (log_output_buffer->has_timestamp()) {
size_t start;
size_t end;
size_t start{};
size_t end{};
timestamp_pattern = (TimestampPattern*)TimestampPattern::search_known_ts_patterns(
log_output_buffer->get_mutable_token(0).to_string(),
timestamp,
Expand Down Expand Up @@ -360,7 +359,7 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
if (timestamp_pattern == nullptr) {
start_pos = log_output_buffer->get_token(1).m_start_pos;
}
uint32_t end_pos = log_output_buffer->get_token(log_output_buffer->pos() - 1).m_end_pos;
uint32_t const end_pos = log_output_buffer->get_token(log_output_buffer->pos() - 1).m_end_pos;
if (start_pos <= end_pos) {
num_uncompressed_bytes = end_pos - start_pos;
} else {
Expand All @@ -369,7 +368,7 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
}
for (uint32_t i = 1; i < log_output_buffer->pos(); i++) {
log_surgeon::Token& token = log_output_buffer->get_mutable_token(i);
int token_type = token.m_type_ids_ptr->at(0);
auto const token_type{token.m_type_ids_ptr->at(0)};
if (log_output_buffer->has_delimiters() && (timestamp_pattern != nullptr || i > 1)
&& token_type != static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString)
&& token_type != static_cast<int>(log_surgeon::SymbolId::TokenNewline))
Expand All @@ -388,13 +387,13 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
break;
}
case static_cast<int>(log_surgeon::SymbolId::TokenInt): {
encoded_variable_t encoded_var;
encoded_variable_t encoded_var{};
if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var(
token.to_string(),
encoded_var
))
{
variable_dictionary_id_t id;
variable_dictionary_id_t id{};
m_var_dict.add_entry(token.to_string(), id);
encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
m_logtype_dict_entry.add_dictionary_var();
Expand All @@ -405,13 +404,13 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
break;
}
case static_cast<int>(log_surgeon::SymbolId::TokenFloat): {
encoded_variable_t encoded_var;
encoded_variable_t encoded_var{};
if (!EncodedVariableInterpreter::convert_string_to_representable_float_var(
token.to_string(),
encoded_var
))
{
variable_dictionary_id_t id;
variable_dictionary_id_t id{};
m_var_dict.add_entry(token.to_string(), id);
encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
m_logtype_dict_entry.add_dictionary_var();
Expand All @@ -422,21 +421,68 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
break;
}
default: {
// Variable string looks like a dictionary variable, so encode it as so
encoded_variable_t encoded_var;
variable_dictionary_id_t id;
m_var_dict.add_entry(token.to_string(), id);
encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
m_var_ids.push_back(id);
auto const& lexer{log_view.get_log_parser().m_lexer};
auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)};

// If the variable token contains capture groups, we break the token up by storing
// each capture as a variable and any substrings surrounding the capture as part of
// the logtype. If there are no capture groups the entire variable token is stored
// as a variable.
if (false == capture_ids.has_value()) {
variable_dictionary_id_t id{};
m_var_dict.add_entry(token.to_string(), id);
m_var_ids.push_back(id);
m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
m_logtype_dict_entry.add_dictionary_var();

break;
}

auto const register_ids{lexer.get_reg_ids_from_capture_id(capture_ids.value().at(0))};
if (false == register_ids.has_value()) {
throw(std::runtime_error(
"No register IDs found for variable's capture group. Full token: "
+ token.to_string()
));
}
auto const [start_reg_id, end_reg_id]{register_ids.value()};
auto const capture_start{token.get_reversed_reg_positions(start_reg_id).front()};
auto const capture_end{token.get_reversed_reg_positions(end_reg_id).back()};
auto const token_view{token.to_string_view()};
size_t token_pos{0};

auto const before_capture{token_view.substr(token_pos, capture_start)};
m_logtype_dict_entry
.add_constant(before_capture, 0, before_capture.length());
token_pos += before_capture.length();

// If a capture has repetition we store all instances as a single variable.
auto const capture_len = [&]() -> size_t {
if (capture_start <= capture_end) {
return capture_end - capture_start;
}
return token_view.length() - capture_start + capture_end;
}();
auto const capture{token_view.substr(token_pos, capture_len)};

variable_dictionary_id_t id{};
m_var_dict.add_entry(capture, id);
m_var_ids.push_back(id);
m_encoded_vars.push_back(
EncodedVariableInterpreter::encode_var_dict_id(id)
);
m_logtype_dict_entry.add_dictionary_var();
m_encoded_vars.push_back(encoded_var);
token_pos += capture.length();

m_logtype_dict_entry
.add_constant(token_view, token_pos, token_view.length() - token_pos);

break;
}
}
}
if (!m_logtype_dict_entry.get_value().empty()) {
logtype_dictionary_id_t logtype_id;
logtype_dictionary_id_t logtype_id{};
m_logtype_dict.add_entry(m_logtype_dict_entry, logtype_id);
m_file->write_encoded_msg(
timestamp,
Expand Down
23 changes: 23 additions & 0 deletions components/core/tests/test-ParserWithUserSchema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,3 +179,26 @@ TEST_CASE("Test lexer", "[Search]") {
token = opt_token.value();
}
}

// Loads a schema file whose single rule ("capture") contains exactly one named
// capture group, then verifies the lexer maps the rule's ID to that capture and
// resolves the capture's ID back to its symbol name ("group").
TEST_CASE("Test schema with single capture group", "[load_lexer]") {
    std::string const schema_path{"../tests/test_schema_files/single_capture_group.txt"};
    ByteLexer lexer;
    load_lexer_from_file(schema_path, lexer);

    auto const var_rule_id{lexer.m_symbol_id.at("capture")};
    auto const opt_capture_ids{lexer.get_capture_ids_from_rule_id(var_rule_id)};
    REQUIRE(opt_capture_ids.has_value());
    REQUIRE(1 == opt_capture_ids->size());
    REQUIRE("group" == lexer.m_id_symbol.at(opt_capture_ids->at(0)));
}

// Verifies that loading a schema whose rule declares more than one capture
// group is rejected, and that the thrown error message pinpoints the offending
// file, line, and rule name.
TEST_CASE("Test error for schema rule with multiple capture groups", "[load_lexer]") {
    std::string const schema_path{"../tests/test_schema_files/multiple_capture_groups.txt"};
    std::string const expected_error{
            schema_path
            + ":1: error: the schema rule 'multicapture' has a regex pattern containing > "
              "1 capture group.\n"
    };
    ByteLexer lexer;
    REQUIRE_THROWS_WITH(load_lexer_from_file(schema_path, lexer), expected_error);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
multicapture:text(?<group0>var0)text(?<group1>var1)text
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
capture:text(?<group>var)text
Loading