From 4931f6b17cf863e7cfdf59e38df28d48dc92fed2 Mon Sep 17 00:00:00 2001
From: davidlion <david.lion@yscope.com>
Date: Thu, 28 Aug 2025 11:20:07 -0400
Subject: [PATCH 01/14] feat(log-surgeon)!: Add support for a single capture
 group in a schema rule to have parity with the heuristic parser.

---
 components/core/src/clp/Utils.cpp             | 11 +++
 .../clp/streaming_archive/writer/Archive.cpp  | 92 ++++++++++++++-----
 .../core/tests/test-ParserWithUserSchema.cpp  | 23 +++++
 .../multiple_capture_groups.txt               |  1 +
 .../single_capture_group.txt                  |  1 +
 5 files changed, 105 insertions(+), 23 deletions(-)
 create mode 100644 components/core/tests/test_schema_files/multiple_capture_groups.txt
 create mode 100644 components/core/tests/test_schema_files/single_capture_group.txt
diff --git a/components/core/src/clp/Utils.cpp b/components/core/src/clp/Utils.cpp
index f52cb87cdf..8fcf17ff26 100644
--- a/components/core/src/clp/Utils.cpp
+++ b/components/core/src/clp/Utils.cpp
@@ -8,6 +8,7 @@
 #include <iostream>
 #include <memory>
 #include <set>
+#include <string>
 
 #include <boost/algorithm/string.hpp>
 #include <boost/lexical_cast.hpp>
@@ -188,6 +189,16 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B
     for (std::unique_ptr<log_surgeon::ParserAST> const& parser_ast : schema_ast->m_schema_vars) {
         auto* rule = dynamic_cast<log_surgeon::SchemaVarAST*>(parser_ast.get());
 
+        // Currently, we only support at most a single capture group in each variable. If a capture
+        // group is present its match will be treated as the variable rather than the full match.
+        if (1 < rule->m_regex_ptr->get_subtree_positive_captures().size()) {
+            throw std::runtime_error(
+                    schema_file_path + ":" + std::to_string(rule->m_line_num + 1)
+                    + ": error: the schema rule '" + rule->m_name
+                    + "' has a regex pattern containing > 1 capture group.\n"
+            );
+        }
+
         if ("timestamp" == rule->m_name) {
             continue;
         }
diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp
index a5fe8a29f9..b1edd0c9ce 100644
--- a/components/core/src/clp/streaming_archive/writer/Archive.cpp
+++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp
@@ -2,14 +2,17 @@
 
 #include <sys/stat.h>
 
+#include <cstdint>
 #include <filesystem>
-#include <fstream>
-#include <iostream>
+#include <stdexcept>
+#include <string>
+#include <vector>
 
 #include <boost/asio.hpp>
 #include <boost/uuid/uuid.hpp>
 #include <boost/uuid/uuid_generators.hpp>
 #include <boost/uuid/uuid_io.hpp>
+#include <log_surgeon/Constants.hpp>
 #include <log_surgeon/LogEvent.hpp>
 #include <log_surgeon/LogParser.hpp>
 #include <nlohmann/json.hpp>
@@ -23,11 +26,7 @@
 
 using clp::ir::eight_byte_encoded_variable_t;
 using clp::ir::four_byte_encoded_variable_t;
-using log_surgeon::LogEventView;
-using std::list;
-using std::make_unique;
 using std::string;
-using std::unordered_set;
 using std::vector;
 
 namespace clp::streaming_archive::writer {
@@ -315,13 +314,13 @@ Archive::write_msg(epochtime_t timestamp, string const& message, size_t num_unco
     update_segment_indices(logtype_id, var_ids);
 }
 
-void Archive::write_msg_using_schema(LogEventView const& log_view) {
+void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) {
     epochtime_t timestamp = 0;
     TimestampPattern* timestamp_pattern = nullptr;
     auto const& log_output_buffer = log_view.get_log_output_buffer();
     if (log_output_buffer->has_timestamp()) {
-        size_t start;
-        size_t end;
+        size_t start{};
+        size_t end{};
         timestamp_pattern = (TimestampPattern*)TimestampPattern::search_known_ts_patterns(
                 log_output_buffer->get_mutable_token(0).to_string(),
                 timestamp,
@@ -360,7 +359,7 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
     if (timestamp_pattern == nullptr) {
         start_pos = log_output_buffer->get_token(1).m_start_pos;
     }
-    uint32_t end_pos = log_output_buffer->get_token(log_output_buffer->pos() - 1).m_end_pos;
+    uint32_t const end_pos = log_output_buffer->get_token(log_output_buffer->pos() - 1).m_end_pos;
     if (start_pos <= end_pos) {
         num_uncompressed_bytes = end_pos - start_pos;
     } else {
@@ -369,7 +368,7 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
     }
     for (uint32_t i = 1; i < log_output_buffer->pos(); i++) {
         log_surgeon::Token& token = log_output_buffer->get_mutable_token(i);
-        int token_type = token.m_type_ids_ptr->at(0);
+        auto const token_type{token.m_type_ids_ptr->at(0)};
         if (log_output_buffer->has_delimiters() && (timestamp_pattern != nullptr || i > 1)
             && token_type != static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString)
             && token_type != static_cast<int>(log_surgeon::SymbolId::TokenNewline))
@@ -388,13 +387,13 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
                 break;
             }
             case static_cast<int>(log_surgeon::SymbolId::TokenInt): {
-                encoded_variable_t encoded_var;
+                encoded_variable_t encoded_var{};
                 if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var(
                             token.to_string(),
                             encoded_var
                     ))
                 {
-                    variable_dictionary_id_t id;
+                    variable_dictionary_id_t id{};
                     m_var_dict.add_entry(token.to_string(), id);
                     encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
                     m_logtype_dict_entry.add_dictionary_var();
@@ -405,13 +404,13 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
                 break;
             }
             case static_cast<int>(log_surgeon::SymbolId::TokenFloat): {
-                encoded_variable_t encoded_var;
+                encoded_variable_t encoded_var{};
                 if (!EncodedVariableInterpreter::convert_string_to_representable_float_var(
                             token.to_string(),
                             encoded_var
                     ))
                 {
-                    variable_dictionary_id_t id;
+                    variable_dictionary_id_t id{};
                     m_var_dict.add_entry(token.to_string(), id);
                     encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
                     m_logtype_dict_entry.add_dictionary_var();
@@ -422,21 +421,68 @@ void Archive::write_msg_using_schema(LogEventView const& log_view) {
                 break;
             }
             default: {
-                // Variable string looks like a dictionary variable, so encode it as so
-                encoded_variable_t encoded_var;
-                variable_dictionary_id_t id;
-                m_var_dict.add_entry(token.to_string(), id);
-                encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
-                m_var_ids.push_back(id);
+                auto const& lexer{log_view.get_log_parser().m_lexer};
+                auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)};
+
+                // If the variable token contains capture groups, we break the token up by storing
+                // each capture as a variable and any substrings surrounding the capture as part of
+                // the logtype. If there are no capture groups the entire variable token is stored
+                // as a variable.
+                if (false == capture_ids.has_value()) {
+                    variable_dictionary_id_t id{};
+                    m_var_dict.add_entry(token.to_string(), id);
+                    m_var_ids.push_back(id);
+                    m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
+                    m_logtype_dict_entry.add_dictionary_var();
+
+                    break;
+                }
 
+                auto const register_ids{lexer.get_reg_ids_from_capture_id(capture_ids.value().at(0))};
+                if (false == register_ids.has_value()) {
+                    throw(std::runtime_error(
+                            "No register IDs found for variable's capture group. Full token: "
+                            + token.to_string()
+                    ));
+                }
+                auto const [start_reg_id, end_reg_id]{register_ids.value()};
+                auto const capture_start{token.get_reversed_reg_positions(start_reg_id).front()};
+                auto const capture_end{token.get_reversed_reg_positions(end_reg_id).back()};
+                auto const token_view{token.to_string_view()};
+                size_t token_pos{0};
+
+                auto const before_capture{token_view.substr(token_pos, capture_start)};
+                m_logtype_dict_entry
+                        .add_constant(before_capture, 0, before_capture.length());
+                token_pos += before_capture.length();
+
+                // If a capture has repetition we store all instances as a single variable.
+                auto const capture_len = [&]() -> size_t {
+                    if (capture_start <= capture_end) {
+                        return capture_end - capture_start;
+                    }
+                    return token_view.length() - capture_start + capture_end;
+                }();
+                auto const capture{token_view.substr(token_pos, capture_len)};
+
+                variable_dictionary_id_t id{};
+                m_var_dict.add_entry(capture, id);
+                m_var_ids.push_back(id);
+                m_encoded_vars.push_back(
+                        EncodedVariableInterpreter::encode_var_dict_id(id)
+                );
                 m_logtype_dict_entry.add_dictionary_var();
-                m_encoded_vars.push_back(encoded_var);
+                token_pos += capture.length();
+
+                m_logtype_dict_entry
+                        .add_constant(token_view, token_pos, token_view.length() - token_pos);
+
                 break;
             }
         }
     }
     if (!m_logtype_dict_entry.get_value().empty()) {
-        logtype_dictionary_id_t logtype_id;
+        logtype_dictionary_id_t logtype_id{};
         m_logtype_dict.add_entry(m_logtype_dict_entry, logtype_id);
         m_file->write_encoded_msg(
                 timestamp,
diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp
index 61ca7ffdbe..9cc99efb1f 100644
--- a/components/core/tests/test-ParserWithUserSchema.cpp
+++ b/components/core/tests/test-ParserWithUserSchema.cpp
@@ -179,3 +179,26 @@ TEST_CASE("Test lexer", "[Search]") {
         token = opt_token.value();
     }
 }
+
+TEST_CASE("Test schema with single capture group", "[load_lexer]") {
+    std::string const schema_path{"../tests/test_schema_files/single_capture_group.txt"};
+    ByteLexer lexer;
+    load_lexer_from_file(schema_path, lexer);
+
+    auto const rule_id{lexer.m_symbol_id.at("capture")};
+    auto const capture_ids{lexer.get_capture_ids_from_rule_id(rule_id)};
+    REQUIRE(capture_ids.has_value());
+    REQUIRE(1 == capture_ids.value().size());
+    REQUIRE("group" == lexer.m_id_symbol.at(capture_ids->at(0)));
+}
+
+TEST_CASE("Test error for schema rule with multiple capture groups", "[load_lexer]") {
+    std::string const schema_path{"../tests/test_schema_files/multiple_capture_groups.txt"};
+    ByteLexer lexer;
+    REQUIRE_THROWS_WITH(
+            load_lexer_from_file(schema_path, lexer),
+            schema_path
+                    + ":1: error: the schema rule 'multicapture' has a regex pattern containing > "
+                      "1 capture group.\n"
+    );
+}
diff --git a/components/core/tests/test_schema_files/multiple_capture_groups.txt b/components/core/tests/test_schema_files/multiple_capture_groups.txt
new file mode 100644
index 0000000000..e55c56a9be
--- /dev/null
+++ b/components/core/tests/test_schema_files/multiple_capture_groups.txt
@@ -0,0 +1 @@
+multicapture:text(?<group0>var0)text(?<group1>var1)text
diff --git a/components/core/tests/test_schema_files/single_capture_group.txt b/components/core/tests/test_schema_files/single_capture_group.txt
new file mode 100644
index 0000000000..6984421a10
--- /dev/null
+++ b/components/core/tests/test_schema_files/single_capture_group.txt
@@ -0,0 +1 @@
+capture:text(?<group>var)text

From 61d5715f41dc26fee19c1c060804de922a12a20f Mon Sep 17 00:00:00 2001
From: davidlion <david.lion@yscope.com>
Date: Thu, 28 Aug 2025 11:43:28 -0400
Subject: [PATCH 02/14] Add schema equals rule capture.

---
 components/core/config/schemas.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/components/core/config/schemas.txt b/components/core/config/schemas.txt
index e0b777859d..9c09fb5d45 100644
--- a/components/core/config/schemas.txt
+++ b/components/core/config/schemas.txt
@@ -16,4 +16,4 @@ float:\-{0,1}[0-9]+\.[0-9]+
 // Dictionary variables
 hex:[a-fA-F]+
 hasNumber:.*\d.*
-equals:.*=.*[a-zA-Z0-9].*
+equals:.*=(?<var>.*[a-zA-Z0-9].*)

From 20b26f9956fe37a30309583c3b77755ee4ec5a75 Mon Sep 17 00:00:00 2001
From: davidlion <david.lion@yscope.com>
Date: Thu, 18 Sep 2025 13:22:10 -0400
Subject: [PATCH 03/14] Address some review comments.

---
 .../clp/streaming_archive/writer/Archive.cpp  | 27 +++++++++----------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp
index b1edd0c9ce..f14c6df87a 100644
--- a/components/core/src/clp/streaming_archive/writer/Archive.cpp
+++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp
@@ -421,13 +421,13 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view)
                 break;
             }
             default: {
-                auto const& lexer{log_view.get_log_parser().m_lexer};
-                auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)};
-
+                // If there are no capture groups the entire variable token is stored as a variable.
                 // If the variable token contains capture groups, we break the token up by storing
                 // each capture as a variable and any substrings surrounding the capture as part of
-                // the logtype. If there are no capture groups the entire variable token is stored
-                // as a variable.
+                // the logtype.
+
+                auto const& lexer{log_view.get_log_parser().m_lexer};
+                auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)};
                 if (false == capture_ids.has_value()) {
                     variable_dictionary_id_t id{};
                     m_var_dict.add_entry(token.to_string(), id);
@@ -438,7 +438,9 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view)
                     break;
                 }
 
-                auto const register_ids{lexer.get_reg_ids_from_capture_id(capture_ids.value().at(0))};
+                auto const register_ids{
+                        lexer.get_reg_ids_from_capture_id(capture_ids.value().at(0))
+                };
                 if (false == register_ids.has_value()) {
                     throw(std::runtime_error(
                             "No register IDs found for variable's capture group. Full token: "
@@ -446,14 +448,13 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view)
                     ));
                 }
                 auto const [start_reg_id, end_reg_id]{register_ids.value()};
-                auto const capture_start{token.get_reversed_reg_positions(start_reg_id).front()};
-                auto const capture_end{token.get_reversed_reg_positions(end_reg_id).back()};
+                auto const capture_start{token.get_reversed_reg_positions(start_reg_id).back()};
+                auto const capture_end{token.get_reversed_reg_positions(end_reg_id).front()};
                 auto const token_view{token.to_string_view()};
                 size_t token_pos{0};
 
                 auto const before_capture{token_view.substr(token_pos, capture_start)};
-                m_logtype_dict_entry
-                        .add_constant(before_capture, 0, before_capture.length());
+                m_logtype_dict_entry.add_constant(before_capture, 0, before_capture.length());
                 token_pos += before_capture.length();
 
                 // If a capture has repetition we store all instances as a single variable.
@@ -468,9 +469,7 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view)
                 variable_dictionary_id_t id{};
                 m_var_dict.add_entry(capture, id);
                 m_var_ids.push_back(id);
-                m_encoded_vars.push_back(
-                        EncodedVariableInterpreter::encode_var_dict_id(id)
-                );
+                m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
                 m_logtype_dict_entry.add_dictionary_var();
                 token_pos += capture.length();
 
@@ -481,7 +480,7 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view)
             }
         }
     }
-    if (!m_logtype_dict_entry.get_value().empty()) {
+    if (false == m_logtype_dict_entry.get_value().empty()) {
         logtype_dictionary_id_t logtype_id{};
         m_logtype_dict.add_entry(m_logtype_dict_entry, logtype_id);
         m_file->write_encoded_msg(

From b8089f107617b2627bc272d5f4672119c3890d1f Mon Sep 17 00:00:00 2001
From: davidlion <david.lion@yscope.com>
Date: Thu, 18 Sep 2025 13:22:54 -0400
Subject: [PATCH 04/14] Tweak tests.

---
 components/core/src/clp/Utils.cpp                      | 10 ++++++----
 components/core/tests/test-ParserWithUserSchema.cpp    |  6 +++---
 .../test_schema_files/multiple_capture_groups.txt      |  2 ++
 .../tests/test_schema_files/single_capture_group.txt   |  2 ++
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/components/core/src/clp/Utils.cpp b/components/core/src/clp/Utils.cpp
index 8fcf17ff26..8f22bda569 100644
--- a/components/core/src/clp/Utils.cpp
+++ b/components/core/src/clp/Utils.cpp
@@ -191,13 +191,15 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B
 
         // Currently, we only support at most a single capture group in each variable. If a capture
         // group is present its match will be treated as the variable rather than the full match.
-        if (1 < rule->m_regex_ptr->get_subtree_positive_captures().size()) {
+        auto const num_captures = rule->m_regex_ptr->get_subtree_positive_captures().size();
+        if (1 < num_captures) {
             throw std::runtime_error(
                     schema_file_path + ":" + std::to_string(rule->m_line_num + 1)
                     + ": error: the schema rule '" + rule->m_name
-                    + "' has a regex pattern containing > 1 capture group.\n"
-            );
-        }
+                    + "' has a regex pattern containing > 1 capture groups (found "
+                    + std::to_string(num_captures) + ").\n"
+             );
+         }
 
         if ("timestamp" == rule->m_name) {
             continue;
diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp
index 9cc99efb1f..d2f5adfd74 100644
--- a/components/core/tests/test-ParserWithUserSchema.cpp
+++ b/components/core/tests/test-ParserWithUserSchema.cpp
@@ -188,7 +188,7 @@ TEST_CASE("Test schema with single capture group", "[load_lexer]") {
     auto const rule_id{lexer.m_symbol_id.at("capture")};
     auto const capture_ids{lexer.get_capture_ids_from_rule_id(rule_id)};
     REQUIRE(capture_ids.has_value());
-    REQUIRE(1 == capture_ids.value().size());
+    REQUIRE(1 == capture_ids->size());
     REQUIRE("group" == lexer.m_id_symbol.at(capture_ids->at(0)));
 }
 
@@ -198,7 +198,7 @@ TEST_CASE("Test error for schema rule with multiple capture groups", "[load_lexe
     REQUIRE_THROWS_WITH(
             load_lexer_from_file(schema_path, lexer),
             schema_path
-                    + ":1: error: the schema rule 'multicapture' has a regex pattern containing > "
-                      "1 capture group.\n"
+                    + ":3: error: the schema rule 'multicapture' has a regex pattern containing > "
+                      "1 capture groups (found 2).\n"
     );
 }
diff --git a/components/core/tests/test_schema_files/multiple_capture_groups.txt b/components/core/tests/test_schema_files/multiple_capture_groups.txt
index e55c56a9be..09998bc19d 100644
--- a/components/core/tests/test_schema_files/multiple_capture_groups.txt
+++ b/components/core/tests/test_schema_files/multiple_capture_groups.txt
@@ -1 +1,3 @@
+delimiters: \r\n
+
 multicapture:text(?<group0>var0)text(?<group1>var1)text
diff --git a/components/core/tests/test_schema_files/single_capture_group.txt b/components/core/tests/test_schema_files/single_capture_group.txt
index 6984421a10..7c4bf17688 100644
--- a/components/core/tests/test_schema_files/single_capture_group.txt
+++ b/components/core/tests/test_schema_files/single_capture_group.txt
@@ -1 +1,3 @@
+delimiters: \r\n
+
 capture:text(?<group>var)text

From 98d10885ab8bf3fa92cde98473fa0ef1484013e6 Mon Sep 17 00:00:00 2001
From: davidlion <david.lion@yscope.com>
Date: Fri, 19 Sep 2025 10:01:31 -0400
Subject: [PATCH 05/14] Fix wrapped token logic.

---
 .../clp/streaming_archive/writer/Archive.cpp  | 36 ++++++++++---------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp
index f14c6df87a..041b41157d 100644
--- a/components/core/src/clp/streaming_archive/writer/Archive.cpp
+++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp
@@ -424,7 +424,8 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view)
                 // If there are no capture groups the entire variable token is stored as a variable.
                 // If the variable token contains capture groups, we break the token up by storing
                 // each capture as a variable and any substrings surrounding the capture as part of
-                // the logtype.
+                // the logtype. If a capture has repetition we store all instances as a single
+                // variable.
 
                 auto const& lexer{log_view.get_log_parser().m_lexer};
                 auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)};
@@ -450,31 +451,32 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view)
                 auto const [start_reg_id, end_reg_id]{register_ids.value()};
                 auto const capture_start{token.get_reversed_reg_positions(start_reg_id).back()};
                 auto const capture_end{token.get_reversed_reg_positions(end_reg_id).front()};
-                auto const token_view{token.to_string_view()};
-                size_t token_pos{0};
-
-                auto const before_capture{token_view.substr(token_pos, capture_start)};
-                m_logtype_dict_entry.add_constant(before_capture, 0, before_capture.length());
-                token_pos += before_capture.length();
 
-                // If a capture has repetition we store all instances as a single variable.
-                auto const capture_len = [&]() -> size_t {
-                    if (capture_start <= capture_end) {
-                        return capture_end - capture_start;
+                auto const get_wrapped_size = [&](size_t start, size_t end) -> size_t {
+                    if (start <= end) {
+                        return end - start;
                     }
-                    return token_view.length() - capture_start + capture_end;
-                }();
-                auto const capture{token_view.substr(token_pos, capture_len)};
+                    return token.m_buffer_size - start + end;
+                };
+
+                auto const token_view{token.to_string_view()};
+                auto const before_capture_size{get_wrapped_size(token.m_start_pos, capture_start)};
+                m_logtype_dict_entry.add_constant(token_view, 0, before_capture_size);
 
+                auto const capture_size{get_wrapped_size(capture_start, capture_end)};
+                auto const capture{token_view.substr(before_capture_size, capture_size)};
                 variable_dictionary_id_t id{};
                 m_var_dict.add_entry(capture, id);
                 m_var_ids.push_back(id);
                 m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
                 m_logtype_dict_entry.add_dictionary_var();
-                token_pos += capture.length();
 
-                m_logtype_dict_entry
-                        .add_constant(token_view, token_pos, token_view.length() - token_pos);
+                auto const capture_end_pos{before_capture_size + capture_size};
+                m_logtype_dict_entry.add_constant(
+                        token_view,
+                        capture_end_pos,
+                        token_view.length() - capture_end_pos
+                );
 
                 break;
             }

From dec9f6e97d9319b6ade25ee8ebcafb6cb96685e2 Mon Sep 17 00:00:00 2001
From: davidlion <david.lion@yscope.com>
Date: Fri, 19 Sep 2025 10:28:46 -0400
Subject: [PATCH 06/14] Fix formatting.

---
 components/core/src/clp/Utils.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/components/core/src/clp/Utils.cpp b/components/core/src/clp/Utils.cpp
index 8f22bda569..f8449bc4ca 100644
--- a/components/core/src/clp/Utils.cpp
+++ b/components/core/src/clp/Utils.cpp
@@ -198,8 +198,8 @@ load_lexer_from_file(std::string const& schema_file_path, log_surgeon::lexers::B
                     + ": error: the schema rule '" + rule->m_name
                     + "' has a regex pattern containing > 1 capture groups (found "
                     + std::to_string(num_captures) + ").\n"
-             );
-         }
+            );
+        }
 
         if ("timestamp" == rule->m_name) {
             continue;

From 03a1f029a7808bb2273684dce38e8d817cfa0c46 Mon Sep 17 00:00:00 2001
From: davidlion <david.lion@yscope.com>
Date: Fri, 19 Sep 2025 10:29:13 -0400
Subject: [PATCH 07/14] Remove wrapping logic by manipulating the token.

---
 .../clp/streaming_archive/writer/Archive.cpp  | 32 ++++++++-----------
 1 file changed, 13 insertions(+), 19 deletions(-)

diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp
index 041b41157d..7d125847ea 100644
--- a/components/core/src/clp/streaming_archive/writer/Archive.cpp
+++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp
@@ -451,32 +451,26 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view)
                 auto const [start_reg_id, end_reg_id]{register_ids.value()};
                 auto const capture_start{token.get_reversed_reg_positions(start_reg_id).back()};
                 auto const capture_end{token.get_reversed_reg_positions(end_reg_id).front()};
+                auto token_view{token};
+                auto const token_end_pos{token_view.m_end_pos};
 
-                auto const get_wrapped_size = [&](size_t start, size_t end) -> size_t {
-                    if (start <= end) {
-                        return end - start;
-                    }
-                    return token.m_buffer_size - start + end;
-                };
-
-                auto const token_view{token.to_string_view()};
-                auto const before_capture_size{get_wrapped_size(token.m_start_pos, capture_start)};
-                m_logtype_dict_entry.add_constant(token_view, 0, before_capture_size);
+                token_view.m_end_pos = capture_start;
+                auto const text_before_capture{token_view.to_string_view()};
+                m_logtype_dict_entry
+                        .add_constant(text_before_capture, 0, text_before_capture.size());
 
-                auto const capture_size{get_wrapped_size(capture_start, capture_end)};
-                auto const capture{token_view.substr(before_capture_size, capture_size)};
+                token_view.m_start_pos = capture_start;
+                token_view.m_end_pos = capture_end;
                 variable_dictionary_id_t id{};
-                m_var_dict.add_entry(capture, id);
+                m_var_dict.add_entry(token_view.to_string_view(), id);
                 m_var_ids.push_back(id);
                 m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
                 m_logtype_dict_entry.add_dictionary_var();
 
-                auto const capture_end_pos{before_capture_size + capture_size};
-                m_logtype_dict_entry.add_constant(
-                        token_view,
-                        capture_end_pos,
-                        token_view.length() - capture_end_pos
-                );
+                token_view.m_start_pos = capture_end;
+                token_view.m_end_pos = token_end_pos;
+                auto const text_after_capture{token_view.to_string_view()};
+                m_logtype_dict_entry.add_constant(text_after_capture, 0, text_after_capture.size());
 
                 break;
             }

From 65541fbe09a1078418a0d70deb6301d12d416f3e Mon Sep 17 00:00:00 2001
From: davidlion <david.lion@yscope.com>
Date: Tue, 18 Nov 2025 21:41:07 -0500
Subject: [PATCH 08/14] Update log-surgeon.

---
 taskfiles/deps/main.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/taskfiles/deps/main.yaml b/taskfiles/deps/main.yaml
index 927cedb2a1..a5a4785b26 100644
--- a/taskfiles/deps/main.yaml
+++ b/taskfiles/deps/main.yaml
@@ -367,8 +367,8 @@ tasks:
             - "-DCMAKE_INSTALL_MESSAGE=LAZY"
             - "-Dlog_surgeon_BUILD_TESTING=OFF"
           LIB_NAME: "log_surgeon"
-          TARBALL_SHA256: "4551ea50cd22e8423770fd66a167e1c86053b1f4957f72c582a2da93e7820210"
-          TARBALL_URL: "https://github.com/y-scope/log-surgeon/archive/840f262.tar.gz"
+          TARBALL_SHA256: "396ef8822e687b0bb8ca433a360e9e1d39a51620df89cac606db5e0f49d1b265"
+          TARBALL_URL: "https://github.com/y-scope/log-surgeon/archive/1135c2e.tar.gz"
 
   lz4:
     internal: true

From 56bb7a6526d29e10782d27af8379787e47936dc4 Mon Sep 17 00:00:00 2001
From: davidlion <david.lion@yscope.com>
Date: Tue, 18 Nov 2025 22:42:07 -0500
Subject: [PATCH 09/14] Update GrepCore.

---
 components/core/src/clp/GrepCore.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/components/core/src/clp/GrepCore.cpp b/components/core/src/clp/GrepCore.cpp
index 1a4bf499e2..b8abf5a980 100644
--- a/components/core/src/clp/GrepCore.cpp
+++ b/components/core/src/clp/GrepCore.cpp
@@ -257,9 +257,9 @@ bool GrepCore::get_bounds_of_next_potential_var(
                         return false;
                     }
                     search_token = SearchToken{token.value()};
-                    search_token.m_type_ids_set.insert(search_token.m_type_ids_ptr->at(0));
+                    search_token.m_type_ids_set.insert(search_token.get_type_ids()->at(0));
                 }
-                auto const& type = search_token.m_type_ids_ptr->at(0);
+                auto const& type = search_token.get_type_ids()->at(0);
                 if (type != static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString)
                     && type != static_cast<int>(log_surgeon::SymbolId::TokenEnd))
                 {

From 610e76f218201262773d8f2e12ac2b6703517cb1 Mon Sep 17 00:00:00 2001
From: davidlion <david.lion@yscope.com>
Date: Wed, 19 Nov 2025 23:03:12 -0500
Subject: [PATCH 10/14] Update log surgeon code + tests with new version.

---
 .../clp/streaming_archive/writer/Archive.cpp  | 109 +++++++++---------
 .../core/tests/test-ParserWithUserSchema.cpp  |  15 +--
 .../tests/test_schema_files/search_schema.txt |   2 +-
 3 files changed, 65 insertions(+), 61 deletions(-)

diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp
index 9934683f5f..ce71ee7a91 100644
--- a/components/core/src/clp/streaming_archive/writer/Archive.cpp
+++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp
@@ -22,6 +22,7 @@
 #include "../../spdlog_with_specializations.hpp"
 #include "../../Utils.hpp"
 #include "../Constants.hpp"
+#include "TimestampPattern.hpp"
 #include "utils.hpp"
 
 using clp::ir::eight_byte_encoded_variable_t;
@@ -315,14 +316,14 @@ Archive::write_msg(epochtime_t timestamp, string const& message, size_t num_unco
 }
 
 void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) {
-    epochtime_t timestamp = 0;
-    TimestampPattern* timestamp_pattern = nullptr;
-    auto const& log_output_buffer = log_view.get_log_output_buffer();
-    if (log_output_buffer->has_timestamp()) {
+    epochtime_t timestamp{0};
+    TimestampPattern const* timestamp_pattern{nullptr};
+    auto const& log_buf = log_view.get_log_output_buffer();
+    if (log_buf->has_timestamp()) {
         size_t start{};
         size_t end{};
-        timestamp_pattern = (TimestampPattern*)TimestampPattern::search_known_ts_patterns(
-                log_output_buffer->get_mutable_token(0).to_string(),
+        timestamp_pattern = TimestampPattern::search_known_ts_patterns(
+                log_buf->get_mutable_token(0).to_string(),
                 timestamp,
                 start,
                 end
@@ -330,14 +331,16 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view)
         if (nullptr == timestamp_pattern) {
             throw(std::runtime_error(
                     "Schema contains a timestamp regex that matches "
-                    + log_output_buffer->get_mutable_token(0).to_string()
+                    + log_buf->get_mutable_token(0).to_string()
                     + " which does not match any known timestamp pattern."
             ));
         }
         if (m_old_ts_pattern != timestamp_pattern) {
             change_ts_pattern(timestamp_pattern);
-            m_old_ts_pattern = timestamp_pattern;
+            m_old_ts_pattern = const_cast<TimestampPattern*>(timestamp_pattern);
         }
+    } else {
+        timestamp_pattern = nullptr;
     }
     if (get_data_size_of_dictionaries() >= m_target_data_size_of_dicts) {
         split_file_and_archive(
@@ -353,48 +356,45 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view)
     m_encoded_vars.clear();
     m_var_ids.clear();
     m_logtype_dict_entry.clear();
-    size_t num_uncompressed_bytes = 0;
+
+    size_t num_uncompressed_bytes{0};
     // Timestamp is included in the uncompressed message size
-    uint32_t start_pos = log_output_buffer->get_token(0).m_start_pos;
+    auto start_pos{log_buf->get_token(0).get_start_pos()};
     if (timestamp_pattern == nullptr) {
-        start_pos = log_output_buffer->get_token(1).m_start_pos;
+        start_pos = log_buf->get_token(1).get_start_pos();
     }
-    uint32_t const end_pos = log_output_buffer->get_token(log_output_buffer->pos() - 1).m_end_pos;
+    auto const end_pos{log_buf->get_token(log_buf->pos() - 1).get_end_pos()};
     if (start_pos <= end_pos) {
         num_uncompressed_bytes = end_pos - start_pos;
     } else {
-        num_uncompressed_bytes
-                = log_output_buffer->get_token(0).m_buffer_size - start_pos + end_pos;
+        num_uncompressed_bytes = log_buf->get_token(0).get_buffer_size() - start_pos + end_pos;
     }
-    for (uint32_t i = 1; i < log_output_buffer->pos(); i++) {
-        log_surgeon::Token& token = log_output_buffer->get_mutable_token(i);
-        auto const token_type{token.m_type_ids_ptr->at(0)};
-        if (log_output_buffer->has_delimiters() && (timestamp_pattern != nullptr || i > 1)
+    for (auto token_idx{1}; token_idx < log_buf->pos(); token_idx++) {
+        auto token_view{log_buf->get_token(token_idx)};
+        auto const token_type{token_view.get_type_ids()->at(0)};
+        if (log_buf->has_delimiters() && (timestamp_pattern != nullptr || token_idx > 1)
             && token_type != static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString)
             && token_type != static_cast<int>(log_surgeon::SymbolId::TokenNewline))
         {
-            m_logtype_dict_entry.add_constant(token.get_delimiter(), 0, 1);
-            if (token.m_start_pos == token.m_buffer_size - 1) {
-                token.m_start_pos = 0;
-            } else {
-                token.m_start_pos++;
-            }
+            m_logtype_dict_entry.add_constant(token_view.get_delimiter(), 0, 1);
+            token_view.increment_start_pos();
         }
         switch (token_type) {
             case static_cast<int>(log_surgeon::SymbolId::TokenNewline):
             case static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString): {
-                m_logtype_dict_entry.add_constant(token.to_string(), 0, token.get_length());
+                m_logtype_dict_entry
+                        .add_constant(token_view.to_string(), 0, token_view.get_length());
                 break;
             }
             case static_cast<int>(log_surgeon::SymbolId::TokenInt): {
                 encoded_variable_t encoded_var{};
                 if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var(
-                            token.to_string(),
+                            token_view.to_string(),
                             encoded_var
                     ))
                 {
                     variable_dictionary_id_t id{};
-                    m_var_dict.add_entry(token.to_string(), id);
+                    m_var_dict.add_entry(token_view.to_string(), id);
                     encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
                     m_logtype_dict_entry.add_dictionary_var();
                 } else {
@@ -406,12 +406,12 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view)
             case static_cast<int>(log_surgeon::SymbolId::TokenFloat): {
                 encoded_variable_t encoded_var{};
                 if (!EncodedVariableInterpreter::convert_string_to_representable_float_var(
-                            token.to_string(),
+                            token_view.to_string(),
                             encoded_var
                     ))
                 {
                     variable_dictionary_id_t id{};
-                    m_var_dict.add_entry(token.to_string(), id);
+                    m_var_dict.add_entry(token_view.to_string(), id);
                     encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
                     m_logtype_dict_entry.add_dictionary_var();
                 } else {
@@ -424,18 +424,17 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view)
                 // If there are no capture groups the entire variable token is stored as a variable.
                 // If the variable token contains capture groups, we break the token up by storing
                 // each capture as a variable and any substrings surrounding the capture as part of
-                // the logtype. If a capture has repetition we store all instances as a single
-                // variable.
+                // the logtype. Capture repetition currently does not work so we explicitly only
+                // store the first capture.
 
                 auto const& lexer{log_view.get_log_parser().m_lexer};
                 auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)};
                 if (false == capture_ids.has_value()) {
                     variable_dictionary_id_t id{};
-                    m_var_dict.add_entry(token.to_string(), id);
+                    m_var_dict.add_entry(token_view.to_string(), id);
                     m_var_ids.push_back(id);
                     m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
                     m_logtype_dict_entry.add_dictionary_var();
-
                     break;
                 }
 
@@ -445,33 +444,37 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view)
                 if (false == register_ids.has_value()) {
                     throw(std::runtime_error(
                             "No register IDs found for variable's capture group. Full token: "
-                            + token.to_string()
+                            + token_view.to_string()
                     ));
                 }
+
                 auto const [start_reg_id, end_reg_id]{register_ids.value()};
-                auto const capture_start{token.get_reversed_reg_positions(start_reg_id).back()};
-                auto const capture_end{token.get_reversed_reg_positions(end_reg_id).front()};
-                auto token_view{token};
-                auto const token_end_pos{token_view.m_end_pos};
+                auto const start_positions{token_view.get_reversed_reg_positions(start_reg_id)};
+                auto const end_positions{token_view.get_reversed_reg_positions(end_reg_id)};
 
-                token_view.m_end_pos = capture_start;
-                auto const text_before_capture{token_view.to_string_view()};
-                m_logtype_dict_entry
-                        .add_constant(text_before_capture, 0, text_before_capture.size());
+                if (false == start_positions.empty() && -1 < start_positions[0]
+                    && false == end_positions.empty() && -1 < end_positions[0])
+                {
+                    auto token_end{token_view.get_end_pos()};
 
-                token_view.m_start_pos = capture_start;
-                token_view.m_end_pos = capture_end;
-                variable_dictionary_id_t id{};
-                m_var_dict.add_entry(token_view.to_string_view(), id);
-                m_var_ids.push_back(id);
-                m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
-                m_logtype_dict_entry.add_dictionary_var();
+                    token_view.set_end_pos(start_positions[0]);
+                    auto const before_capture{token_view.to_string_view()};
+                    m_logtype_dict_entry.add_constant(before_capture, 0, before_capture.size());
 
-                token_view.m_start_pos = capture_end;
-                token_view.m_end_pos = token_end_pos;
-                auto const text_after_capture{token_view.to_string_view()};
-                m_logtype_dict_entry.add_constant(text_after_capture, 0, text_after_capture.size());
+                    token_view.set_start_pos(start_positions[0]);
+                    token_view.set_end_pos(end_positions[0]);
 
+                    variable_dictionary_id_t id{};
+                    m_var_dict.add_entry(token_view.to_string_view(), id);
+                    m_var_ids.push_back(id);
+                    m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
+                    m_logtype_dict_entry.add_dictionary_var();
+
+                    token_view.set_start_pos(end_positions[0]);
+                    token_view.set_end_pos(token_end);
+                    auto const after_capture{token_view.to_string_view()};
+                    m_logtype_dict_entry.add_constant(after_capture, 0, after_capture.size());
+                }
                 break;
             }
         }
diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp
index 491b4b1c79..a18ff00f28 100644
--- a/components/core/tests/test-ParserWithUserSchema.cpp
+++ b/components/core/tests/test-ParserWithUserSchema.cpp
@@ -192,10 +192,11 @@ TEST_CASE("Test lexer", "[Search]") {
     auto [error_code, opt_token] = lexer.scan(parser_input_buffer);
     REQUIRE(error_code == log_surgeon::ErrorCode::Success);
     Token token{opt_token.value()};
-    while (token.m_type_ids_ptr->at(0) != static_cast<int>(log_surgeon::SymbolId::TokenEnd)) {
+    while (token.get_type_ids()->at(0) != static_cast<int>(log_surgeon::SymbolId::TokenEnd)) {
         SPDLOG_INFO("token:" + token.to_string() + "\n");
         SPDLOG_INFO(
-                "token.m_type_ids->back():" + lexer.m_id_symbol[token.m_type_ids_ptr->back()] + "\n"
+                "token.get_type_ids()->back():" + lexer.m_id_symbol[token.get_type_ids()->back()]
+                + "\n"
         );
         auto [error_code, opt_token] = lexer.scan(parser_input_buffer);
         REQUIRE(error_code == log_surgeon::ErrorCode::Success);
@@ -204,9 +205,9 @@ TEST_CASE("Test lexer", "[Search]") {
 }
 
 TEST_CASE("Test schema with single capture group", "[load_lexer]") {
-    std::string const schema_path{"../tests/test_schema_files/single_capture_group.txt"};
+    auto const schema_file_path{get_test_schema_files_dir() / "single_capture_group.txt"};
     ByteLexer lexer;
-    load_lexer_from_file(schema_path, lexer);
+    load_lexer_from_file(schema_file_path, lexer);
 
     auto const rule_id{lexer.m_symbol_id.at("capture")};
     auto const capture_ids{lexer.get_capture_ids_from_rule_id(rule_id)};
@@ -216,11 +217,11 @@ TEST_CASE("Test schema with single capture group", "[load_lexer]") {
 }
 
 TEST_CASE("Test error for schema rule with multiple capture groups", "[load_lexer]") {
-    std::string const schema_path{"../tests/test_schema_files/multiple_capture_groups.txt"};
+    auto const schema_file_path{get_test_schema_files_dir() / "multiple_capture_groups.txt"};
     ByteLexer lexer;
     REQUIRE_THROWS_WITH(
-            load_lexer_from_file(schema_path, lexer),
-            schema_path
+            load_lexer_from_file(schema_file_path, lexer),
+            schema_file_path.string()
                     + ":3: error: the schema rule 'multicapture' has a regex pattern containing > "
                       "1 capture groups (found 2).\n"
     );
diff --git a/components/core/tests/test_schema_files/search_schema.txt b/components/core/tests/test_schema_files/search_schema.txt
index 60d0c12f00..f49a6dbfa4 100644
--- a/components/core/tests/test_schema_files/search_schema.txt
+++ b/components/core/tests/test_schema_files/search_schema.txt
@@ -1,5 +1,5 @@
 // Delimiters 
-delimiters: \r\n:,=!;%\?
+delimiters: \r\n:,=!;%?
 
 // First set of variables
 timestamp:[0-9]{4}\-[0-9]{2}\-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]{3}){0,1}

From 578601a356e3eb029683996d5d28a1022c54985e Mon Sep 17 00:00:00 2001
From: davidlion <david.lion@yscope.com>
Date: Mon, 24 Nov 2025 11:43:45 -0500
Subject: [PATCH 11/14] Refactor writing a token to dictionaries into a private
 helper.

---
 .../clp/streaming_archive/writer/Archive.cpp  | 204 +++++++++---------
 .../clp/streaming_archive/writer/Archive.hpp  |  12 +-
 2 files changed, 115 insertions(+), 101 deletions(-)

diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp
index ce71ee7a91..19d6223be7 100644
--- a/components/core/src/clp/streaming_archive/writer/Archive.cpp
+++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp
@@ -15,6 +15,7 @@
 #include <log_surgeon/Constants.hpp>
 #include <log_surgeon/LogEvent.hpp>
 #include <log_surgeon/LogParser.hpp>
+#include <log_surgeon/Token.hpp>
 #include <nlohmann/json.hpp>
 
 #include "../../EncodedVariableInterpreter.hpp"
@@ -315,6 +316,109 @@ Archive::write_msg(epochtime_t timestamp, string const& message, size_t num_unco
     update_segment_indices(logtype_id, var_ids);
 }
 
+auto Archive::add_token_to_dicts(
+        log_surgeon::LogEventView const& log_view,
+        log_surgeon::Token token_view
+) -> void {
+    switch (token_view.get_type_ids()->at(0)) {
+        case static_cast<int>(log_surgeon::SymbolId::TokenNewline):
+        case static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString): {
+            m_logtype_dict_entry.add_constant(token_view.to_string(), 0, token_view.get_length());
+            break;
+        }
+        case static_cast<int>(log_surgeon::SymbolId::TokenInt): {
+            encoded_variable_t encoded_var{};
+            if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var(
+                        token_view.to_string(),
+                        encoded_var
+                ))
+            {
+                variable_dictionary_id_t id{};
+                m_var_dict.add_entry(token_view.to_string(), id);
+                encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
+                m_logtype_dict_entry.add_dictionary_var();
+            } else {
+                m_logtype_dict_entry.add_int_var();
+            }
+            m_encoded_vars.push_back(encoded_var);
+            break;
+        }
+        case static_cast<int>(log_surgeon::SymbolId::TokenFloat): {
+            encoded_variable_t encoded_var{};
+            if (!EncodedVariableInterpreter::convert_string_to_representable_float_var(
+                        token_view.to_string(),
+                        encoded_var
+                ))
+            {
+                variable_dictionary_id_t id{};
+                m_var_dict.add_entry(token_view.to_string(), id);
+                encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
+                m_logtype_dict_entry.add_dictionary_var();
+            } else {
+                m_logtype_dict_entry.add_float_var();
+            }
+            m_encoded_vars.push_back(encoded_var);
+            break;
+        }
+        default: {
+            // If there are no capture groups the entire variable token is stored as a variable.
+            // If the variable token contains capture groups, we break the token up by storing
+            // each capture as a variable and any substrings surrounding the capture as part of
+            // the logtype. Capture repetition currently does not work so we explicitly only
+            // store the first capture.
+
+            auto const token_type{token_view.get_type_ids()->at(0)};
+            auto const& lexer{log_view.get_log_parser().m_lexer};
+            auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)};
+            if (false == capture_ids.has_value()) {
+                variable_dictionary_id_t id{};
+                m_var_dict.add_entry(token_view.to_string(), id);
+                m_var_ids.push_back(id);
+                m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
+                m_logtype_dict_entry.add_dictionary_var();
+                break;
+            }
+
+            auto const register_ids{lexer.get_reg_ids_from_capture_id(capture_ids.value().at(0))};
+            if (false == register_ids.has_value()) {
+                throw(std::runtime_error(
+                        "No register IDs found for variable's capture group. Full token: "
+                        + token_view.to_string()
+                ));
+            }
+
+            auto const [start_reg_id, end_reg_id]{register_ids.value()};
+            auto const start_positions{token_view.get_reversed_reg_positions(start_reg_id)};
+            auto const end_positions{token_view.get_reversed_reg_positions(end_reg_id)};
+
+            if (false == start_positions.empty() && -1 < start_positions[0]
+                && false == end_positions.empty() && -1 < end_positions[0])
+            {
+                auto token_end{token_view.get_end_pos()};
+
+                token_view.set_end_pos(start_positions[0]);
+                auto const before_capture{token_view.to_string_view()};
+                m_logtype_dict_entry.add_constant(before_capture, 0, before_capture.size());
+
+                token_view.set_start_pos(start_positions[0]);
+                token_view.set_end_pos(end_positions[0]);
+
+                variable_dictionary_id_t id{};
+                m_var_dict.add_entry(token_view.to_string_view(), id);
+                m_var_ids.push_back(id);
+                m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
+                m_logtype_dict_entry.add_dictionary_var();
+
+                token_view.set_start_pos(end_positions[0]);
+                token_view.set_end_pos(token_end);
+                auto const after_capture{token_view.to_string_view()};
+                m_logtype_dict_entry.add_constant(after_capture, 0, after_capture.size());
+            }
+            break;
+        }
+    }
+}
+
 void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view) {
     epochtime_t timestamp{0};
     TimestampPattern const* timestamp_pattern{nullptr};
@@ -379,105 +483,7 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view)
             m_logtype_dict_entry.add_constant(token_view.get_delimiter(), 0, 1);
             token_view.increment_start_pos();
         }
-        switch (token_type) {
-            case static_cast<int>(log_surgeon::SymbolId::TokenNewline):
-            case static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString): {
-                m_logtype_dict_entry
-                        .add_constant(token_view.to_string(), 0, token_view.get_length());
-                break;
-            }
-            case static_cast<int>(log_surgeon::SymbolId::TokenInt): {
-                encoded_variable_t encoded_var{};
-                if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var(
-                            token_view.to_string(),
-                            encoded_var
-                    ))
-                {
-                    variable_dictionary_id_t id{};
-                    m_var_dict.add_entry(token_view.to_string(), id);
-                    encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
-                    m_logtype_dict_entry.add_dictionary_var();
-                } else {
-                    m_logtype_dict_entry.add_int_var();
-                }
-                m_encoded_vars.push_back(encoded_var);
-                break;
-            }
-            case static_cast<int>(log_surgeon::SymbolId::TokenFloat): {
-                encoded_variable_t encoded_var{};
-                if (!EncodedVariableInterpreter::convert_string_to_representable_float_var(
-                            token_view.to_string(),
-                            encoded_var
-                    ))
-                {
-                    variable_dictionary_id_t id{};
-                    m_var_dict.add_entry(token_view.to_string(), id);
-                    encoded_var = EncodedVariableInterpreter::encode_var_dict_id(id);
-                    m_logtype_dict_entry.add_dictionary_var();
-                } else {
-                    m_logtype_dict_entry.add_float_var();
-                }
-                m_encoded_vars.push_back(encoded_var);
-                break;
-            }
-            default: {
-                // If there are no capture groups the entire variable token is stored as a variable.
-                // If the variable token contains capture groups, we break the token up by storing
-                // each capture as a variable and any substrings surrounding the capture as part of
-                // the logtype. Capture repetition currently does not work so we explicitly only
-                // store the first capture.
-
-                auto const& lexer{log_view.get_log_parser().m_lexer};
-                auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)};
-                if (false == capture_ids.has_value()) {
-                    variable_dictionary_id_t id{};
-                    m_var_dict.add_entry(token_view.to_string(), id);
-                    m_var_ids.push_back(id);
-                    m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
-                    m_logtype_dict_entry.add_dictionary_var();
-                    break;
-                }
-
-                auto const register_ids{
-                        lexer.get_reg_ids_from_capture_id(capture_ids.value().at(0))
-                };
-                if (false == register_ids.has_value()) {
-                    throw(std::runtime_error(
-                            "No register IDs found for variable's capture group. Full token: "
-                            + token_view.to_string()
-                    ));
-                }
-
-                auto const [start_reg_id, end_reg_id]{register_ids.value()};
-                auto const start_positions{token_view.get_reversed_reg_positions(start_reg_id)};
-                auto const end_positions{token_view.get_reversed_reg_positions(end_reg_id)};
-
-                if (false == start_positions.empty() && -1 < start_positions[0]
-                    && false == end_positions.empty() && -1 < end_positions[0])
-                {
-                    auto token_end{token_view.get_end_pos()};
-
-                    token_view.set_end_pos(start_positions[0]);
-                    auto const before_capture{token_view.to_string_view()};
-                    m_logtype_dict_entry.add_constant(before_capture, 0, before_capture.size());
-
-                    token_view.set_start_pos(start_positions[0]);
-                    token_view.set_end_pos(end_positions[0]);
-
-                    variable_dictionary_id_t id{};
-                    m_var_dict.add_entry(token_view.to_string_view(), id);
-                    m_var_ids.push_back(id);
-                    m_encoded_vars.push_back(EncodedVariableInterpreter::encode_var_dict_id(id));
-                    m_logtype_dict_entry.add_dictionary_var();
-
-                    token_view.set_start_pos(end_positions[0]);
-                    token_view.set_end_pos(token_end);
-                    auto const after_capture{token_view.to_string_view()};
-                    m_logtype_dict_entry.add_constant(after_capture, 0, after_capture.size());
-                }
-                break;
-            }
-        }
+        add_token_to_dicts(log_view, token_view);
     }
     if (false == m_logtype_dict_entry.get_value().empty()) {
         logtype_dictionary_id_t logtype_id{};
diff --git a/components/core/src/clp/streaming_archive/writer/Archive.hpp b/components/core/src/clp/streaming_archive/writer/Archive.hpp
index 2b589881a5..2fa1d4661f 100644
--- a/components/core/src/clp/streaming_archive/writer/Archive.hpp
+++ b/components/core/src/clp/streaming_archive/writer/Archive.hpp
@@ -12,7 +12,6 @@
 #include <boost/uuid/random_generator.hpp>
 #include <boost/uuid/uuid.hpp>
 #include <log_surgeon/LogEvent.hpp>
-#include <log_surgeon/ReaderParser.hpp>
 
 #include "../../ArrayBackedPosIntSet.hpp"
 #include "../../ErrorCode.hpp"
@@ -150,7 +149,7 @@ class Archive {
      * @param log_event_view
      * @throw FileWriter::OperationFailed if any write fails
      */
-    void write_msg_using_schema(log_surgeon::LogEventView const& log_event_view);
+    void write_msg_using_schema(log_surgeon::LogEventView const& log_view);
 
     /**
      * Writes an IR log event to the current encoded file
@@ -290,6 +289,15 @@ class Archive {
      */
     auto update_global_metadata() -> void;
 
+    /**
+     * Inspect a log surgeon token and add its information to the logtype and variable dictionaries.
+     * @param log_view The log event containing the token.
+     * @param token_view The token to add to the dictionaries.
+     */
+    auto
+    add_token_to_dicts(log_surgeon::LogEventView const& log_view, log_surgeon::Token token_view)
+            -> void;
+
     // Variables
     boost::uuids::uuid m_id;
     std::string m_id_as_string;

From 834fdb780e83ef23f472acb407cc5dcac595908b Mon Sep 17 00:00:00 2001
From: davidlion <david.lion@yscope.com>
Date: Wed, 26 Nov 2025 23:19:48 -0500
Subject: [PATCH 12/14] Add unit test; refactor test-ParserWithUserSchema.cpp.

---
 .../core/tests/test-ParserWithUserSchema.cpp  | 164 ++++++++++--------
 .../tests/test_log_files/log_with_capture.txt |   1 +
 .../single_capture_group.txt                  |   2 +-
 3 files changed, 90 insertions(+), 77 deletions(-)
 create mode 100644 components/core/tests/test_log_files/log_with_capture.txt

diff --git a/components/core/tests/test-ParserWithUserSchema.cpp b/components/core/tests/test-ParserWithUserSchema.cpp
index a18ff00f28..b51d212f9a 100644
--- a/components/core/tests/test-ParserWithUserSchema.cpp
+++ b/components/core/tests/test-ParserWithUserSchema.cpp
@@ -1,37 +1,49 @@
 // TODO: move this test to log_surgeon
 // TODO: move load_lexer_from_file into SearchParser in log_surgeon
 
-#include <sys/stat.h>
-
 #include <filesystem>
 #include <string>
+#include <string_view>
 #include <utility>
+#include <vector>
 
 #include <catch2/catch_test_macros.hpp>
 #include <catch2/matchers/catch_matchers.hpp>
+#include <fmt/format.h>
+#include <log_surgeon/Lexer.hpp>
 #include <log_surgeon/LogParser.hpp>
 #include <log_surgeon/SchemaParser.hpp>
+#include <log_surgeon/Token.hpp>
+#include <spdlog/spdlog.h>
+
+#include <clp/clp/run.hpp>
+#include <clp/FileReader.hpp>
+#include <clp/ir/types.hpp>
+#include <clp/LogSurgeonReader.hpp>
+#include <clp/streaming_archive/Constants.hpp>
+#include <clp/streaming_archive/reader/Archive.hpp>
+#include <clp/type_utils.hpp>
+#include <clp/Utils.hpp>
 
-#include "../src/clp/clp/run.hpp"
-#include "../src/clp/GlobalMySQLMetadataDB.hpp"
-#include "../src/clp/LogSurgeonReader.hpp"
-#include "../src/clp/Utils.hpp"
+#include "TestOutputCleaner.hpp"
 
-using clp::FileReader;
 using clp::load_lexer_from_file;
-using clp::LogSurgeonReader;
-using log_surgeon::DelimiterStringAST;
 using log_surgeon::lexers::ByteLexer;
 using log_surgeon::LogParser;
-using log_surgeon::ParserAST;
 using log_surgeon::SchemaAST;
-using log_surgeon::SchemaVarAST;
-using log_surgeon::Token;
 
 namespace {
+constexpr std::string_view cTestArchiveDirectory{"test-parser-with-user-schema-archive"};
+
+auto run_clp_compress(
+        std::filesystem::path const& schema_path,
+        std::filesystem::path const& output_path,
+        std::filesystem::path const& input_path
+) -> int;
 [[nodiscard]] auto get_tests_dir() -> std::filesystem::path;
 [[nodiscard]] auto get_test_schema_files_dir() -> std::filesystem::path;
 [[nodiscard]] auto get_test_queries_dir() -> std::filesystem::path;
+[[nodiscard]] auto get_test_log_dir() -> std::filesystem::path;
 
 auto get_tests_dir() -> std::filesystem::path {
     std::filesystem::path const current_file_path{__FILE__};
@@ -45,6 +57,30 @@ auto get_test_schema_files_dir() -> std::filesystem::path {
 auto get_test_queries_dir() -> std::filesystem::path {
     return get_tests_dir() / "test_search_queries";
 }
+
+auto get_test_log_dir() -> std::filesystem::path {
+    return get_tests_dir() / "test_log_files";
+}
+
+auto run_clp_compress(
+        std::filesystem::path const& schema_path,
+        std::filesystem::path const& output_path,
+        std::filesystem::path const& input_path
+) -> int {
+    auto const schema_path_str{schema_path.string()};
+    auto const output_path_str{output_path.string()};
+    auto const input_path_str{input_path.string()};
+    std::vector<char const*> argv{
+            "clp",
+            "c",
+            "--schema-path",
+            schema_path_str.data(),
+            output_path_str.data(),
+            input_path_str.data(),
+            nullptr
+    };
+    return clp::clp::run(static_cast<int>(argv.size() - 1), argv.data());
+}
 }  // namespace
 
 std::unique_ptr<SchemaAST> generate_schema_ast(std::string const& schema_file) {
@@ -60,43 +96,6 @@ std::unique_ptr<LogParser> generate_log_parser(std::string const& schema_file) {
     return log_parser;
 }
 
-void compress(
-        std::string const& output_dir,
-        std::string const& file_to_compress,
-        std::string schema_file,
-        bool old = false
-) {
-    std::vector<std::string> arguments;
-    if (old) {
-        arguments = {"main.cpp", "c", output_dir, file_to_compress};
-    } else {
-        arguments
-                = {"main.cpp",
-                   "c",
-                   output_dir,
-                   file_to_compress,
-                   "--schema-path",
-                   std::move(schema_file)};
-    }
-    std::vector<char const*> argv;
-    for (auto const& arg : arguments) {
-        argv.push_back(arg.data());
-    }
-    argv.push_back(nullptr);
-    clp::clp::run(argv.size() - 1, argv.data());
-}
-
-void decompress(std::string archive_dir, std::string output_dir) {
-    std::vector<std::string> arguments
-            = {"main.cpp", "x", std::move(archive_dir), std::move(output_dir)};
-    std::vector<char const*> argv;
-    for (auto const& arg : arguments) {
-        argv.push_back(arg.data());
-    }
-    argv.push_back(nullptr);
-    clp::clp::run(argv.size() - 1, argv.data());
-}
-
 TEST_CASE("Test error for missing schema file", "[LALR1Parser][SchemaParser]") {
     auto const file_path = get_test_schema_files_dir() / "missing_schema.txt";
     auto const file_path_string = file_path.string();
@@ -156,42 +155,19 @@ TEST_CASE("Test creating log parser without delimiters", "[LALR1Parser][LogParse
     );
 }
 
-// TODO: This test doesn't currently work because delimiters are allowed in
-// schema files, and there is no option to disable this yet
-// TEST_CASE("Test error for creating log file with delimiter in regex pattern",
-//          "[LALR1Parser]SchemaParser]") {
-//    std::string file_path = "../tests/test_schema_files/schema_with_delimiter_in_regex_error.txt";
-//    std::string file_name = boost::filesystem::canonical(file_path).string();
-//    REQUIRE_THROWS_WITH(generate_log_parser(file_path),
-//                        file_name +
-//                        ":2: error: 'equals' has regex pattern which contains delimiter '='.\n"
-//                        + "          equals:.*=.*\n"
-//                        + "                 ^^^^^\n");
-//}
-
-// TODO: This error check is performed correctly by CLP, but it is handled by
-// something different now so this test will fail as is
-// TEST_CASE("Test error for missing log file", "[LALR1Parser][LogParser]") {
-//    std::string file_name = "../tests/test_log_files/missing_log.txt";
-//    std::string file_path = boost::filesystem::weakly_canonical(file_name).string();
-//    REQUIRE_THROWS(compress("../tests/test_archives", file_name,
-//                            "../tests/test_schema_files/schema_that_does_not_exist.txt"),
-//                   "Specified schema file does not exist.");
-//}
-
 TEST_CASE("Test lexer", "[Search]") {
     ByteLexer lexer;
     auto const schema_file_path = get_test_schema_files_dir() / "search_schema.txt";
     load_lexer_from_file(schema_file_path.string(), lexer);
     auto const query_file_path = get_test_queries_dir() / "easy.txt";
-    FileReader file_reader{query_file_path.string()};
-    LogSurgeonReader reader_wrapper(file_reader);
+    clp::FileReader file_reader{query_file_path.string()};
+    clp::LogSurgeonReader reader_wrapper(file_reader);
     log_surgeon::ParserInputBuffer parser_input_buffer;
     parser_input_buffer.read_if_safe(reader_wrapper);
     lexer.reset();
     auto [error_code, opt_token] = lexer.scan(parser_input_buffer);
     REQUIRE(error_code == log_surgeon::ErrorCode::Success);
-    Token token{opt_token.value()};
+    auto token{opt_token.value()};
     while (token.get_type_ids()->at(0) != static_cast<int>(log_surgeon::SymbolId::TokenEnd)) {
         SPDLOG_INFO("token:" + token.to_string() + "\n");
         SPDLOG_INFO(
@@ -216,7 +192,7 @@ TEST_CASE("Test schema with single capture group", "[load_lexer]") {
     REQUIRE("group" == lexer.m_id_symbol.at(capture_ids->at(0)));
 }
 
-TEST_CASE("Test error for schema rule with multiple capture groups", "[load_lexer]") {
+TEST_CASE("Error on schema rule with multiple capture groups", "[load_lexer]") {
     auto const schema_file_path{get_test_schema_files_dir() / "multiple_capture_groups.txt"};
     ByteLexer lexer;
     REQUIRE_THROWS_WITH(
@@ -226,3 +202,39 @@ TEST_CASE("Test error for schema rule with multiple capture groups", "[load_lexe
                       "1 capture groups (found 2).\n"
     );
 }
+
+TEST_CASE("Verify dictionary contents", "[Compression]") {
+    auto const log_file_path{get_test_log_dir() / "log_with_capture.txt"};
+    auto const schema_file_path{get_test_schema_files_dir() / "single_capture_group.txt"};
+    TestOutputCleaner const cleaner{{std::string{cTestArchiveDirectory}}};
+    std::filesystem::create_directory(cTestArchiveDirectory);
+
+    REQUIRE(0 == run_clp_compress(schema_file_path, cTestArchiveDirectory, log_file_path));
+
+    std::vector<std::filesystem::path> archive_paths;
+    for (auto const& entry : std::filesystem::directory_iterator{cTestArchiveDirectory}) {
+        auto const& path{entry.path()};
+        if (false == path.string().ends_with(clp::streaming_archive::cMetadataDBFileName)) {
+            archive_paths.emplace_back(path);
+        }
+    }
+    REQUIRE(1 == archive_paths.size());
+
+    clp::streaming_archive::reader::Archive archive_reader;
+    archive_reader.open(archive_paths.at(0));
+    archive_reader.refresh_dictionaries();
+
+    auto const& logtype_dict{archive_reader.get_logtype_dictionary()};
+    REQUIRE(1 == logtype_dict.get_entries().size());
+    REQUIRE(fmt::format(
+                    "2016-05-08 07:34:05.251 MyDog{} APet{} test.txt\n",
+                    clp::enum_to_underlying_type(clp::ir::VariablePlaceholder::Dictionary),
+                    clp::enum_to_underlying_type(clp::ir::VariablePlaceholder::Dictionary)
+            )
+            == logtype_dict.get_value(0));
+
+    auto const& var_dict{archive_reader.get_var_dictionary()};
+    REQUIRE(2 == var_dict.get_entries().size());
+    REQUIRE("123" == var_dict.get_value(0));
+    REQUIRE("4123" == var_dict.get_value(1));
+}
diff --git a/components/core/tests/test_log_files/log_with_capture.txt b/components/core/tests/test_log_files/log_with_capture.txt
new file mode 100644
index 0000000000..6210ea8bdc
--- /dev/null
+++ b/components/core/tests/test_log_files/log_with_capture.txt
@@ -0,0 +1 @@
+2016-05-08 07:34:05.251 MyDog123 APet4123 test.txt
diff --git a/components/core/tests/test_schema_files/single_capture_group.txt b/components/core/tests/test_schema_files/single_capture_group.txt
index 7c4bf17688..b9543fbdcc 100644
--- a/components/core/tests/test_schema_files/single_capture_group.txt
+++ b/components/core/tests/test_schema_files/single_capture_group.txt
@@ -1,3 +1,3 @@
 delimiters: \r\n
 
-capture:text(?<group>var)text
+capture:[A-Za-z]+(?<group>\d+)

From ea1ad3b63b22a1ddb5418b3bd0b03429cca9b6e0 Mon Sep 17 00:00:00 2001
From: davidlion <david.lion@yscope.com>
Date: Thu, 27 Nov 2025 09:14:31 -0500
Subject: [PATCH 13/14] Add null + empty check for token type ids.

---
 .../src/clp/streaming_archive/writer/Archive.cpp     | 12 ++++++++++--
 .../src/clp/streaming_archive/writer/Archive.hpp     |  2 +-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp
index 19d6223be7..194c55488e 100644
--- a/components/core/src/clp/streaming_archive/writer/Archive.cpp
+++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp
@@ -320,7 +320,11 @@ auto Archive::add_token_to_dicts(
         log_surgeon::LogEventView const& log_view,
         log_surgeon::Token token_view
 ) -> void {
-    switch (token_view.get_type_ids()->at(0)) {
+    auto const* type_ids{token_view.get_type_ids()};
+    if (nullptr == type_ids || type_ids->empty()) {
+        throw std::runtime_error("Token has no type IDs: " + token_view.to_string());
+    }
+    switch (type_ids->at(0)) {
         case static_cast<int>(log_surgeon::SymbolId::TokenNewline):
         case static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString): {
             m_logtype_dict_entry.add_constant(token_view.to_string(), 0, token_view.get_length());
@@ -475,7 +479,11 @@ void Archive::write_msg_using_schema(log_surgeon::LogEventView const& log_view)
     }
     for (auto token_idx{1}; token_idx < log_buf->pos(); token_idx++) {
         auto token_view{log_buf->get_token(token_idx)};
-        auto const token_type{token_view.get_type_ids()->at(0)};
+        auto const* type_ids{token_view.get_type_ids()};
+        if (nullptr == type_ids || type_ids->empty()) {
+            throw std::runtime_error("Token has no type IDs: " + token_view.to_string());
+        }
+        auto const token_type{type_ids->at(0)};
         if (log_buf->has_delimiters() && (timestamp_pattern != nullptr || token_idx > 1)
             && token_type != static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString)
             && token_type != static_cast<int>(log_surgeon::SymbolId::TokenNewline))
diff --git a/components/core/src/clp/streaming_archive/writer/Archive.hpp b/components/core/src/clp/streaming_archive/writer/Archive.hpp
index 2fa1d4661f..b1351199a2 100644
--- a/components/core/src/clp/streaming_archive/writer/Archive.hpp
+++ b/components/core/src/clp/streaming_archive/writer/Archive.hpp
@@ -149,7 +149,7 @@ class Archive {
      * @param log_event_view
      * @throw FileWriter::OperationFailed if any write fails
      */
-    void write_msg_using_schema(log_surgeon::LogEventView const& log_view);
+    auto write_msg_using_schema(log_surgeon::LogEventView const& log_view) -> void;
 
     /**
      * Writes an IR log event to the current encoded file

From 3b36328baba7de064484e6af5bf839d63ee0e43a Mon Sep 17 00:00:00 2001
From: davidlion <david.lion@yscope.com>
Date: Thu, 27 Nov 2025 09:25:27 -0500
Subject: [PATCH 14/14] Fix coderabbit nits.

---
 .../core/src/clp/streaming_archive/writer/Archive.cpp  | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/components/core/src/clp/streaming_archive/writer/Archive.cpp b/components/core/src/clp/streaming_archive/writer/Archive.cpp
index 194c55488e..dcc191f5bd 100644
--- a/components/core/src/clp/streaming_archive/writer/Archive.cpp
+++ b/components/core/src/clp/streaming_archive/writer/Archive.cpp
@@ -324,7 +324,8 @@ auto Archive::add_token_to_dicts(
     if (nullptr == type_ids || type_ids->empty()) {
         throw std::runtime_error("Token has no type IDs: " + token_view.to_string());
     }
-    switch (type_ids->at(0)) {
+    auto const token_type{type_ids->at(0)};
+    switch (token_type) {
         case static_cast<int>(log_surgeon::SymbolId::TokenNewline):
         case static_cast<int>(log_surgeon::SymbolId::TokenUncaughtString): {
             m_logtype_dict_entry.add_constant(token_view.to_string(), 0, token_view.get_length());
@@ -332,7 +333,8 @@ auto Archive::add_token_to_dicts(
         }
         case static_cast<int>(log_surgeon::SymbolId::TokenInt): {
             encoded_variable_t encoded_var{};
-            if (!EncodedVariableInterpreter::convert_string_to_representable_integer_var(
+            if (false
+                == EncodedVariableInterpreter::convert_string_to_representable_integer_var(
                         token_view.to_string(),
                         encoded_var
                 ))
@@ -349,7 +351,8 @@ auto Archive::add_token_to_dicts(
         }
         case static_cast<int>(log_surgeon::SymbolId::TokenFloat): {
             encoded_variable_t encoded_var{};
-            if (!EncodedVariableInterpreter::convert_string_to_representable_float_var(
+            if (false
+                == EncodedVariableInterpreter::convert_string_to_representable_float_var(
                         token_view.to_string(),
                         encoded_var
                 ))
@@ -371,7 +374,6 @@ auto Archive::add_token_to_dicts(
             // the logtype. Capture repetition currently does not work so we explicitly only
             // store the first capture.
 
-            auto const token_type{token_view.get_type_ids()->at(0)};
             auto const& lexer{log_view.get_log_parser().m_lexer};
             auto capture_ids{lexer.get_capture_ids_from_rule_id(token_type)};
             if (false == capture_ids.has_value()) {