diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt index 9e9738f05d..f183f15e48 100644 --- a/components/core/CMakeLists.txt +++ b/components/core/CMakeLists.txt @@ -704,6 +704,7 @@ set(SOURCE_FILES_unitTest tests/test-clp_s-end_to_end.cpp tests/test-clp_s-range_index.cpp tests/test-clp_s-search.cpp + tests/test-clp_s-StringUtils.cpp tests/test-EncodedVariableInterpreter.cpp tests/test-encoding_methods.cpp tests/test-ffi_IrUnitHandlerReq.cpp diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp index 72c0d50d80..fef29442f6 100644 --- a/components/core/src/clp_s/CommandLineArguments.cpp +++ b/components/core/src/clp_s/CommandLineArguments.cpp @@ -256,6 +256,12 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) { "structurize-arrays", po::bool_switch(&m_structurize_arrays), "Structurize arrays instead of compressing them as clp strings." + )( + "sanitize-invalid-json", + po::bool_switch(&m_sanitize_invalid_json), + "Sanitize invalid JSON by escaping unescaped control characters (0x00-0x1F)," + " replacing invalid UTF-8 sequences with U+FFFD, and handling invalid" + " surrogate escapes. When disabled (default), parsing fails on invalid JSON." 
)( "disable-log-order", po::bool_switch(&m_disable_log_order), diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp index b35124981a..2e32b57a68 100644 --- a/components/core/src/clp_s/CommandLineArguments.hpp +++ b/components/core/src/clp_s/CommandLineArguments.hpp @@ -113,6 +113,8 @@ class CommandLineArguments { bool get_structurize_arrays() const { return m_structurize_arrays; } + bool get_sanitize_invalid_json() const { return m_sanitize_invalid_json; } + bool get_ordered_decompression() const { return m_ordered_decompression; } size_t get_target_ordered_chunk_size() const { return m_target_ordered_chunk_size; } @@ -202,6 +204,7 @@ class CommandLineArguments { bool m_no_retain_float_format{false}; bool m_single_file_archive{false}; bool m_structurize_arrays{false}; + bool m_sanitize_invalid_json{false}; bool m_ordered_decompression{false}; size_t m_target_ordered_chunk_size{}; bool m_print_ordered_chunk_stats{false}; diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp index a0a003d9f7..447e73617d 100644 --- a/components/core/src/clp_s/JsonFileIterator.cpp +++ b/components/core/src/clp_s/JsonFileIterator.cpp @@ -3,18 +3,25 @@ #include #include +#include #include +#include "Utils.hpp" + namespace clp_s { JsonFileIterator::JsonFileIterator( clp::ReaderInterface& reader, size_t max_document_size, + bool sanitize_invalid_json, + std::string path, size_t buf_size ) : m_buf_size(buf_size), m_max_document_size(max_document_size), m_buf(new char[buf_size + simdjson::SIMDJSON_PADDING]), - m_reader(reader) { + m_reader(reader), + m_path(std::move(path)), + m_sanitize_invalid_json(sanitize_invalid_json) { read_new_json(); } @@ -64,6 +71,26 @@ bool JsonFileIterator::read_new_json() { ) .get(m_stream); + // If sanitization is enabled and we encounter errors that can be fixed by sanitization, + // sanitize the buffer and retry parsing + if 
(m_sanitize_invalid_json) { + // Handle invalid UTF-8 sequences by replacing with U+FFFD + if (simdjson::error_code::UTF8_ERROR == error) { + // Return value intentionally ignored - in read_new_json we always retry after + // sanitization regardless of whether changes were made + static_cast(sanitize_invalid_utf8_and_log()); + error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream); + } + + // Handle unescaped control characters by escaping them to \u00XX format + if (simdjson::error_code::UNESCAPED_CHARS == error) { + // Return value intentionally ignored - in read_new_json we always retry after + // sanitization regardless of whether changes were made + static_cast(sanitize_control_chars_and_log()); + error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream); + } + } + if (error) { m_error_code = error; return false; @@ -95,6 +122,90 @@ size_t JsonFileIterator::skip_whitespace_and_get_truncated_bytes() { return m_buf_occupied - m_next_document_position; } +bool JsonFileIterator::sanitize_invalid_utf8_and_log() { + size_t const old_buf_occupied = m_buf_occupied; + // Note: sanitize_utf8_buffer may reallocate m_buf and will update m_buf_size by reference if + // reallocation is needed. This keeps m_buf_size in sync with the actual allocated buffer size. + auto const result = StringUtils::sanitize_utf8_buffer( + m_buf, + m_buf_size, + m_buf_occupied, + simdjson::SIMDJSON_PADDING + ); + m_buf_occupied = result.new_buf_occupied; + m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied; + + if (result.sanitized_char_counts.empty()) { + return false; + } + + size_t total_replaced = 0; + for (auto const& [ch, count] : result.sanitized_char_counts) { + total_replaced += count; + } + SPDLOG_WARN( + "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer expanded by {} bytes " + "({} -> {}).", + total_replaced, + m_path.empty() ? 
"" : fmt::format(" in file '{}'", m_path), + static_cast(m_buf_occupied) - static_cast(old_buf_occupied), + old_buf_occupied, + m_buf_occupied + ); + return true; +} + +bool JsonFileIterator::sanitize_control_chars_and_log() { + size_t const old_buf_occupied = m_buf_occupied; + // Note: sanitize_json_buffer may reallocate m_buf and will update m_buf_size by reference if + // reallocation is needed. This keeps m_buf_size in sync with the actual allocated buffer size. + auto const result = StringUtils::sanitize_json_buffer( + m_buf, + m_buf_size, + m_buf_occupied, + simdjson::SIMDJSON_PADDING + ); + m_buf_occupied = result.new_buf_occupied; + m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied; + + if (result.sanitized_char_counts.empty()) { + return false; + } + + size_t total_sanitized = 0; + std::string char_details; + for (auto const& [ch, count] : result.sanitized_char_counts) { + if (false == char_details.empty()) { + char_details += ", "; + } + char_details += fmt::format("0x{:02x} ({})", static_cast(ch), count); + total_sanitized += count; + } + SPDLOG_WARN( + "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by {} bytes " + "({} -> {}).", + total_sanitized, + m_path.empty() ? 
"" : fmt::format(" in file '{}'", m_path), + char_details, + static_cast(m_buf_occupied) - static_cast(old_buf_occupied), + old_buf_occupied, + m_buf_occupied + ); + return true; +} + +bool JsonFileIterator::reinitialize_document_stream() { + auto error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream); + if (error) { + m_error_code = error; + return false; + } + m_doc_it = m_stream.begin(); + m_first_doc_in_buffer = true; + m_next_document_position = 0; + return true; +} + bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& it) { if (false == m_first_read) { ++m_doc_it; @@ -118,6 +229,22 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i return true; } else if (m_doc_it.error() == simdjson::error_code::UTF8_ERROR) { maybe_utf8_edge_case = true; + } else if (m_sanitize_invalid_json + && m_doc_it.error() == simdjson::error_code::UNESCAPED_CHARS) + { + // Unescaped control characters detected during document iteration. Sanitize the + // buffer and re-setup the document stream to restart from the beginning. + if (false == sanitize_control_chars_and_log()) { + // Sanitization made no changes - report the original error to avoid infinite + // loop + m_error_code = m_doc_it.error(); + return false; + } + + if (false == reinitialize_document_stream()) { + return false; + } + continue; } else { m_error_code = m_doc_it.error(); return false; @@ -137,8 +264,22 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i // If we hit a UTF-8 error and either we have reached eof or the buffer occupancy is // greater than the maximum document size we assume that the UTF-8 error must have been // in the middle of the stream. Note: it is possible that the UTF-8 error is at the end - // of the stream and that this is actualy a truncation error. Unfortunately the only way - // to check is to parse it ourselves, so we rely on this heuristic for now. 
+ // of the stream and that this is actually a truncation error. Unfortunately the only + // way to check is to parse it ourselves, so we rely on this heuristic for now. + if (m_sanitize_invalid_json) { + // Sanitize invalid UTF-8 sequences and retry + if (false == sanitize_invalid_utf8_and_log()) { + // Sanitization made no changes - report the original error to avoid infinite + // loop + m_error_code = simdjson::error_code::UTF8_ERROR; + return false; + } + + if (false == reinitialize_document_stream()) { + return false; + } + continue; + } m_error_code = simdjson::error_code::UTF8_ERROR; return false; } else if (maybe_utf8_edge_case) { @@ -151,10 +292,12 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i size_t JsonFileIterator::get_num_bytes_consumed() { // If there are more documents left in the current buffer account for how much of the // buffer has been consumed, otherwise report the total number of bytes read so that we - // capture trailing whitespace. + // capture trailing whitespace. Include bytes added by sanitization since the sanitized + // content is what gets compressed. if (m_doc_it != m_stream.end()) { - return m_bytes_read - (m_buf_occupied - m_next_document_position); + return m_bytes_read + m_sanitization_bytes_added + - (m_buf_occupied - m_next_document_position); } - return m_bytes_read; + return m_bytes_read + m_sanitization_bytes_added; } } // namespace clp_s diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp index 9102087637..2b8be94a4c 100644 --- a/components/core/src/clp_s/JsonFileIterator.hpp +++ b/components/core/src/clp_s/JsonFileIterator.hpp @@ -1,6 +1,8 @@ #ifndef CLP_S_JSONFILEITERATOR_HPP #define CLP_S_JSONFILEITERATOR_HPP +#include + #include #include "../clp/ReaderInterface.hpp" @@ -11,7 +13,7 @@ class JsonFileIterator { /** * An iterator over an input stream containing json objects. JSON is parsed * using simdjson::parse_many. 
This allows simdjson to efficiently find - * delimeters between JSON objects, and if enabled parse JSON ahead of time + * delimiters between JSON objects, and if enabled parse JSON ahead of time * in another thread while the JSON is being iterated over. * * The buffer grows automatically if there are JSON objects larger than the buffer size. @@ -19,11 +21,15 @@ class JsonFileIterator { * @param reader the input stream containing JSON * @param max_document_size the maximum allowed size of a single document + * @param sanitize_invalid_json whether to sanitize invalid JSON (control chars, invalid UTF-8) + * @param path optional path to the file being read (used for logging) * @param buf_size the initial buffer size */ explicit JsonFileIterator( clp::ReaderInterface& reader, size_t max_document_size, + bool sanitize_invalid_json, + std::string path = {}, size_t buf_size = 1024 * 1024 /* 1 MiB default */ ); ~JsonFileIterator(); @@ -41,9 +47,11 @@ class JsonFileIterator { [[nodiscard]] size_t truncated_bytes() const { return m_truncated_bytes; } /** - * @return total number of bytes read from the file + * @return total number of bytes read from the file, plus any bytes added by sanitization */ - [[nodiscard]] size_t get_num_bytes_read() const { return m_bytes_read; } + [[nodiscard]] size_t get_num_bytes_read() const { + return m_bytes_read + m_sanitization_bytes_added; + } /** * Note: this method can not be const because checking if a simdjson iterator is at the end @@ -73,14 +81,40 @@ class JsonFileIterator { */ [[nodiscard]] size_t skip_whitespace_and_get_truncated_bytes(); + /** + * Sanitizes invalid UTF-8 sequences in the buffer by replacing them with U+FFFD, + * updates buffer tracking variables, and logs a warning if changes were made. + * @return true if sanitization made changes, false otherwise + * @note May reallocate m_buf if buffer expansion is needed. m_buf_size is updated accordingly. 
+ */ + [[nodiscard]] bool sanitize_invalid_utf8_and_log(); + + /** + * Sanitizes unescaped control characters in the buffer by escaping them to \\u00XX format, + * updates buffer tracking variables, and logs a warning if changes were made. + * @return true if sanitization made changes, false otherwise + * @note May reallocate m_buf if buffer expansion is needed. m_buf_size is updated accordingly. + */ + [[nodiscard]] bool sanitize_control_chars_and_log(); + + /** + * Reinitializes the document stream after buffer sanitization. + * Resets iteration state to start from the beginning of the buffer. + * @return true on success, false if iteration setup fails (m_error_code is set on failure) + */ + [[nodiscard]] bool reinitialize_document_stream(); + size_t m_truncated_bytes{0}; size_t m_next_document_position{0}; size_t m_bytes_read{0}; + size_t m_sanitization_bytes_added{0}; size_t m_buf_size{0}; size_t m_buf_occupied{0}; size_t m_max_document_size{0}; char* m_buf{nullptr}; clp::ReaderInterface& m_reader; + std::string m_path; + bool m_sanitize_invalid_json{false}; simdjson::ondemand::parser m_parser; simdjson::ondemand::document_stream m_stream; bool m_eof{false}; diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp index c0e2123e7f..93687beab1 100644 --- a/components/core/src/clp_s/JsonParser.cpp +++ b/components/core/src/clp_s/JsonParser.cpp @@ -144,6 +144,7 @@ JsonParser::JsonParser(JsonParserOption const& option) m_structurize_arrays(option.structurize_arrays), m_record_log_order(option.record_log_order), m_retain_float_format(option.retain_float_format), + m_sanitize_invalid_json(option.sanitize_invalid_json), m_input_paths(option.input_paths), m_network_auth(option.network_auth) { if (false == m_timestamp_key.empty()) { @@ -223,7 +224,7 @@ void JsonParser::parse_obj_in_array(simdjson::ondemand::object line, int32_t par } cur_field = *object_it_stack.top(); - cur_key = cur_field.unescaped_key(true); + cur_key = 
cur_field.unescaped_key(m_sanitize_invalid_json); cur_value = cur_field.value(); switch (cur_value.type()) { @@ -302,7 +303,7 @@ void JsonParser::parse_obj_in_array(simdjson::ondemand::object line, int32_t par break; } case simdjson::ondemand::json_type::string: { - std::string_view value = cur_value.get_string(true); + std::string_view value = cur_value.get_string(m_sanitize_invalid_json); if (value.find(' ') != std::string::npos) { node_id = m_archive_writer ->add_node(node_id_stack.top(), NodeType::ClpString, cur_key); @@ -407,7 +408,7 @@ void JsonParser::parse_array(simdjson::ondemand::array array, int32_t parent_nod break; } case simdjson::ondemand::json_type::string: { - std::string_view value = cur_value.get_string(true); + std::string_view value = cur_value.get_string(m_sanitize_invalid_json); if (value.find(' ') != std::string::npos) { node_id = m_archive_writer->add_node(parent_node_id, NodeType::ClpString, ""); } else { @@ -452,7 +453,7 @@ void JsonParser::parse_line( do { if (false == object_stack.empty()) { cur_field = *object_it_stack.top(); - cur_key = cur_field.unescaped_key(true); + cur_key = cur_field.unescaped_key(m_sanitize_invalid_json); line = cur_field.value(); } @@ -555,7 +556,7 @@ void JsonParser::parse_line( break; } case simdjson::ondemand::json_type::string: { - std::string_view value = line.get_string(true); + std::string_view value = line.get_string(m_sanitize_invalid_json); auto const matches_timestamp = m_archive_writer->matches_timestamp(node_id_stack.top(), cur_key); if (matches_timestamp) { @@ -668,7 +669,8 @@ auto JsonParser::ingest_json( Path const& path, std::string const& archive_creator_id ) -> bool { - JsonFileIterator json_file_iterator(*reader, m_max_document_size); + JsonFileIterator + json_file_iterator(*reader, m_max_document_size, m_sanitize_invalid_json, path.path); if (simdjson::error_code::SUCCESS != json_file_iterator.get_error()) { SPDLOG_ERROR( "Encountered error - {} - while trying to parse {} after parsing 0 
bytes", diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp index 82f8d04786..e763d5fe71 100644 --- a/components/core/src/clp_s/JsonParser.hpp +++ b/components/core/src/clp_s/JsonParser.hpp @@ -40,6 +40,7 @@ struct JsonParserOption { bool record_log_order{true}; bool retain_float_format{false}; bool single_file_archive{false}; + bool sanitize_invalid_json{false}; NetworkAuthOption network_auth{}; }; @@ -239,6 +240,7 @@ class JsonParser { bool m_structurize_arrays{false}; bool m_record_log_order{true}; bool m_retain_float_format{false}; + bool m_sanitize_invalid_json{false}; absl::flat_hash_map, std::pair> m_ir_node_to_archive_node_id_mapping; diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp index 59b18a5bca..321032788f 100644 --- a/components/core/src/clp_s/Utils.cpp +++ b/components/core/src/clp_s/Utils.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -163,6 +164,241 @@ bool UriUtils::get_last_uri_component(std::string_view const uri, std::string& n return true; } +BufferSanitizeResult StringUtils::sanitize_json_buffer( + char*& buf, + size_t& buf_size, + size_t buf_occupied, + size_t simdjson_padding +) { + // Early return for empty or null buffer + if (nullptr == buf || 0 == buf_occupied) { + return {buf_occupied, {}}; + } + + // Sanitize by escaping control characters inside JSON strings. Uses a lazy-copy approach: + // only allocate the output string when the first control character is found, and copy + // valid ranges in bulk rather than byte-by-byte for efficiency. + // + // Note: If the buffer contains unmatched quotes (e.g., truncated JSON), the string state + // tracking may be incorrect, potentially escaping control characters outside of actual + // JSON strings. This is acceptable since such malformed JSON will fail parsing anyway. 
+ std::string sanitized; + std::map sanitized_char_counts; + bool in_string = false; + bool escape_next = false; + size_t copy_start = 0; + + for (size_t i = 0; i < buf_occupied; ++i) { + char const c = buf[i]; + + if (escape_next) { + escape_next = false; + continue; + } + + if ('\\' == c && in_string) { + escape_next = true; + continue; + } + + if ('"' == c) { + in_string = !in_string; + continue; + } + + // Escape control characters (0x00-0x1F) inside strings to \u00XX format + if (in_string && static_cast(c) < 0x20) { + if (sanitized.empty()) { + // First control char found - allocate with extra space for escapes + sanitized.reserve(buf_occupied + 64); + } + // Copy the valid range before this control character + sanitized.append(buf + copy_start, i - copy_start); + char_to_escaped_four_char_hex(sanitized, c); + ++sanitized_char_counts[c]; + copy_start = i + 1; + } + } + + // If no sanitization was needed, return early without any allocation + if (sanitized.empty()) { + return {buf_occupied, {}}; + } + + // Copy any remaining content after the last control character + sanitized.append(buf + copy_start, buf_occupied - copy_start); + + // Grow buffer if needed to hold sanitized content + if (sanitized.size() > buf_size) { + size_t const new_buf_size = sanitized.size(); + char* new_buf = new char[new_buf_size + simdjson_padding]; + delete[] buf; + buf = new_buf; + buf_size = new_buf_size; + } + + // Copy sanitized content to buffer + std::memcpy(buf, sanitized.data(), sanitized.size()); + return {sanitized.size(), std::move(sanitized_char_counts)}; +} + +namespace { +/** + * Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx pattern). + * @param byte The byte to check + * @return true if the byte is a valid continuation byte (0x80-0xBF) + */ +constexpr bool is_continuation_byte(unsigned char byte) { + return (byte & 0xC0) == 0x80; +} + +/** + * Validates a UTF-8 sequence starting at the given position and returns the sequence length. 
+ * @param buf The buffer containing the UTF-8 data + * @param pos Current position in the buffer + * @param buf_occupied Total bytes in the buffer + * @return The length of the valid UTF-8 sequence (1-4), or 0 if invalid + */ +size_t validate_utf8_sequence(char const* buf, size_t pos, size_t buf_occupied) { + auto const byte = static_cast(buf[pos]); + size_t remaining = buf_occupied - pos; + + // ASCII (0x00-0x7F) + if (byte <= 0x7F) { + return 1; + } + + // Invalid: continuation byte without leader, or invalid lead bytes + if (byte < 0xC2 || byte > 0xF4) { + return 0; + } + + // 2-byte sequence (0xC2-0xDF) + if (byte <= 0xDF) { + if (remaining < 2 || !is_continuation_byte(static_cast(buf[pos + 1]))) { + return 0; + } + return 2; + } + + // 3-byte sequence (0xE0-0xEF) + if (byte <= 0xEF) { + if (remaining < 3) { + return 0; + } + auto const byte2 = static_cast(buf[pos + 1]); + auto const byte3 = static_cast(buf[pos + 2]); + + if (!is_continuation_byte(byte2) || !is_continuation_byte(byte3)) { + return 0; + } + + // Check for overlong encoding (E0 requires second byte >= 0xA0) + if (byte == 0xE0 && byte2 < 0xA0) { + return 0; + } + + // Check for surrogate code points (ED with second byte >= 0xA0 means 0xD800-0xDFFF) + if (byte == 0xED && byte2 >= 0xA0) { + return 0; + } + + return 3; + } + + // 4-byte sequence (0xF0-0xF4) + if (remaining < 4) { + return 0; + } + auto const byte2 = static_cast(buf[pos + 1]); + auto const byte3 = static_cast(buf[pos + 2]); + auto const byte4 = static_cast(buf[pos + 3]); + + if (!is_continuation_byte(byte2) || !is_continuation_byte(byte3) + || !is_continuation_byte(byte4)) + { + return 0; + } + + // Check for overlong encoding (F0 requires second byte >= 0x90) + if (byte == 0xF0 && byte2 < 0x90) { + return 0; + } + + // Check for code points > U+10FFFF (F4 requires second byte <= 0x8F) + if (byte == 0xF4 && byte2 > 0x8F) { + return 0; + } + + return 4; +} +} // namespace + +BufferSanitizeResult StringUtils::sanitize_utf8_buffer( + 
char*& buf, + size_t& buf_size, + size_t buf_occupied, + size_t simdjson_padding +) { + // Early return for empty or null buffer + if (nullptr == buf || 0 == buf_occupied) { + return {buf_occupied, {}}; + } + + // Sanitize by replacing invalid UTF-8 sequences with U+FFFD. Uses a lazy-copy approach: + // only allocate the output string when the first invalid sequence is found, and copy + // valid ranges in bulk rather than byte-by-byte for efficiency. + constexpr std::string_view cUtf8ReplacementChar{"\xEF\xBF\xBD", 3}; + constexpr char cInvalidUtf8Key = static_cast(0xFF); + + std::string sanitized; + std::map sanitized_char_counts; + size_t copy_start = 0; + + size_t i = 0; + while (i < buf_occupied) { + size_t const seq_len = validate_utf8_sequence(buf, i, buf_occupied); + if (seq_len > 0) { + // Valid sequence - advance position (will be copied in bulk later) + i += seq_len; + } else { + // Invalid sequence - need to sanitize + if (sanitized.empty()) { + // First invalid sequence found - allocate with extra space for replacements + sanitized.reserve(buf_occupied + 64); + } + // Copy the valid range before this invalid byte + sanitized.append(buf + copy_start, i - copy_start); + sanitized.append(cUtf8ReplacementChar); + ++sanitized_char_counts[cInvalidUtf8Key]; + // Skip one byte and continue (every invalid byte becomes its own U+FFFD) + ++i; + copy_start = i; + } + } + + // If no sanitization was needed, return early without any allocation + if (sanitized.empty()) { + return {buf_occupied, {}}; + } + + // Copy any remaining valid content after the last invalid byte + sanitized.append(buf + copy_start, buf_occupied - copy_start); + + // Grow buffer if needed to hold sanitized content + if (sanitized.size() > buf_size) { + size_t const new_buf_size = sanitized.size(); + char* new_buf = new char[new_buf_size + simdjson_padding]; + delete[] buf; + buf = new_buf; + buf_size = new_buf_size; + } + + // Copy sanitized content to buffer + std::memcpy(buf, sanitized.data(),
sanitized.size()); + return {sanitized.size(), std::move(sanitized_char_counts)}; +} + void StringUtils::escape_json_string(std::string& destination, std::string_view const source) { // Escaping is implemented using this `append_unescaped_slice` approach to offer a fast path // when strings are mostly or entirely valid escaped JSON. Benchmarking shows that this offers diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp index a711eb8ad6..95033e18c3 100644 --- a/components/core/src/clp_s/Utils.hpp +++ b/components/core/src/clp_s/Utils.hpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,18 @@ class UriUtils { static bool get_last_uri_component(std::string_view const uri, std::string& name); }; +/** + * Result of sanitizing a buffer, including statistics about what was sanitized. + */ +struct BufferSanitizeResult { + /// New number of bytes occupied in the buffer after sanitization. + size_t new_buf_occupied; + /// Map of sanitized characters to their occurrence counts. + /// For JSON control char sanitization: key is the actual control character (0x00-0x1F). + /// For UTF-8 sanitization: key is 0xFF (sentinel value for all invalid sequences). + std::map sanitized_char_counts; +}; + class StringUtils { public: /** @@ -81,6 +94,57 @@ class StringUtils { */ static void escape_json_string(std::string& destination, std::string_view const source); + /** + * Sanitizes a JSON buffer by escaping unescaped control characters (0x00-0x1F) inside JSON + * strings. This is used to fix malformed JSON that contains raw control characters which + * would cause parsing errors. + * + * Only control characters inside JSON strings (keys or values, between unescaped double + * quotes) are escaped. Control characters outside strings are left unchanged: they are + * either valid JSON whitespace (tab, LF, CR) or structural errors escaping cannot repair. + * + * Characters are escaped using unicode escape sequences (e.g., \x00 becomes \u0000).
+ * + * @param buf Reference to pointer to the buffer. May be reallocated if expansion is needed. + * The caller's pointer will be updated to point to the new buffer if reallocation + * occurs. The caller is responsible for deleting the buffer. + * @param buf_size Current allocated size of the buffer (excluding simdjson padding). + * This parameter is modified by reference and will be updated if the buffer + * is reallocated to reflect the new buffer size. + * @param buf_occupied Number of bytes currently used in the buffer. + * @param simdjson_padding Amount of padding required after buffer content. + * @return Result containing new buf_occupied and counts of each sanitized character. + */ + static BufferSanitizeResult sanitize_json_buffer( + char*& buf, + size_t& buf_size, + size_t buf_occupied, + size_t simdjson_padding + ); + + /** + * Sanitizes a buffer by replacing invalid UTF-8 sequences with the Unicode replacement + * character (U+FFFD). This is used to fix malformed data that contains invalid UTF-8 + * which would cause JSON parsing errors. + * + * @param buf Reference to pointer to the buffer. May be reallocated if expansion is needed. + * The caller's pointer will be updated to point to the new buffer if reallocation + * occurs. The caller is responsible for deleting the buffer. + * @param buf_size Current allocated size of the buffer (excluding simdjson padding). + * This parameter is modified by reference and will be updated if the buffer + * is reallocated to reflect the new buffer size. + * @param buf_occupied Number of bytes currently used in the buffer. + * @param simdjson_padding Amount of padding required after buffer content. + * @return Result containing new buf_occupied and counts of each type of invalid sequence + * replaced. 
+ */ + static BufferSanitizeResult sanitize_utf8_buffer( + char*& buf, + size_t& buf_size, + size_t buf_occupied, + size_t simdjson_padding + ); + private: /** * Converts a character into its two byte hexadecimal representation. @@ -88,7 +152,6 @@ class StringUtils { * @return the two byte hexadecimal representation of c as an array of two characters. */ static std::array char_to_hex(char c) { - std::array ret; auto nibble_to_hex = [](char nibble) -> char { if ('\x00' <= nibble && nibble <= '\x09') { return '0' + (nibble - '\x00'); @@ -97,7 +160,7 @@ class StringUtils { } }; - return std::array{nibble_to_hex(0x0F & (c >> 4)), nibble_to_hex(0x0f & c)}; + return {nibble_to_hex(0x0F & (c >> 4)), nibble_to_hex(0x0f & c)}; } /** diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp index c2cf7b6abd..fd24634680 100644 --- a/components/core/src/clp_s/clp-s.cpp +++ b/components/core/src/clp_s/clp-s.cpp @@ -107,6 +107,7 @@ bool compress(CommandLineArguments const& command_line_arguments) { option.single_file_archive = command_line_arguments.get_single_file_archive(); option.structurize_arrays = command_line_arguments.get_structurize_arrays(); option.record_log_order = command_line_arguments.get_record_log_order(); + option.sanitize_invalid_json = command_line_arguments.get_sanitize_invalid_json(); clp_s::JsonParser parser(option); if (false == parser.ingest()) { diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp new file mode 100644 index 0000000000..e3ef434ecb --- /dev/null +++ b/components/core/tests/test-clp_s-StringUtils.cpp @@ -0,0 +1,539 @@ +#include +#include +#include +#include + +#include +#include + +#include "../src/clp_s/Utils.hpp" + +using clp_s::BufferSanitizeResult; +using clp_s::StringUtils; + +// We use C++14 string literals (the `s` suffix) to construct strings containing embedded null bytes +// and control characters. 
Without the `s` suffix, std::string's constructor from a C-string literal +// stops at the first null byte, truncating the test input. Additionally, we use string +// concatenation (e.g., "\x00" "end") to prevent hex escape sequences from being extended by +// following hex digits. +using namespace std::string_literals; + +namespace { +/** + * Result structure containing both the sanitization result and the output string. + */ +struct SanitizeResultWithOutput { + BufferSanitizeResult result; + std::string output; +}; + +/** + * Helper to create a buffer with simdjson padding and run JSON sanitization. + * Returns both the result (with character counts) and the sanitized output string. + * @param input The input string to sanitize + * @return SanitizeResultWithOutput containing result metadata and sanitized string + * @note sanitize_json_buffer may reallocate the buffer (passed by reference). This helper + * handles both reallocation and no-reallocation cases correctly. + */ +auto sanitize_json_with_result(std::string_view input) -> SanitizeResultWithOutput { + size_t buf_size = input.size(); + size_t buf_occupied = input.size(); + // Use raw pointer since sanitize_json_buffer may delete and reallocate + char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING]; + std::memcpy(buf, input.data(), input.size()); + + auto result = StringUtils::sanitize_json_buffer( + buf, + buf_size, + buf_occupied, + simdjson::SIMDJSON_PADDING + ); + std::string output(buf, result.new_buf_occupied); + // buf now points to the final buffer (may have been reallocated) + delete[] buf; + return {std::move(result), std::move(output)}; +} + +/** + * Helper to create a buffer with simdjson padding and run JSON sanitization. 
+ * @param input The input string to sanitize + * @return The sanitized string + */ +auto sanitize_string(std::string_view input) -> std::string { + return sanitize_json_with_result(input).output; +} + +/** + * Helper to verify that a sanitized JSON string can be parsed by simdjson. + * @param json The JSON string to parse + * @return true if parsing succeeds, false otherwise + */ +auto can_parse_json(std::string_view json) -> bool { + simdjson::ondemand::parser parser; + simdjson::padded_string padded(json); + auto result = parser.iterate(padded); + return false == result.error(); +} +} // namespace + +TEST_CASE("sanitize_json_buffer_no_changes", "[clp_s][StringUtils]") { + // Valid JSON without control characters should pass through unchanged + SECTION("simple object") { + std::string input = R"({"key": "value"})"; + REQUIRE(sanitize_string(input) == input); + } + + SECTION("object with escaped characters") { + std::string input = R"({"key": "line1\nline2\ttab"})"; + REQUIRE(sanitize_string(input) == input); + } + + SECTION("object with unicode escapes") { + std::string input = R"({"key": "null char: \u0000"})"; + REQUIRE(sanitize_string(input) == input); + } + + SECTION("multiple objects") { + std::string input = R"({"a": "1"}{"b": "2"})"; + REQUIRE(sanitize_string(input) == input); + } + + SECTION("control chars outside strings are unchanged") { + // Control chars outside strings aren't valid JSON anyway, + // but we don't touch them - let the JSON parser report the error + std::string input = "{\x01\"key\": \"value\"}"; + REQUIRE(sanitize_string(input) == input); + } +} + +TEST_CASE("sanitize_json_buffer_escapes_control_chars", "[clp_s][StringUtils]") { + SECTION("null byte in string value") { + auto input = "{\"key\": \"val\x00ue\"}"s; + std::string expected = R"({"key": "val\u0000ue"})"; + REQUIRE(sanitize_string(input) == expected); + } + + SECTION("multiple null bytes") { + auto input = "{\"key\": \"\x00\x00\x00\"}"s; + std::string expected = R"({"key":
"\u0000\u0000\u0000"})"; + REQUIRE(sanitize_string(input) == expected); + } + + SECTION("various control characters") { + // Test 0x01 (SOH), 0x02 (STX), 0x1F (US) + auto input = "{\"key\": \"a\x01" + "b\x02" + "c\x1F" + "d\"}"s; + std::string expected = R"({"key": "a\u0001b\u0002c\u001fd"})"; + REQUIRE(sanitize_string(input) == expected); + } + + SECTION("control char in key") { + auto input = "{\"ke\x00y\": \"value\"}"s; + std::string expected = R"({"ke\u0000y": "value"})"; + REQUIRE(sanitize_string(input) == expected); + } + + SECTION("mixed valid escapes and control chars") { + auto input = "{\"key\": \"line1\\nhas\x00null\"}"s; + std::string expected = R"({"key": "line1\nhas\u0000null"})"; + REQUIRE(sanitize_string(input) == expected); + } +} + +TEST_CASE("sanitize_json_buffer_handles_escapes_correctly", "[clp_s][StringUtils]") { + SECTION("escaped quote should not toggle string state") { + // The \" should not end the string + std::string input = R"({"key": "quote:\"value"})"; + REQUIRE(sanitize_string(input) == input); + } + + SECTION("escaped backslash before quote") { + // \\" means escaped backslash followed by end of string + std::string input = R"({"key": "backslash:\\"})"; + REQUIRE(sanitize_string(input) == input); + } + + SECTION("control char after escaped backslash") { + // \\\x00 means escaped backslash followed by a control char still inside the string + auto input = "{\"key\": \"slash:\\\\\x00" + "end\"}"s; + std::string expected = R"({"key": "slash:\\\u0000end"})"; + REQUIRE(sanitize_string(input) == expected); + } + + SECTION("control char after escaped quote") { + auto input = "{\"key\": \"quote:\\\"\x00" + "after\"}"s; + std::string expected = R"({"key": "quote:\"\u0000after"})"; + REQUIRE(sanitize_string(input) == expected); + } + + SECTION("control char as invalid escape sequence target") { + // Backslash + raw NUL is not a valid JSON escape sequence, but we leave it as-is + // since the sanitizer treats any char after backslash as an escape target.
+ // This JSON is invalid anyway (bad escape sequence) and will fail parsing. + auto input = "{\"key\": \"bad:\\\x00" + "end\"}"s; + auto expected = "{\"key\": \"bad:\\\x00" + "end\"}"s; + REQUIRE(sanitize_string(input) == expected); + } + + SECTION("triple backslash followed by control char") { + // \\\ followed by \x00: first \\ is escaped backslash, third \ starts escape sequence. + // The third backslash makes the null appear to be an escape target, so it's not escaped. + auto input = "{\"key\": \"\\\\\\\x00" + "end\"}"s; + auto expected = "{\"key\": \"\\\\\\\x00" + "end\"}"s; + REQUIRE(sanitize_string(input) == expected); + } +} + +TEST_CASE("sanitize_json_buffer_result_is_valid_json", "[clp_s][StringUtils]") { + SECTION("sanitized null byte produces valid JSON") { + auto input = "{\"key\": \"val\x00ue\"}"s; + std::string sanitized = sanitize_string(input); + REQUIRE(can_parse_json(sanitized)); + } + + SECTION("sanitized multiple control chars produces valid JSON") { + auto input = "{\"data\": \"\x01\x02\x03\x04\x05\"}"s; + std::string sanitized = sanitize_string(input); + REQUIRE(can_parse_json(sanitized)); + } +} + +TEST_CASE("sanitize_json_buffer_handles_buffer_growth", "[clp_s][StringUtils]") { + // Each control char expands from 1 byte to 6 bytes (\u00XX) + // Create input that will require buffer growth + SECTION("many control chars requiring expansion") { + // 100 null bytes -> 600 bytes after escaping + std::string value(100, '\x00'); + std::string input = "{\"key\": \"" + value + "\"}"; + + std::string sanitized = sanitize_string(input); + + // Verify all nulls were escaped + REQUIRE(sanitized.find('\x00') == std::string::npos); + // Verify correct number of escape sequences + size_t count = 0; + size_t pos = 0; + while ((pos = sanitized.find("\\u0000", pos)) != std::string::npos) { + ++count; + pos += 6; + } + REQUIRE(count == 100); + // Verify result is valid JSON + REQUIRE(can_parse_json(sanitized)); + } +} + 
+TEST_CASE("sanitize_json_buffer_jsonl", "[clp_s][StringUtils]") {
+    SECTION("multiple JSON objects with control chars") {
+        auto input = "{\"a\": \"x\x00y\"}\n{\"b\": \"p\x01q\"}"s;
+        std::string expected = "{\"a\": \"x\\u0000y\"}\n{\"b\": \"p\\u0001q\"}";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+}
+
+TEST_CASE("sanitize_json_buffer_truncated_json", "[clp_s][StringUtils]") {
+    SECTION("truncated JSON with unmatched quote - control chars may be escaped incorrectly") {
+        // This relates to the edge case documented in the code: if JSON is truncated with
+        // unmatched quotes, string state tracking may be incorrect, potentially escaping control
+        // chars outside of actual JSON strings. This is acceptable since malformed JSON will fail
+        // parsing anyway.
+        //
+        // NOTE: despite the section name, the quotes in this particular input are balanced. Both
+        // control chars sit inside the string value; the literal is split in two only so that
+        // "\x01" is not extended by the following hex digit 'a'.
+        auto input = "{\"key\": \"unclosed string\x00\x01"
+                     "after\"}"s;
+        std::string sanitized = sanitize_string(input);
+        // Deliberately lenient check: either no raw NUL remains or an escaped NUL is present.
+        // The important thing is that the function doesn't crash and handles it gracefully.
+        REQUIRE(
+                (sanitized.find('\x00') == std::string::npos
+                 || sanitized.find("\\u0000") != std::string::npos)
+        );
+    }
+
+    SECTION("truncated JSON ending mid-string") {
+        // JSON truncated in the middle of a string with control chars (no closing quote)
+        // This simulates a real truncation scenario where the buffer ends mid-string
+        auto input = "{\"key\": \"value\x00\x01"
+                     "truncated"s;
+        std::string sanitized = sanitize_string(input);
+        // Should handle gracefully without crashing
+        // Control chars inside the (unclosed) string should be escaped
+        REQUIRE(sanitized.size() >= input.size());
+        // Verify no raw control characters remain (they should be escaped)
+        REQUIRE(sanitized.find('\x00') == std::string::npos);
+        REQUIRE(sanitized.find('\x01') == std::string::npos);
+    }
+
+    SECTION("truncated JSON with control chars after unmatched quote") {
+        // The value's opening quote is never closed, so the '}' and the trailing control chars
+        // all follow an unmatched quote. The sanitizer may escape these if it thinks we're still
+        // in a string.
+        auto input = "{\"key\": \"value\x00"
+                     "}\x01\x02"s;
+        std::string sanitized = sanitize_string(input);
+        // Function should not crash - behavior may vary but should be consistent
+        REQUIRE(sanitized.size() >= input.size());
+    }
+}
+
+// =====================================================================================
+// UTF-8 Sanitization Tests
+// =====================================================================================
+
+namespace {
+/**
+ * Helper to create a buffer with simdjson padding and run UTF-8 sanitization.
+ * Returns both the result (with character counts) and the sanitized output string.
+ * @param input The input string to sanitize
+ * @return SanitizeResultWithOutput containing result metadata and sanitized string
+ * @note sanitize_utf8_buffer may reallocate the buffer (passed by reference). This helper
+ * handles both reallocation and no-reallocation cases correctly.
+ */
+auto sanitize_utf8_with_result(std::string_view input) -> SanitizeResultWithOutput {
+    size_t buf_size = input.size();
+    size_t buf_occupied = input.size();
+    // Use raw pointer since sanitize_utf8_buffer may delete and reallocate
+    char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
+    std::memcpy(buf, input.data(), input.size());
+
+    auto result = StringUtils::sanitize_utf8_buffer(
+            buf,
+            buf_size,
+            buf_occupied,
+            simdjson::SIMDJSON_PADDING
+    );
+    std::string output(buf, result.new_buf_occupied);
+    // buf now points to the final buffer (may have been reallocated)
+    delete[] buf;
+    return {std::move(result), std::move(output)};
+}
+
+/**
+ * Helper to create a buffer with simdjson padding and run UTF-8 sanitization.
+ * @param input The input string to sanitize
+ * @return The sanitized string
+ */
+auto sanitize_utf8_string(std::string_view input) -> std::string {
+    return sanitize_utf8_with_result(input).output;
+}
+
+/**
+ * Count occurrences of U+FFFD replacement character (0xEF 0xBF 0xBD) in a string.
+ * @param s The bytes to scan
+ * @return The number of non-overlapping 0xEF 0xBF 0xBD byte triples found in `s`
+ */
+auto count_replacement_chars(std::string_view s) -> size_t {
+    size_t count = 0;
+    size_t i = 0;
+    while (i + 2 < s.size()) {
+        // Compare as unsigned char: where plain `char` is signed, bytes >= 0x80 are negative and
+        // would never equal 0xEF/0xBF/0xBD.
+        if (static_cast<unsigned char>(s[i]) == 0xEF
+            && static_cast<unsigned char>(s[i + 1]) == 0xBF
+            && static_cast<unsigned char>(s[i + 2]) == 0xBD)
+        {
+            ++count;
+            i += 3;  // Skip past the replacement char (3 bytes for U+FFFD)
+        } else {
+            ++i;
+        }
+    }
+    return count;
+}
+}  // namespace
+
+TEST_CASE("sanitize_utf8_buffer_no_changes", "[clp_s][StringUtils]") {
+    SECTION("valid ASCII") {
+        std::string input = "Hello, World!";
+        REQUIRE(sanitize_utf8_string(input) == input);
+    }
+
+    SECTION("valid UTF-8 multibyte characters") {
+        // 2-byte: é (U+00E9) = 0xC3 0xA9
+        // 3-byte: € (U+20AC) = 0xE2 0x82 0xAC
+        // 4-byte: 𝄞 (U+1D11E) = 0xF0 0x9D 0x84 0x9E
+        std::string input = "café € 𝄞";
+        REQUIRE(sanitize_utf8_string(input) == input);
+    }
+
+    SECTION("valid JSON with UTF-8") {
+        std::string input = R"({"msg": "日本語テスト"})";
+        REQUIRE(sanitize_utf8_string(input) == input);
+    }
+
+    SECTION("empty string") {
+        std::string input = "";
+        REQUIRE(sanitize_utf8_string(input) == input);
+    }
+}
+
+TEST_CASE("sanitize_utf8_buffer_replaces_invalid_bytes", "[clp_s][StringUtils]") {
+    // U+FFFD is encoded as 0xEF 0xBF 0xBD in UTF-8
+    constexpr char cReplacementChar[] = "\xEF\xBF\xBD";
+
+    SECTION("single invalid byte 0xFF") {
+        auto input = "hello\xFF world"s;
+        std::string expected = "hello" + std::string(cReplacementChar) + " world";
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("single invalid byte 0xFE") {
+        auto input = "test\xFE"s;
+        std::string expected = "test" + std::string(cReplacementChar);
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("multiple invalid bytes") {
+        auto input = "\xFF\xFE\xFF"s;
+        std::string expected = std::string(cReplacementChar) + cReplacementChar + cReplacementChar;
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("continuation byte without leader (0x80-0xBF)") {
+        auto input = "test\x80 value"s;
+        std::string expected = "test" + std::string(cReplacementChar) + " value";
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("truncated 2-byte sequence") {
+        // 0xC3 expects one continuation byte, but we end the string
+        auto input = "test\xC3"s;
+        std::string expected = "test" + std::string(cReplacementChar);
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("truncated 3-byte sequence") {
+        // 0xE2 expects two continuation bytes
+        auto input = "test\xE2\x82"s;
+        std::string expected
+                = "test" + std::string(cReplacementChar) + std::string(cReplacementChar);
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("truncated 4-byte sequence") {
+        // 0xF0 expects three continuation bytes
+        auto input = "test\xF0\x9D\x84"s;
+        std::string expected = "test" + std::string(cReplacementChar)
+                               + std::string(cReplacementChar) + std::string(cReplacementChar);
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("overlong 2-byte encoding") {
+        // 0xC0 0x80 is overlong encoding of NUL (should be just 0x00)
+        // 0xC0 and 0xC1 are never valid UTF-8 lead bytes
+        auto input = "test\xC0\x80"s;
+        // Both bytes are invalid
+        std::string expected
+                = "test" + std::string(cReplacementChar) + std::string(cReplacementChar);
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("surrogate code points (U+D800-U+DFFF)") {
+        // 0xED 0xA0 0x80 encodes U+D800 (invalid surrogate)
+        auto input = "test\xED\xA0\x80 end"s;
+        // The sequence is invalid, bytes replaced individually
+        std::string expected = "test" + std::string(cReplacementChar)
+                               + std::string(cReplacementChar) + std::string(cReplacementChar)
+                               + " end";
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("code point above U+10FFFF") {
+        // 0xF4 0x90 0x80 0x80 would encode U+110000 (above max)
+        auto input = "test\xF4\x90\x80\x80 end"s;
+        std::string sanitized = sanitize_utf8_string(input);
+        // Should contain replacement characters
+        REQUIRE(count_replacement_chars(sanitized) >= 1);
+    }
+}
+
+TEST_CASE("sanitize_utf8_buffer_mixed_valid_invalid", "[clp_s][StringUtils]") {
+    constexpr char cReplacementChar[] = "\xEF\xBF\xBD";
+
+    SECTION("invalid byte between valid UTF-8") {
+        // café with invalid byte in middle
+        auto input = "caf\xC3\xA9\xFF test"s;  // café + 0xFF + " test"
+        std::string expected = "caf\xC3\xA9" + std::string(cReplacementChar) + " test";
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("JSON with invalid UTF-8 in value") {
+        auto input = "{\"msg\": \"test\xFF value\"}"s;
+        std::string expected = "{\"msg\": \"test" + std::string(cReplacementChar) + " value\"}";
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+}
+
+TEST_CASE("sanitize_utf8_buffer_returns_correct_counts", "[clp_s][StringUtils]") {
+    SECTION("counts invalid sequences") {
+        auto input = "\xFF\xFE\xFF"s;  // 3 invalid bytes
+
+        auto [result, output] = sanitize_utf8_with_result(input);
+
+        // The count uses 0xFF as a special key for invalid UTF-8 sequences
+        REQUIRE_FALSE(result.sanitized_char_counts.empty());
+        size_t total = 0;
+        for (auto const& [ch, count] : result.sanitized_char_counts) {
+            total += count;
+        }
+        REQUIRE(total == 3);
+    }
+
+    SECTION("returns empty counts when no sanitization needed") {
+        std::string input = "valid UTF-8 string";
+
+        auto [result, output] = sanitize_utf8_with_result(input);
+
+        REQUIRE(result.sanitized_char_counts.empty());
+        REQUIRE(result.new_buf_occupied == input.size());
+    }
+}
+
+TEST_CASE("sanitize_utf8_buffer_handles_buffer_growth", "[clp_s][StringUtils]") {
+    // Each invalid byte (1 byte) becomes U+FFFD (3 bytes), so buffer may need to grow
+    SECTION("many invalid bytes requiring expansion") {
+        // 100 invalid bytes -> 300 bytes after replacement with U+FFFD
+        std::string input(100, '\xFF');
+
+        std::string sanitized = sanitize_utf8_string(input);
+
+        // Verify all bytes were replaced
+        REQUIRE(sanitized.find('\xFF') == std::string::npos);
+        // Verify correct number of replacement characters
+        REQUIRE(count_replacement_chars(sanitized) == 100);
+        // Verify size: 100 * 3 = 300 bytes
+        REQUIRE(sanitized.size() == 300);
+    }
+}
+
+// =====================================================================================
+// JSON Sanitization Character Count Tests
+// =====================================================================================
+
+TEST_CASE("sanitize_json_buffer_returns_correct_char_counts", "[clp_s][StringUtils]") {
+    SECTION("counts multiple different control characters") {
+        // Input with: 3x \x00, 2x \x01, 1x \x1f
+        auto input = "{\"a\": \"\x00\x00\x01\x00\x01\x1f\"}"s;
+
+        auto [result, output] = sanitize_json_with_result(input);
+
+        REQUIRE(result.sanitized_char_counts.size() == 3);
+        REQUIRE(result.sanitized_char_counts.at('\x00') == 3);
+        REQUIRE(result.sanitized_char_counts.at('\x01') == 2);
+        REQUIRE(result.sanitized_char_counts.at('\x1f') == 1);
+    }
+
+    SECTION("returns empty counts when no sanitization needed") {
+        std::string input = R"({"key": "valid value"})";
+
+        auto [result, output] = sanitize_json_with_result(input);
+
+        REQUIRE(result.sanitized_char_counts.empty());
+        REQUIRE(result.new_buf_occupied == input.size());
+    }
+}