diff --git a/components/core/CMakeLists.txt b/components/core/CMakeLists.txt
index 9e9738f05d..f183f15e48 100644
--- a/components/core/CMakeLists.txt
+++ b/components/core/CMakeLists.txt
@@ -704,6 +704,7 @@ set(SOURCE_FILES_unitTest
         tests/test-clp_s-end_to_end.cpp
         tests/test-clp_s-range_index.cpp
         tests/test-clp_s-search.cpp
+        tests/test-clp_s-StringUtils.cpp
         tests/test-EncodedVariableInterpreter.cpp
         tests/test-encoding_methods.cpp
         tests/test-ffi_IrUnitHandlerReq.cpp
diff --git a/components/core/src/clp_s/CommandLineArguments.cpp b/components/core/src/clp_s/CommandLineArguments.cpp
index 72c0d50d80..fef29442f6 100644
--- a/components/core/src/clp_s/CommandLineArguments.cpp
+++ b/components/core/src/clp_s/CommandLineArguments.cpp
@@ -256,6 +256,12 @@ CommandLineArguments::parse_arguments(int argc, char const** argv) {
                     "structurize-arrays",
                     po::bool_switch(&m_structurize_arrays),
                     "Structurize arrays instead of compressing them as clp strings."
+            )(
+                    "sanitize-invalid-json",
+                    po::bool_switch(&m_sanitize_invalid_json),
+                    "Sanitize invalid JSON by escaping unescaped control characters (0x00-0x1F),"
+                    " replacing invalid UTF-8 sequences with U+FFFD, and handling invalid"
+                    " surrogate escapes. When disabled (default), parsing fails on invalid JSON."
             )(
                     "disable-log-order",
                     po::bool_switch(&m_disable_log_order),
diff --git a/components/core/src/clp_s/CommandLineArguments.hpp b/components/core/src/clp_s/CommandLineArguments.hpp
index b35124981a..2e32b57a68 100644
--- a/components/core/src/clp_s/CommandLineArguments.hpp
+++ b/components/core/src/clp_s/CommandLineArguments.hpp
@@ -113,6 +113,8 @@ class CommandLineArguments {
 
     bool get_structurize_arrays() const { return m_structurize_arrays; }
 
+    bool get_sanitize_invalid_json() const { return m_sanitize_invalid_json; }
+
     bool get_ordered_decompression() const { return m_ordered_decompression; }
 
     size_t get_target_ordered_chunk_size() const { return m_target_ordered_chunk_size; }
@@ -202,6 +204,7 @@ class CommandLineArguments {
     bool m_no_retain_float_format{false};
     bool m_single_file_archive{false};
     bool m_structurize_arrays{false};
+    bool m_sanitize_invalid_json{false};
     bool m_ordered_decompression{false};
     size_t m_target_ordered_chunk_size{};
     bool m_print_ordered_chunk_stats{false};
diff --git a/components/core/src/clp_s/JsonFileIterator.cpp b/components/core/src/clp_s/JsonFileIterator.cpp
index a0a003d9f7..447e73617d 100644
--- a/components/core/src/clp_s/JsonFileIterator.cpp
+++ b/components/core/src/clp_s/JsonFileIterator.cpp
@@ -3,18 +3,25 @@
 #include <cctype>
 #include <cstring>
 
+#include <fmt/format.h>
 #include <spdlog/spdlog.h>
 
+#include "Utils.hpp"
+
 namespace clp_s {
 JsonFileIterator::JsonFileIterator(
         clp::ReaderInterface& reader,
         size_t max_document_size,
+        bool sanitize_invalid_json,
+        std::string path,
         size_t buf_size
 )
         : m_buf_size(buf_size),
           m_max_document_size(max_document_size),
           m_buf(new char[buf_size + simdjson::SIMDJSON_PADDING]),
-          m_reader(reader) {
+          m_reader(reader),
+          m_path(std::move(path)),
+          m_sanitize_invalid_json(sanitize_invalid_json) {
     read_new_json();
 }
 
@@ -64,6 +71,26 @@ bool JsonFileIterator::read_new_json() {
         )
                              .get(m_stream);
 
+        // If sanitization is enabled and we encounter errors that can be fixed by sanitization,
+        // sanitize the buffer and retry parsing
+        if (m_sanitize_invalid_json) {
+            // Handle invalid UTF-8 sequences by replacing with U+FFFD
+            if (simdjson::error_code::UTF8_ERROR == error) {
+                // Return value intentionally ignored - in read_new_json we always retry after
+                // sanitization regardless of whether changes were made
+                static_cast<void>(sanitize_invalid_utf8_and_log());
+                error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream);
+            }
+
+            // Handle unescaped control characters by escaping them to \u00XX format
+            if (simdjson::error_code::UNESCAPED_CHARS == error) {
+                // Return value intentionally ignored - in read_new_json we always retry after
+                // sanitization regardless of whether changes were made
+                static_cast<void>(sanitize_control_chars_and_log());
+                error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream);
+            }
+        }
+
         if (error) {
             m_error_code = error;
             return false;
@@ -95,6 +122,90 @@ size_t JsonFileIterator::skip_whitespace_and_get_truncated_bytes() {
     return m_buf_occupied - m_next_document_position;
 }
 
+bool JsonFileIterator::sanitize_invalid_utf8_and_log() {
+    size_t const old_buf_occupied = m_buf_occupied;
+    // Note: sanitize_utf8_buffer may reallocate m_buf and will update m_buf_size by reference if
+    // reallocation is needed. This keeps m_buf_size in sync with the actual allocated buffer size.
+    auto const result = StringUtils::sanitize_utf8_buffer(
+            m_buf,
+            m_buf_size,
+            m_buf_occupied,
+            simdjson::SIMDJSON_PADDING
+    );
+    m_buf_occupied = result.new_buf_occupied;
+    m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied;
+
+    if (result.sanitized_char_counts.empty()) {
+        return false;
+    }
+
+    size_t total_replaced = 0;
+    for (auto const& [ch, count] : result.sanitized_char_counts) {
+        total_replaced += count;
+    }
+    SPDLOG_WARN(
+            "Replaced {} invalid UTF-8 sequence(s) with U+FFFD{}. Buffer expanded by {} bytes "
+            "({} -> {}).",
+            total_replaced,
+            m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
+            static_cast<int64_t>(m_buf_occupied) - static_cast<int64_t>(old_buf_occupied),
+            old_buf_occupied,
+            m_buf_occupied
+    );
+    return true;
+}
+
+bool JsonFileIterator::sanitize_control_chars_and_log() {
+    size_t const old_buf_occupied = m_buf_occupied;
+    // Note: sanitize_json_buffer may reallocate m_buf and will update m_buf_size by reference if
+    // reallocation is needed. This keeps m_buf_size in sync with the actual allocated buffer size.
+    auto const result = StringUtils::sanitize_json_buffer(
+            m_buf,
+            m_buf_size,
+            m_buf_occupied,
+            simdjson::SIMDJSON_PADDING
+    );
+    m_buf_occupied = result.new_buf_occupied;
+    m_sanitization_bytes_added += m_buf_occupied - old_buf_occupied;
+
+    if (result.sanitized_char_counts.empty()) {
+        return false;
+    }
+
+    size_t total_sanitized = 0;
+    std::string char_details;
+    for (auto const& [ch, count] : result.sanitized_char_counts) {
+        if (false == char_details.empty()) {
+            char_details += ", ";
+        }
+        char_details += fmt::format("0x{:02x} ({})", static_cast<unsigned char>(ch), count);
+        total_sanitized += count;
+    }
+    SPDLOG_WARN(
+            "Escaped {} control character(s) in JSON{}: [{}]. Buffer expanded by {} bytes "
+            "({} -> {}).",
+            total_sanitized,
+            m_path.empty() ? "" : fmt::format(" in file '{}'", m_path),
+            char_details,
+            static_cast<int64_t>(m_buf_occupied) - static_cast<int64_t>(old_buf_occupied),
+            old_buf_occupied,
+            m_buf_occupied
+    );
+    return true;
+}
+
+bool JsonFileIterator::reinitialize_document_stream() {
+    auto error = m_parser.iterate_many(m_buf, m_buf_occupied, m_buf_occupied).get(m_stream);
+    if (error) {
+        m_error_code = error;
+        return false;
+    }
+    m_doc_it = m_stream.begin();
+    m_first_doc_in_buffer = true;
+    m_next_document_position = 0;
+    return true;
+}
+
 bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& it) {
     if (false == m_first_read) {
         ++m_doc_it;
@@ -118,6 +229,22 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i
                 return true;
             } else if (m_doc_it.error() == simdjson::error_code::UTF8_ERROR) {
                 maybe_utf8_edge_case = true;
+            } else if (m_sanitize_invalid_json
+                       && m_doc_it.error() == simdjson::error_code::UNESCAPED_CHARS)
+            {
+                // Unescaped control characters detected during document iteration. Sanitize the
+                // buffer and re-setup the document stream to restart from the beginning.
+                if (false == sanitize_control_chars_and_log()) {
+                    // Sanitization made no changes - report the original error to avoid infinite
+                    // loop
+                    m_error_code = m_doc_it.error();
+                    return false;
+                }
+
+                if (false == reinitialize_document_stream()) {
+                    return false;
+                }
+                continue;
             } else {
                 m_error_code = m_doc_it.error();
                 return false;
@@ -137,8 +264,22 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i
             // If we hit a UTF-8 error and either we have reached eof or the buffer occupancy is
             // greater than the maximum document size we assume that the UTF-8 error must have been
             // in the middle of the stream. Note: it is possible that the UTF-8 error is at the end
-            // of the stream and that this is actualy a truncation error. Unfortunately the only way
-            // to check is to parse it ourselves, so we rely on this heuristic for now.
+            // of the stream and that this is actually a truncation error. Unfortunately the only
+            // way to check is to parse it ourselves, so we rely on this heuristic for now.
+            if (m_sanitize_invalid_json) {
+                // Sanitize invalid UTF-8 sequences and retry
+                if (false == sanitize_invalid_utf8_and_log()) {
+                    // Sanitization made no changes - report the original error to avoid infinite
+                    // loop
+                    m_error_code = simdjson::error_code::UTF8_ERROR;
+                    return false;
+                }
+
+                if (false == reinitialize_document_stream()) {
+                    return false;
+                }
+                continue;
+            }
             m_error_code = simdjson::error_code::UTF8_ERROR;
             return false;
         } else if (maybe_utf8_edge_case) {
@@ -151,10 +292,12 @@ bool JsonFileIterator::get_json(simdjson::ondemand::document_stream::iterator& i
 size_t JsonFileIterator::get_num_bytes_consumed() {
     // If there are more documents left in the current buffer account for how much of the
     // buffer has been consumed, otherwise report the total number of bytes read so that we
-    // capture trailing whitespace.
+    // capture trailing whitespace. Include bytes added by sanitization since the sanitized
+    // content is what gets compressed.
     if (m_doc_it != m_stream.end()) {
-        return m_bytes_read - (m_buf_occupied - m_next_document_position);
+        return m_bytes_read + m_sanitization_bytes_added
+               - (m_buf_occupied - m_next_document_position);
     }
-    return m_bytes_read;
+    return m_bytes_read + m_sanitization_bytes_added;
 }
 }  // namespace clp_s
diff --git a/components/core/src/clp_s/JsonFileIterator.hpp b/components/core/src/clp_s/JsonFileIterator.hpp
index 9102087637..2b8be94a4c 100644
--- a/components/core/src/clp_s/JsonFileIterator.hpp
+++ b/components/core/src/clp_s/JsonFileIterator.hpp
@@ -1,6 +1,8 @@
 #ifndef CLP_S_JSONFILEITERATOR_HPP
 #define CLP_S_JSONFILEITERATOR_HPP
 
+#include <string>
+
 #include <simdjson.h>
 
 #include "../clp/ReaderInterface.hpp"
@@ -11,7 +13,7 @@ class JsonFileIterator {
     /**
      * An iterator over an input stream containing json objects. JSON is parsed
      * using simdjson::parse_many. This allows simdjson to efficiently find
-     * delimeters between JSON objects, and if enabled parse JSON ahead of time
+     * delimiters between JSON objects, and if enabled parse JSON ahead of time
      * in another thread while the JSON is being iterated over.
      *
      * The buffer grows automatically if there are JSON objects larger than the buffer size.
@@ -19,11 +21,15 @@ class JsonFileIterator {
 
      * @param reader the input stream containing JSON
      * @param max_document_size the maximum allowed size of a single document
+     * @param sanitize_invalid_json whether to sanitize invalid JSON (control chars, invalid UTF-8)
+     * @param path optional path to the file being read (used for logging)
      * @param buf_size the initial buffer size
      */
     explicit JsonFileIterator(
             clp::ReaderInterface& reader,
             size_t max_document_size,
+            bool sanitize_invalid_json,
+            std::string path = {},
             size_t buf_size = 1024 * 1024 /* 1 MiB default */
     );
     ~JsonFileIterator();
@@ -41,9 +47,11 @@ class JsonFileIterator {
     [[nodiscard]] size_t truncated_bytes() const { return m_truncated_bytes; }
 
     /**
-     * @return total number of bytes read from the file
+     * @return total number of bytes read from the file, plus any bytes added by sanitization
      */
-    [[nodiscard]] size_t get_num_bytes_read() const { return m_bytes_read; }
+    [[nodiscard]] size_t get_num_bytes_read() const {
+        return m_bytes_read + m_sanitization_bytes_added;
+    }
 
     /**
      * Note: this method can not be const because checking if a simdjson iterator is at the end
@@ -73,14 +81,40 @@ class JsonFileIterator {
      */
     [[nodiscard]] size_t skip_whitespace_and_get_truncated_bytes();
 
+    /**
+     * Sanitizes invalid UTF-8 sequences in the buffer by replacing them with U+FFFD,
+     * updates buffer tracking variables, and logs a warning if changes were made.
+     * @return true if sanitization made changes, false otherwise
+     * @note May reallocate m_buf if buffer expansion is needed. m_buf_size is updated accordingly.
+     */
+    [[nodiscard]] bool sanitize_invalid_utf8_and_log();
+
+    /**
+     * Sanitizes unescaped control characters in the buffer by escaping them to \\u00XX format,
+     * updates buffer tracking variables, and logs a warning if changes were made.
+     * @return true if sanitization made changes, false otherwise
+     * @note May reallocate m_buf if buffer expansion is needed. m_buf_size is updated accordingly.
+     */
+    [[nodiscard]] bool sanitize_control_chars_and_log();
+
+    /**
+     * Reinitializes the document stream after buffer sanitization.
+     * Resets iteration state to start from the beginning of the buffer.
+     * @return true on success, false if iteration setup fails (m_error_code is set on failure)
+     */
+    [[nodiscard]] bool reinitialize_document_stream();
+
     size_t m_truncated_bytes{0};
     size_t m_next_document_position{0};
     size_t m_bytes_read{0};
+    size_t m_sanitization_bytes_added{0};
     size_t m_buf_size{0};
     size_t m_buf_occupied{0};
     size_t m_max_document_size{0};
     char* m_buf{nullptr};
     clp::ReaderInterface& m_reader;
+    std::string m_path;
+    bool m_sanitize_invalid_json{false};
     simdjson::ondemand::parser m_parser;
     simdjson::ondemand::document_stream m_stream;
     bool m_eof{false};
diff --git a/components/core/src/clp_s/JsonParser.cpp b/components/core/src/clp_s/JsonParser.cpp
index c0e2123e7f..93687beab1 100644
--- a/components/core/src/clp_s/JsonParser.cpp
+++ b/components/core/src/clp_s/JsonParser.cpp
@@ -144,6 +144,7 @@ JsonParser::JsonParser(JsonParserOption const& option)
           m_structurize_arrays(option.structurize_arrays),
           m_record_log_order(option.record_log_order),
           m_retain_float_format(option.retain_float_format),
+          m_sanitize_invalid_json(option.sanitize_invalid_json),
           m_input_paths(option.input_paths),
           m_network_auth(option.network_auth) {
     if (false == m_timestamp_key.empty()) {
@@ -223,7 +224,7 @@ void JsonParser::parse_obj_in_array(simdjson::ondemand::object line, int32_t par
         }
 
         cur_field = *object_it_stack.top();
-        cur_key = cur_field.unescaped_key(true);
+        cur_key = cur_field.unescaped_key(m_sanitize_invalid_json);
         cur_value = cur_field.value();
 
         switch (cur_value.type()) {
@@ -302,7 +303,7 @@ void JsonParser::parse_obj_in_array(simdjson::ondemand::object line, int32_t par
                 break;
             }
             case simdjson::ondemand::json_type::string: {
-                std::string_view value = cur_value.get_string(true);
+                std::string_view value = cur_value.get_string(m_sanitize_invalid_json);
                 if (value.find(' ') != std::string::npos) {
                     node_id = m_archive_writer
                                       ->add_node(node_id_stack.top(), NodeType::ClpString, cur_key);
@@ -407,7 +408,7 @@ void JsonParser::parse_array(simdjson::ondemand::array array, int32_t parent_nod
                 break;
             }
             case simdjson::ondemand::json_type::string: {
-                std::string_view value = cur_value.get_string(true);
+                std::string_view value = cur_value.get_string(m_sanitize_invalid_json);
                 if (value.find(' ') != std::string::npos) {
                     node_id = m_archive_writer->add_node(parent_node_id, NodeType::ClpString, "");
                 } else {
@@ -452,7 +453,7 @@ void JsonParser::parse_line(
     do {
         if (false == object_stack.empty()) {
             cur_field = *object_it_stack.top();
-            cur_key = cur_field.unescaped_key(true);
+            cur_key = cur_field.unescaped_key(m_sanitize_invalid_json);
             line = cur_field.value();
         }
 
@@ -555,7 +556,7 @@ void JsonParser::parse_line(
                 break;
             }
             case simdjson::ondemand::json_type::string: {
-                std::string_view value = line.get_string(true);
+                std::string_view value = line.get_string(m_sanitize_invalid_json);
                 auto const matches_timestamp
                         = m_archive_writer->matches_timestamp(node_id_stack.top(), cur_key);
                 if (matches_timestamp) {
@@ -668,7 +669,8 @@ auto JsonParser::ingest_json(
         Path const& path,
         std::string const& archive_creator_id
 ) -> bool {
-    JsonFileIterator json_file_iterator(*reader, m_max_document_size);
+    JsonFileIterator
+            json_file_iterator(*reader, m_max_document_size, m_sanitize_invalid_json, path.path);
     if (simdjson::error_code::SUCCESS != json_file_iterator.get_error()) {
         SPDLOG_ERROR(
                 "Encountered error - {} - while trying to parse {} after parsing 0 bytes",
diff --git a/components/core/src/clp_s/JsonParser.hpp b/components/core/src/clp_s/JsonParser.hpp
index 82f8d04786..e763d5fe71 100644
--- a/components/core/src/clp_s/JsonParser.hpp
+++ b/components/core/src/clp_s/JsonParser.hpp
@@ -40,6 +40,7 @@ struct JsonParserOption {
     bool record_log_order{true};
     bool retain_float_format{false};
     bool single_file_archive{false};
+    bool sanitize_invalid_json{false};
     NetworkAuthOption network_auth{};
 };
 
@@ -239,6 +240,7 @@ class JsonParser {
     bool m_structurize_arrays{false};
     bool m_record_log_order{true};
     bool m_retain_float_format{false};
+    bool m_sanitize_invalid_json{false};
 
     absl::flat_hash_map<std::pair<uint32_t, NodeType>, std::pair<int32_t, bool>>
             m_ir_node_to_archive_node_id_mapping;
diff --git a/components/core/src/clp_s/Utils.cpp b/components/core/src/clp_s/Utils.cpp
index 59b18a5bca..321032788f 100644
--- a/components/core/src/clp_s/Utils.cpp
+++ b/components/core/src/clp_s/Utils.cpp
@@ -2,6 +2,7 @@
 
 #include <charconv>
 #include <cstdint>
+#include <cstring>
 #include <exception>
 #include <filesystem>
 #include <set>
@@ -163,6 +164,241 @@ bool UriUtils::get_last_uri_component(std::string_view const uri, std::string& n
     return true;
 }
 
+BufferSanitizeResult StringUtils::sanitize_json_buffer(
+        char*& buf,
+        size_t& buf_size,
+        size_t buf_occupied,
+        size_t simdjson_padding
+) {
+    // Early return for empty or null buffer
+    if (nullptr == buf || 0 == buf_occupied) {
+        return {buf_occupied, {}};
+    }
+
+    // Sanitize by escaping control characters inside JSON strings. Uses a lazy-copy approach:
+    // only allocate the output string when the first control character is found, and copy
+    // valid ranges in bulk rather than byte-by-byte for efficiency.
+    //
+    // Note: If the buffer contains unmatched quotes (e.g., truncated JSON), the string state
+    // tracking may be incorrect, potentially escaping control characters outside of actual
+    // JSON strings. This is acceptable since such malformed JSON will fail parsing anyway.
+    std::string sanitized;
+    std::map<char, size_t> sanitized_char_counts;
+    bool in_string = false;
+    bool escape_next = false;
+    size_t copy_start = 0;
+
+    for (size_t i = 0; i < buf_occupied; ++i) {
+        char const c = buf[i];
+
+        if (escape_next) {
+            escape_next = false;
+            continue;
+        }
+
+        if ('\\' == c && in_string) {
+            escape_next = true;
+            continue;
+        }
+
+        if ('"' == c) {
+            in_string = !in_string;
+            continue;
+        }
+
+        // Escape control characters (0x00-0x1F) inside strings to \u00XX format
+        if (in_string && static_cast<unsigned char>(c) < 0x20) {
+            if (sanitized.empty()) {
+                // First control char found - allocate with extra space for escapes
+                sanitized.reserve(buf_occupied + 64);
+            }
+            // Copy the valid range before this control character
+            sanitized.append(buf + copy_start, i - copy_start);
+            char_to_escaped_four_char_hex(sanitized, c);
+            ++sanitized_char_counts[c];
+            copy_start = i + 1;
+        }
+    }
+
+    // If no sanitization was needed, return early without any allocation
+    if (sanitized.empty()) {
+        return {buf_occupied, {}};
+    }
+
+    // Copy any remaining content after the last control character
+    sanitized.append(buf + copy_start, buf_occupied - copy_start);
+
+    // Grow buffer if needed to hold sanitized content
+    if (sanitized.size() > buf_size) {
+        size_t const new_buf_size = sanitized.size();
+        char* new_buf = new char[new_buf_size + simdjson_padding];
+        delete[] buf;
+        buf = new_buf;
+        buf_size = new_buf_size;
+    }
+
+    // Copy sanitized content to buffer
+    std::memcpy(buf, sanitized.data(), sanitized.size());
+    return {sanitized.size(), std::move(sanitized_char_counts)};
+}
+
+namespace {
+/**
+ * Checks if a byte is a valid UTF-8 continuation byte (10xxxxxx pattern).
+ * @param byte The byte to check
+ * @return true if the byte is a valid continuation byte (0x80-0xBF)
+ */
+constexpr bool is_continuation_byte(unsigned char byte) {
+    return (byte & 0xC0) == 0x80;
+}
+
+/**
+ * Validates a UTF-8 sequence starting at the given position and returns the sequence length.
+ * @param buf The buffer containing the UTF-8 data
+ * @param pos Current position in the buffer
+ * @param buf_occupied Total bytes in the buffer
+ * @return The length of the valid UTF-8 sequence (1-4), or 0 if invalid
+ */
+size_t validate_utf8_sequence(char const* buf, size_t pos, size_t buf_occupied) {
+    auto const byte = static_cast<unsigned char>(buf[pos]);
+    size_t remaining = buf_occupied - pos;
+
+    // ASCII (0x00-0x7F)
+    if (byte <= 0x7F) {
+        return 1;
+    }
+
+    // Invalid: continuation byte without leader, or invalid lead bytes
+    if (byte < 0xC2 || byte > 0xF4) {
+        return 0;
+    }
+
+    // 2-byte sequence (0xC2-0xDF)
+    if (byte <= 0xDF) {
+        if (remaining < 2 || !is_continuation_byte(static_cast<unsigned char>(buf[pos + 1]))) {
+            return 0;
+        }
+        return 2;
+    }
+
+    // 3-byte sequence (0xE0-0xEF)
+    if (byte <= 0xEF) {
+        if (remaining < 3) {
+            return 0;
+        }
+        auto const byte2 = static_cast<unsigned char>(buf[pos + 1]);
+        auto const byte3 = static_cast<unsigned char>(buf[pos + 2]);
+
+        if (!is_continuation_byte(byte2) || !is_continuation_byte(byte3)) {
+            return 0;
+        }
+
+        // Check for overlong encoding (E0 requires second byte >= 0xA0)
+        if (byte == 0xE0 && byte2 < 0xA0) {
+            return 0;
+        }
+
+        // Check for surrogate code points (ED with second byte >= 0xA0 means 0xD800-0xDFFF)
+        if (byte == 0xED && byte2 >= 0xA0) {
+            return 0;
+        }
+
+        return 3;
+    }
+
+    // 4-byte sequence (0xF0-0xF4)
+    if (remaining < 4) {
+        return 0;
+    }
+    auto const byte2 = static_cast<unsigned char>(buf[pos + 1]);
+    auto const byte3 = static_cast<unsigned char>(buf[pos + 2]);
+    auto const byte4 = static_cast<unsigned char>(buf[pos + 3]);
+
+    if (!is_continuation_byte(byte2) || !is_continuation_byte(byte3)
+        || !is_continuation_byte(byte4))
+    {
+        return 0;
+    }
+
+    // Check for overlong encoding (F0 requires second byte >= 0x90)
+    if (byte == 0xF0 && byte2 < 0x90) {
+        return 0;
+    }
+
+    // Check for code points > U+10FFFF (F4 requires second byte <= 0x8F)
+    if (byte == 0xF4 && byte2 > 0x8F) {
+        return 0;
+    }
+
+    return 4;
+}
+}  // namespace
+
+BufferSanitizeResult StringUtils::sanitize_utf8_buffer(
+        char*& buf,
+        size_t& buf_size,
+        size_t buf_occupied,
+        size_t simdjson_padding
+) {
+    // Early return for empty or null buffer
+    if (nullptr == buf || 0 == buf_occupied) {
+        return {buf_occupied, {}};
+    }
+
+    // Sanitize by replacing invalid UTF-8 sequences with U+FFFD. Uses a lazy-copy approach:
+    // only allocate the output string when the first invalid sequence is found, and copy
+    // valid ranges in bulk rather than byte-by-byte for efficiency.
+    constexpr std::string_view cUtf8ReplacementChar{"\xEF\xBF\xBD", 3};
+    constexpr char cInvalidUtf8Key = static_cast<char>(0xFF);
+
+    std::string sanitized;
+    std::map<char, size_t> sanitized_char_counts;
+    size_t copy_start = 0;
+
+    size_t i = 0;
+    while (i < buf_occupied) {
+        size_t const seq_len = validate_utf8_sequence(buf, i, buf_occupied);
+        if (seq_len > 0) {
+            // Valid sequence - advance position (will be copied in bulk later)
+            i += seq_len;
+        } else {
+            // Invalid sequence - need to sanitize
+            if (sanitized.empty()) {
+                // First invalid sequence found - allocate with extra space for replacements
+                sanitized.reserve(buf_occupied + 64);
+            }
+            // Copy the valid range before this invalid byte
+            sanitized.append(buf + copy_start, i - copy_start);
+            sanitized.append(cUtf8ReplacementChar);
+            ++sanitized_char_counts[cInvalidUtf8Key];
+            // Skip one byte and continue (maximal subpart replacement strategy)
+            ++i;
+            copy_start = i;
+        }
+    }
+
+    // If no sanitization was needed, return early without any allocation
+    if (sanitized.empty()) {
+        return {buf_occupied, {}};
+    }
+
+    // Copy any remaining valid content after the last invalid byte
+    sanitized.append(buf + copy_start, buf_occupied - copy_start);
+
+    // Grow buffer if needed to hold sanitized content
+    if (sanitized.size() > buf_size) {
+        size_t const new_buf_size = sanitized.size();
+        char* new_buf = new char[new_buf_size + simdjson_padding];
+        delete[] buf;
+        buf = new_buf;
+        buf_size = new_buf_size;
+    }
+
+    // Copy sanitized content to buffer
+    std::memcpy(buf, sanitized.data(), sanitized.size());
+    return {sanitized.size(), std::move(sanitized_char_counts)};
+}
+
 void StringUtils::escape_json_string(std::string& destination, std::string_view const source) {
     // Escaping is implemented using this `append_unescaped_slice` approach to offer a fast path
     // when strings are mostly or entirely valid escaped JSON. Benchmarking shows that this offers
diff --git a/components/core/src/clp_s/Utils.hpp b/components/core/src/clp_s/Utils.hpp
index a711eb8ad6..95033e18c3 100644
--- a/components/core/src/clp_s/Utils.hpp
+++ b/components/core/src/clp_s/Utils.hpp
@@ -5,6 +5,7 @@
 #include <charconv>
 #include <cstdint>
 #include <cstring>
+#include <map>
 #include <sstream>
 #include <string>
 #include <string_view>
@@ -63,6 +64,18 @@ class UriUtils {
     static bool get_last_uri_component(std::string_view const uri, std::string& name);
 };
 
+/**
+ * Result of sanitizing a buffer, including statistics about what was sanitized.
+ */
+struct BufferSanitizeResult {
+    /// New number of bytes occupied in the buffer after sanitization.
+    size_t new_buf_occupied;
+    /// Map of sanitized characters to their occurrence counts.
+    /// For JSON control char sanitization: key is the actual control character (0x00-0x1F).
+    /// For UTF-8 sanitization: key is 0xFF (sentinel value for all invalid sequences).
+    std::map<char, size_t> sanitized_char_counts;
+};
+
 class StringUtils {
 public:
     /**
@@ -81,6 +94,57 @@ class StringUtils {
      */
     static void escape_json_string(std::string& destination, std::string_view const source);
 
+    /**
+     * Sanitizes a JSON buffer by escaping unescaped control characters (0x00-0x1F) inside JSON
+     * strings. This is used to fix malformed JSON that contains raw control characters which
+     * would cause parsing errors.
+     *
+     * Only control characters inside JSON string values (between unescaped double quotes) are
+     * escaped. Control characters outside strings are left unchanged as they would cause
+     * structural JSON errors anyway.
+     *
+     * Characters are escaped using unicode escape sequences (e.g., \x00 becomes \u0000).
+     *
+     * @param buf Reference to pointer to the buffer. May be reallocated if expansion is needed.
+     *             The caller's pointer will be updated to point to the new buffer if reallocation
+     *             occurs. The caller is responsible for deleting the buffer.
+     * @param buf_size Current allocated size of the buffer (excluding simdjson padding).
+     *                 This parameter is modified by reference and will be updated if the buffer
+     *                 is reallocated to reflect the new buffer size.
+     * @param buf_occupied Number of bytes currently used in the buffer.
+     * @param simdjson_padding Amount of padding required after buffer content.
+     * @return Result containing new buf_occupied and counts of each sanitized character.
+     */
+    static BufferSanitizeResult sanitize_json_buffer(
+            char*& buf,
+            size_t& buf_size,
+            size_t buf_occupied,
+            size_t simdjson_padding
+    );
+
+    /**
+     * Sanitizes a buffer by replacing invalid UTF-8 sequences with the Unicode replacement
+     * character (U+FFFD). This is used to fix malformed data that contains invalid UTF-8
+     * which would cause JSON parsing errors.
+     *
+     * @param buf Reference to pointer to the buffer. May be reallocated if expansion is needed.
+     *            The caller's pointer will be updated to point to the new buffer if reallocation
+     *            occurs. The caller is responsible for deleting the buffer.
+     * @param buf_size Current allocated size of the buffer (excluding simdjson padding).
+     *                 This parameter is modified by reference and will be updated if the buffer
+     *                 is reallocated to reflect the new buffer size.
+     * @param buf_occupied Number of bytes currently used in the buffer.
+     * @param simdjson_padding Amount of padding required after buffer content.
+     * @return Result containing new buf_occupied and counts of each type of invalid sequence
+     *         replaced.
+     */
+    static BufferSanitizeResult sanitize_utf8_buffer(
+            char*& buf,
+            size_t& buf_size,
+            size_t buf_occupied,
+            size_t simdjson_padding
+    );
+
 private:
     /**
      * Converts a character into its two byte hexadecimal representation.
@@ -88,7 +152,6 @@ class StringUtils {
      * @return the two byte hexadecimal representation of c as an array of two characters.
      */
     static std::array<char, 2> char_to_hex(char c) {
-        std::array<char, 2> ret;
         auto nibble_to_hex = [](char nibble) -> char {
             if ('\x00' <= nibble && nibble <= '\x09') {
                 return '0' + (nibble - '\x00');
@@ -97,7 +160,7 @@ class StringUtils {
             }
         };
 
-        return std::array<char, 2>{nibble_to_hex(0x0F & (c >> 4)), nibble_to_hex(0x0f & c)};
+        return {nibble_to_hex(0x0F & (c >> 4)), nibble_to_hex(0x0f & c)};
     }
 
     /**
diff --git a/components/core/src/clp_s/clp-s.cpp b/components/core/src/clp_s/clp-s.cpp
index c2cf7b6abd..fd24634680 100644
--- a/components/core/src/clp_s/clp-s.cpp
+++ b/components/core/src/clp_s/clp-s.cpp
@@ -107,6 +107,7 @@ bool compress(CommandLineArguments const& command_line_arguments) {
     option.single_file_archive = command_line_arguments.get_single_file_archive();
     option.structurize_arrays = command_line_arguments.get_structurize_arrays();
     option.record_log_order = command_line_arguments.get_record_log_order();
+    option.sanitize_invalid_json = command_line_arguments.get_sanitize_invalid_json();
 
     clp_s::JsonParser parser(option);
     if (false == parser.ingest()) {
diff --git a/components/core/tests/test-clp_s-StringUtils.cpp b/components/core/tests/test-clp_s-StringUtils.cpp
new file mode 100644
index 0000000000..e3ef434ecb
--- /dev/null
+++ b/components/core/tests/test-clp_s-StringUtils.cpp
@@ -0,0 +1,539 @@
+#include <cstring>
+#include <memory>
+#include <string>
+#include <string_view>
+
+#include <catch2/catch_test_macros.hpp>
+#include <simdjson.h>
+
+#include "../src/clp_s/Utils.hpp"
+
+using clp_s::BufferSanitizeResult;
+using clp_s::StringUtils;
+
+// We use C++14 string literals (the `s` suffix) to construct strings containing embedded null bytes
+// and control characters. Without the `s` suffix, std::string's constructor from a C-string literal
+// stops at the first null byte, truncating the test input. Additionally, we use string
+// concatenation (e.g., "\x00" "end") to prevent hex escape sequences from being extended by
+// following hex digits.
+using namespace std::string_literals;
+
+namespace {
+/**
+ * Result structure containing both the sanitization result and the output string.
+ */
+struct SanitizeResultWithOutput {
+    BufferSanitizeResult result;
+    std::string output;
+};
+
+/**
+ * Helper to create a buffer with simdjson padding and run JSON sanitization.
+ * Returns both the result (with character counts) and the sanitized output string.
+ * @param input The input string to sanitize
+ * @return SanitizeResultWithOutput containing result metadata and sanitized string
+ * @note sanitize_json_buffer may reallocate the buffer (passed by reference). This helper
+ *       handles both reallocation and no-reallocation cases correctly.
+ */
+auto sanitize_json_with_result(std::string_view input) -> SanitizeResultWithOutput {
+    size_t buf_size = input.size();
+    size_t buf_occupied = input.size();
+    // Use raw pointer since sanitize_json_buffer may delete and reallocate
+    char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
+    std::memcpy(buf, input.data(), input.size());
+
+    auto result = StringUtils::sanitize_json_buffer(
+            buf,
+            buf_size,
+            buf_occupied,
+            simdjson::SIMDJSON_PADDING
+    );
+    std::string output(buf, result.new_buf_occupied);
+    // buf now points to the final buffer (may have been reallocated)
+    delete[] buf;
+    return {std::move(result), std::move(output)};
+}
+
+/**
+ * Helper to create a buffer with simdjson padding and run JSON sanitization.
+ * @param input The input string to sanitize
+ * @return The sanitized string
+ */
+auto sanitize_string(std::string_view input) -> std::string {
+    return sanitize_json_with_result(input).output;
+}
+
+/**
+ * Helper to verify that a sanitized JSON string can be parsed by simdjson.
+ * @param json The JSON string to parse
+ * @return true if parsing succeeds, false otherwise
+ */
+auto can_parse_json(std::string_view json) -> bool {
+    simdjson::ondemand::parser parser;
+    simdjson::padded_string padded(json);
+    auto result = parser.iterate(padded);
+    return false == result.error();
+}
+}  // namespace
+
+TEST_CASE("sanitize_json_buffer_no_changes", "[clp_s][StringUtils]") {
+    // Valid JSON without control characters should pass through unchanged
+    SECTION("simple object") {
+        std::string input = R"({"key": "value"})";
+        REQUIRE(sanitize_string(input) == input);
+    }
+
+    SECTION("object with escaped characters") {
+        std::string input = R"({"key": "line1\nline2\ttab"})";
+        REQUIRE(sanitize_string(input) == input);
+    }
+
+    SECTION("object with unicode escapes") {
+        std::string input = R"({"key": "null char: \u0000"})";
+        REQUIRE(sanitize_string(input) == input);
+    }
+
+    SECTION("multiple objects") {
+        std::string input = R"({"a": "1"}{"b": "2"})";
+        REQUIRE(sanitize_string(input) == input);
+    }
+
+    SECTION("control chars outside strings are unchanged") {
+        // Control chars outside strings aren't valid JSON anyway,
+        // but we don't touch them - let the JSON parser report the error
+        std::string input = "{\x01\"key\": \"value\"}";
+        REQUIRE(sanitize_string(input) == input);
+    }
+}
+
+TEST_CASE("sanitize_json_buffer_escapes_control_chars", "[clp_s][StringUtils]") {
+    SECTION("null byte in string value") {
+        auto input = "{\"key\": \"val\x00ue\"}"s;
+        std::string expected = R"({"key": "val\u0000ue"})";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+
+    SECTION("multiple null bytes") {
+        auto input = "{\"key\": \"\x00\x00\x00\"}"s;
+        std::string expected = R"({"key": "\u0000\u0000\u0000"})";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+
+    SECTION("various control characters") {
+        // Test 0x01 (SOH), 0x02 (STX), 0x1F (US)
+        auto input = "{\"key\": \"a\x01"
+                     "b\x02"
+                     "c\x1F"
+                     "d\"}"s;
+        std::string expected = R"({"key": "a\u0001b\u0002c\u001fd"})";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+
+    SECTION("control char in key") {
+        auto input = "{\"ke\x00y\": \"value\"}"s;
+        std::string expected = R"({"ke\u0000y": "value"})";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+
+    SECTION("mixed valid escapes and control chars") {
+        auto input = "{\"key\": \"line1\\nhas\x00null\"}"s;
+        std::string expected = R"({"key": "line1\nhas\u0000null"})";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+}
+
+TEST_CASE("sanitize_json_buffer_handles_escapes_correctly", "[clp_s][StringUtils]") {
+    SECTION("escaped quote should not toggle string state") {
+        // The \" should not end the string
+        std::string input = R"({"key": "quote:\"value"})";
+        REQUIRE(sanitize_string(input) == input);
+    }
+
+    SECTION("escaped backslash before quote") {
+        // \\" means escaped backslash followed by end of string
+        std::string input = R"({"key": "backslash:\\"})";
+        REQUIRE(sanitize_string(input) == input);
+    }
+
+    SECTION("control char after escaped backslash") {
+        // \\\x00 means escaped backslash followed by a control char still inside the string
+        auto input = "{\"key\": \"slash:\\\\\x00"
+                     "end\"}"s;
+        std::string expected = R"({"key": "slash:\\\u0000end"})";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+
+    SECTION("control char after escaped quote") {
+        auto input = "{\"key\": \"quote:\\\"\x00"
+                     "after\"}"s;
+        std::string expected = R"({"key": "quote:\"\u0000after"})";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+
+    SECTION("control char as invalid escape sequence target") {
+        // \<control-char> is not a valid JSON escape sequence, but we leave it as-is
+        // since the sanitizer treats any char after backslash as an escape target.
+        // This JSON is invalid anyway (bad escape sequence) and will fail parsing.
+        auto input = "{\"key\": \"bad:\\\x00"
+                     "end\"}"s;
+        auto expected = "{\"key\": \"bad:\\\x00"
+                        "end\"}"s;
+        REQUIRE(sanitize_string(input) == expected);
+    }
+
+    SECTION("triple backslash followed by control char") {
+        // \\\ followed by \x00: first \\ is escaped backslash, third \ starts escape sequence.
+        // The third backslash makes the null appear to be an escape target, so it's not escaped.
+        auto input = "{\"key\": \"\\\\\\\x00"
+                     "end\"}"s;
+        auto expected = "{\"key\": \"\\\\\\\x00"
+                        "end\"}"s;
+        REQUIRE(sanitize_string(input) == expected);
+    }
+}
+
+TEST_CASE("sanitize_json_buffer_result_is_valid_json", "[clp_s][StringUtils]") {
+    SECTION("sanitized null byte produces valid JSON") {
+        auto input = "{\"key\": \"val\x00ue\"}"s;
+        std::string sanitized = sanitize_string(input);
+        REQUIRE(can_parse_json(sanitized));
+    }
+
+    SECTION("sanitized multiple control chars produces valid JSON") {
+        auto input = "{\"data\": \"\x01\x02\x03\x04\x05\"}"s;
+        std::string sanitized = sanitize_string(input);
+        REQUIRE(can_parse_json(sanitized));
+    }
+}
+
+TEST_CASE("sanitize_json_buffer_handles_buffer_growth", "[clp_s][StringUtils]") {
+    // Each control char expands from 1 byte to 6 bytes (\u00XX)
+    // Create input that will require buffer growth
+    SECTION("many control chars requiring expansion") {
+        // 100 null bytes -> 600 bytes after escaping
+        std::string value(100, '\x00');
+        std::string input = "{\"key\": \"" + value + "\"}";
+
+        std::string sanitized = sanitize_string(input);
+
+        // Verify all nulls were escaped
+        REQUIRE(sanitized.find('\x00') == std::string::npos);
+        // Verify correct number of escape sequences
+        size_t count = 0;
+        size_t pos = 0;
+        while ((pos = sanitized.find("\\u0000", pos)) != std::string::npos) {
+            ++count;
+            pos += 6;
+        }
+        REQUIRE(count == 100);
+        // Verify result is valid JSON
+        REQUIRE(can_parse_json(sanitized));
+    }
+}
+
+TEST_CASE("sanitize_json_buffer_jsonl", "[clp_s][StringUtils]") {
+    SECTION("multiple JSON objects with control chars") {
+        auto input = "{\"a\": \"x\x00y\"}\n{\"b\": \"p\x01q\"}"s;
+        std::string expected = "{\"a\": \"x\\u0000y\"}\n{\"b\": \"p\\u0001q\"}";
+        REQUIRE(sanitize_string(input) == expected);
+    }
+}
+
+TEST_CASE("sanitize_json_buffer_truncated_json", "[clp_s][StringUtils]") {
+    SECTION("truncated JSON with unmatched quote - control chars may be escaped incorrectly") {
+        // This tests the edge case documented in the code: if JSON is truncated with unmatched
+        // quotes, string state tracking may be incorrect, potentially escaping control chars
+        // outside of actual JSON strings. This is acceptable since malformed JSON will fail
+        // parsing anyway.
+        //
+        // Input: truncated JSON where the string is not closed, followed by control chars
+        // The sanitizer may incorrectly think it's still in a string and escape the control chars
+        auto input = "{\"key\": \"unclosed string\x00\x01"
+                     "after\"}"s;
+        // The sanitizer will escape the control chars because it thinks we're still in a string
+        // (the quote after "unclosed string" is escaped, so the string continues)
+        std::string sanitized = sanitize_string(input);
+        // The control chars should be escaped (behavior may vary based on quote matching)
+        // The important thing is that the function doesn't crash and handles it gracefully
+        REQUIRE(
+                (sanitized.find('\x00') == std::string::npos
+                 || sanitized.find("\\u0000") != std::string::npos)
+        );
+    }
+
+    SECTION("truncated JSON ending mid-string") {
+        // JSON truncated in the middle of a string with control chars (no closing quote)
+        // This simulates a real truncation scenario where the buffer ends mid-string
+        auto input = "{\"key\": \"value\x00\x01"
+                     "truncated"s;
+        std::string sanitized = sanitize_string(input);
+        // Should handle gracefully without crashing
+        // Control chars inside the (unclosed) string should be escaped
+        REQUIRE(sanitized.size() >= input.size());
+        // Verify no raw control characters remain (they should be escaped)
+        REQUIRE(sanitized.find('\x00') == std::string::npos);
+        REQUIRE(sanitized.find('\x01') == std::string::npos);
+    }
+
+    SECTION("truncated JSON with control chars after unmatched quote") {
+        // JSON with unmatched quote followed by control chars outside the string
+        // The sanitizer may incorrectly escape these if it thinks we're still in a string
+        auto input = "{\"key\": \"value\x00"
+                     "}\x01\x02"s;
+        std::string sanitized = sanitize_string(input);
+        // Function should not crash - behavior may vary but should be consistent
+        REQUIRE(sanitized.size() >= input.size());
+    }
+}
+
+// =====================================================================================
+// UTF-8 Sanitization Tests
+// =====================================================================================
+
+namespace {
+/**
+ * Helper to create a buffer with simdjson padding and run UTF-8 sanitization.
+ * Returns both the result (with character counts) and the sanitized output string.
+ * @param input The input string to sanitize
+ * @return SanitizeResultWithOutput containing result metadata and sanitized string
+ * @note sanitize_utf8_buffer may reallocate the buffer (passed by reference). This helper
+ *       handles both reallocation and no-reallocation cases correctly.
+ */
+auto sanitize_utf8_with_result(std::string_view input) -> SanitizeResultWithOutput {
+    size_t buf_size = input.size();
+    size_t buf_occupied = input.size();
+    // Use raw pointer since sanitize_utf8_buffer may delete and reallocate
+    char* buf = new char[buf_size + simdjson::SIMDJSON_PADDING];
+    std::memcpy(buf, input.data(), input.size());
+
+    auto result = StringUtils::sanitize_utf8_buffer(
+            buf,
+            buf_size,
+            buf_occupied,
+            simdjson::SIMDJSON_PADDING
+    );
+    std::string output(buf, result.new_buf_occupied);
+    // buf now points to the final buffer (may have been reallocated)
+    delete[] buf;
+    return {std::move(result), std::move(output)};
+}
+
+/**
+ * Helper to create a buffer with simdjson padding and run UTF-8 sanitization.
+ * @param input The input string to sanitize
+ * @return The sanitized string
+ */
+auto sanitize_utf8_string(std::string_view input) -> std::string {
+    return sanitize_utf8_with_result(input).output;
+}
+
+/**
+ * Count occurrences of U+FFFD replacement character (0xEF 0xBF 0xBD) in a string.
+ */
+auto count_replacement_chars(std::string_view s) -> size_t {
+    size_t count = 0;
+    size_t i = 0;
+    while (i + 2 < s.size()) {
+        if (static_cast<unsigned char>(s[i]) == 0xEF && static_cast<unsigned char>(s[i + 1]) == 0xBF
+            && static_cast<unsigned char>(s[i + 2]) == 0xBD)
+        {
+            ++count;
+            i += 3;  // Skip past the replacement char (3 bytes for U+FFFD)
+        } else {
+            ++i;
+        }
+    }
+    return count;
+}
+}  // namespace
+
+TEST_CASE("sanitize_utf8_buffer_no_changes", "[clp_s][StringUtils]") {
+    SECTION("valid ASCII") {
+        std::string input = "Hello, World!";
+        REQUIRE(sanitize_utf8_string(input) == input);
+    }
+
+    SECTION("valid UTF-8 multibyte characters") {
+        // 2-byte: é (U+00E9) = 0xC3 0xA9
+        // 3-byte: € (U+20AC) = 0xE2 0x82 0xAC
+        // 4-byte: 𝄞 (U+1D11E) = 0xF0 0x9D 0x84 0x9E
+        std::string input = "café € 𝄞";
+        REQUIRE(sanitize_utf8_string(input) == input);
+    }
+
+    SECTION("valid JSON with UTF-8") {
+        std::string input = R"({"msg": "日本語テスト"})";
+        REQUIRE(sanitize_utf8_string(input) == input);
+    }
+
+    SECTION("empty string") {
+        std::string input = "";
+        REQUIRE(sanitize_utf8_string(input) == input);
+    }
+}
+
+TEST_CASE("sanitize_utf8_buffer_replaces_invalid_bytes", "[clp_s][StringUtils]") {
+    // U+FFFD is encoded as 0xEF 0xBF 0xBD in UTF-8
+    constexpr char cReplacementChar[] = "\xEF\xBF\xBD";
+
+    SECTION("single invalid byte 0xFF") {
+        auto input = "hello\xFF world"s;
+        std::string expected = "hello" + std::string(cReplacementChar) + " world";
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("single invalid byte 0xFE") {
+        auto input = "test\xFE"s;
+        std::string expected = "test" + std::string(cReplacementChar);
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("multiple invalid bytes") {
+        auto input = "\xFF\xFE\xFF"s;
+        std::string expected = std::string(cReplacementChar) + cReplacementChar + cReplacementChar;
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("continuation byte without leader (0x80-0xBF)") {
+        auto input = "test\x80 value"s;
+        std::string expected = "test" + std::string(cReplacementChar) + " value";
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("truncated 2-byte sequence") {
+        // 0xC3 expects one continuation byte, but we end the string
+        auto input = "test\xC3"s;
+        std::string expected = "test" + std::string(cReplacementChar);
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("truncated 3-byte sequence") {
+        // 0xE2 expects two continuation bytes
+        auto input = "test\xE2\x82"s;
+        std::string expected
+                = "test" + std::string(cReplacementChar) + std::string(cReplacementChar);
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("truncated 4-byte sequence") {
+        // 0xF0 expects three continuation bytes
+        auto input = "test\xF0\x9D\x84"s;
+        std::string expected = "test" + std::string(cReplacementChar)
+                               + std::string(cReplacementChar) + std::string(cReplacementChar);
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("overlong 2-byte encoding") {
+        // 0xC0 0x80 is overlong encoding of NUL (should be just 0x00)
+        // 0xC0 and 0xC1 are never valid UTF-8 lead bytes
+        auto input = "test\xC0\x80"s;
+        // Both bytes are invalid
+        std::string expected
+                = "test" + std::string(cReplacementChar) + std::string(cReplacementChar);
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("surrogate code points (U+D800-U+DFFF)") {
+        // 0xED 0xA0 0x80 encodes U+D800 (invalid surrogate)
+        auto input = "test\xED\xA0\x80 end"s;
+        // The sequence is invalid, bytes replaced individually
+        std::string expected = "test" + std::string(cReplacementChar)
+                               + std::string(cReplacementChar) + std::string(cReplacementChar)
+                               + " end";
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("code point above U+10FFFF") {
+        // 0xF4 0x90 0x80 0x80 would encode U+110000 (above max)
+        auto input = "test\xF4\x90\x80\x80 end"s;
+        std::string sanitized = sanitize_utf8_string(input);
+        // Should contain replacement characters
+        REQUIRE(count_replacement_chars(sanitized) >= 1);
+    }
+}
+
+TEST_CASE("sanitize_utf8_buffer_mixed_valid_invalid", "[clp_s][StringUtils]") {
+    constexpr char cReplacementChar[] = "\xEF\xBF\xBD";
+
+    SECTION("invalid byte between valid UTF-8") {
+        // café with invalid byte in middle
+        auto input = "caf\xC3\xA9\xFF test"s;  // café + 0xFF + " test"
+        std::string expected = "caf\xC3\xA9" + std::string(cReplacementChar) + " test";
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+
+    SECTION("JSON with invalid UTF-8 in value") {
+        auto input = "{\"msg\": \"test\xFF value\"}"s;
+        std::string expected = "{\"msg\": \"test" + std::string(cReplacementChar) + " value\"}";
+        REQUIRE(sanitize_utf8_string(input) == expected);
+    }
+}
+
+TEST_CASE("sanitize_utf8_buffer_returns_correct_counts", "[clp_s][StringUtils]") {
+    SECTION("counts invalid sequences") {
+        auto input = "\xFF\xFE\xFF"s;  // 3 invalid bytes
+
+        auto [result, output] = sanitize_utf8_with_result(input);
+
+        // The count uses 0xFF as a special key for invalid UTF-8 sequences
+        REQUIRE_FALSE(result.sanitized_char_counts.empty());
+        size_t total = 0;
+        for (auto const& [ch, count] : result.sanitized_char_counts) {
+            total += count;
+        }
+        REQUIRE(total == 3);
+    }
+
+    SECTION("returns empty counts when no sanitization needed") {
+        std::string input = "valid UTF-8 string";
+
+        auto [result, output] = sanitize_utf8_with_result(input);
+
+        REQUIRE(result.sanitized_char_counts.empty());
+        REQUIRE(result.new_buf_occupied == input.size());
+    }
+}
+
+TEST_CASE("sanitize_utf8_buffer_handles_buffer_growth", "[clp_s][StringUtils]") {
+    // Each invalid byte (1 byte) becomes U+FFFD (3 bytes), so buffer may need to grow
+    SECTION("many invalid bytes requiring expansion") {
+        // 100 invalid bytes -> 300 bytes after replacement with U+FFFD
+        std::string input(100, '\xFF');
+
+        std::string sanitized = sanitize_utf8_string(input);
+
+        // Verify all bytes were replaced
+        REQUIRE(sanitized.find('\xFF') == std::string::npos);
+        // Verify correct number of replacement characters
+        REQUIRE(count_replacement_chars(sanitized) == 100);
+        // Verify size: 100 * 3 = 300 bytes
+        REQUIRE(sanitized.size() == 300);
+    }
+}
+
+// =====================================================================================
+// JSON Sanitization Character Count Tests
+// =====================================================================================
+
+TEST_CASE("sanitize_json_buffer_returns_correct_char_counts", "[clp_s][StringUtils]") {
+    SECTION("counts multiple different control characters") {
+        // Input with: 3x \x00, 2x \x01, 1x \x1f
+        auto input = "{\"a\": \"\x00\x00\x01\x00\x01\x1f\"}"s;
+
+        auto [result, output] = sanitize_json_with_result(input);
+
+        REQUIRE(result.sanitized_char_counts.size() == 3);
+        REQUIRE(result.sanitized_char_counts.at('\x00') == 3);
+        REQUIRE(result.sanitized_char_counts.at('\x01') == 2);
+        REQUIRE(result.sanitized_char_counts.at('\x1f') == 1);
+    }
+
+    SECTION("returns empty counts when no sanitization needed") {
+        std::string input = R"({"key": "valid value"})";
+
+        auto [result, output] = sanitize_json_with_result(input);
+
+        REQUIRE(result.sanitized_char_counts.empty());
+        REQUIRE(result.new_buf_occupied == input.size());
+    }
+}