Trailing bytes partial mode (#211)

PrettyWood · web-flow · commit 9c3d0a5856e8 · 2025-08-12T10:53:05.000+01:00
diff --git a/Makefile b/Makefile
@@ -38,7 +38,7 @@ python-bench: python-dev-release
 
 .PHONY: bench
 bench:
-	cargo bench  -p jiter -F python
+	cargo bench -p jiter -F python
 
 .PHONY: fuzz
 fuzz:
diff --git a/crates/jiter-python/tests/test_jiter.py b/crates/jiter-python/tests/test_jiter.py
@@ -168,6 +168,40 @@ def test_partial_object_string_trailing_strings():
     assert parsed == {'title': 'Pride and Prejudice', 'author': 'Jane A'}
 
 
+def test_partial_json_invalid_utf8_bytes():
+    missing_closing_quote_string = '"abc€'
+    missing_closing_quote_bytes = missing_closing_quote_string.encode()  # b'"abc\xe2\x82\xac'
+    result = jiter.from_json(missing_closing_quote_bytes, partial_mode='trailing-strings')
+    assert result == 'abc€'
+
+    # Remove the last byte to leave an incomplete (truncated) UTF-8 sequence
+    non_unicode_partial_string_bytes = missing_closing_quote_bytes[:-1]  # b'"abc\xe2\x82' - missing last byte of €
+
+    # This should fail by default (the string is unterminated, so EOF is reported)...
+    with pytest.raises(ValueError, match='EOF while parsing a string'):
+        jiter.from_json(non_unicode_partial_string_bytes)
+
+    # ...but succeed in partial mode by truncating to valid UTF-8 boundary
+    result = jiter.from_json(non_unicode_partial_string_bytes, partial_mode='trailing-strings')
+    assert result == 'abc'
+
+    # However, truly invalid UTF-8 (not just incomplete) should always raise an error
+    # Byte \xff is always invalid in UTF-8
+    for invalid_utf8_bytes in (b'"abc\xff', b'"abc\xffdef'):
+        # This should fail by default (unterminated string, so EOF is reported)...
+        with pytest.raises(ValueError, match='EOF while parsing a string'):
+            jiter.from_json(invalid_utf8_bytes)
+
+        # ...and must ALSO fail in partial mode, this time as an invalid code point
+        with pytest.raises(ValueError, match='invalid unicode code point'):
+            jiter.from_json(invalid_utf8_bytes, partial_mode='trailing-strings')
+
+    # It should not truncate valid content
+    valid_utf8_bytes = b'"abc\xe2\x82\xac"'
+    result = jiter.from_json(valid_utf8_bytes, partial_mode='trailing-strings')
+    assert result == 'abc€'
+
+
 def test_partial_nested():
     json = b'{"a": 1, "b": 2, "c": [1, 2, {"d": 1, '
     parsed = jiter.from_json(json, partial_mode=True)
diff --git a/crates/jiter/src/string_decoder.rs b/crates/jiter/src/string_decoder.rs
@@ -122,7 +122,7 @@ where
 
         match decode_chunk(data, start, true, allow_partial)? {
             (StringChunk::StringEnd, ascii_only, index) => {
-                let s = to_str(&data[start..index], ascii_only, start)?;
+                let s = to_str(&data[start..index], ascii_only, start, allow_partial)?;
                 Ok((unsafe { StringOutput::data(s, ascii_only) }, index + 1))
             }
             (StringChunk::Backslash, ascii_only, index) => {
@@ -162,7 +162,7 @@ fn decode_to_tape<'t, 'j>(
                     }
                     Err(e) => {
                         if allow_partial && e.error_type == JsonErrorType::EofWhileParsingString {
-                            let s = to_str(tape, ascii_only, start)?;
+                            let s = to_str(tape, ascii_only, start, allow_partial)?;
                             return Ok((unsafe { StringOutput::tape(s, ascii_only) }, e.index));
                         }
                         return Err(e);
@@ -173,7 +173,7 @@ fn decode_to_tape<'t, 'j>(
             index += 1;
         } else {
             if allow_partial {
-                let s = to_str(tape, ascii_only, start)?;
+                let s = to_str(tape, ascii_only, start, allow_partial)?;
                 return Ok((unsafe { StringOutput::tape(s, ascii_only) }, index));
             }
             return json_err!(EofWhileParsingString, index);
@@ -183,7 +183,7 @@ fn decode_to_tape<'t, 'j>(
             (StringChunk::StringEnd, ascii_only, new_index) => {
                 tape.extend_from_slice(&data[index..new_index]);
                 index = new_index + 1;
-                let s = to_str(tape, ascii_only, start)?;
+                let s = to_str(tape, ascii_only, start, allow_partial)?;
                 return Ok((unsafe { StringOutput::tape(s, ascii_only) }, index));
             }
             (StringChunk::Backslash, ascii_only_new, index_new) => {
@@ -344,13 +344,24 @@ static CHAR_TYPE: [CharType; 256] = {
     ]
 };
 
-fn to_str(bytes: &[u8], ascii_only: bool, start: usize) -> JsonResult<&str> {
+fn to_str(bytes: &[u8], ascii_only: bool, start: usize, allow_partial: bool) -> JsonResult<&str> {
     if ascii_only {
         // safety: in this case we've already confirmed that all characters are ascii, we can safely
         // transmute from bytes to str
         Ok(unsafe { from_utf8_unchecked(bytes) })
     } else {
-        from_utf8(bytes).map_err(|e| json_error!(InvalidUnicodeCodePoint, start + e.valid_up_to() + 1))
+        match from_utf8(bytes) {
+            Ok(s) => Ok(s),
+            Err(e) if allow_partial && e.error_len().is_none() => {
+                // In partial mode, we handle incomplete (not invalid) UTF-8 sequences
+                // by truncating to the last valid UTF-8 boundary
+                // (`error_len()` is `None` for incomplete sequences)
+                let valid_up_to = e.valid_up_to();
+                // SAFETY: `valid_up_to()` returns the byte index up to which the input is valid UTF-8
+                Ok(unsafe { from_utf8_unchecked(&bytes[..valid_up_to]) })
+            }
+            Err(e) => Err(json_error!(InvalidUnicodeCodePoint, start + e.valid_up_to() + 1)),
+        }
     }
 }