Skip to content

Commit 9c3d0a5

Browse files
authored
Trailing bytes partial mode (#211)
1 parent 2f89404 commit 9c3d0a5

File tree

3 files changed

+52
-7
lines changed

3 files changed

+52
-7
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,7 +38,7 @@ python-bench: python-dev-release
3838

3939
.PHONY: bench
4040
bench:
41-
cargo bench -p jiter -F python
41+
cargo bench -p jiter -F python
4242

4343
.PHONY: fuzz
4444
fuzz:

crates/jiter-python/tests/test_jiter.py

Lines changed: 34 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -168,6 +168,40 @@ def test_partial_object_string_trailing_strings():
168168
assert parsed == {'title': 'Pride and Prejudice', 'author': 'Jane A'}
169169

170170

171+
def test_partial_json_invalid_utf8_bytes():
172+
missing_closing_quote_string = '"abc€'
173+
missing_closing_quote_bytes = missing_closing_quote_string.encode() # b'"abc\xe2\x82\xac'
174+
result = jiter.from_json(missing_closing_quote_bytes, partial_mode='trailing-strings')
175+
assert result == 'abc€'
176+
177+
# remove the last byte to create an incomplete (truncated) UTF-8 sequence
178+
non_unicode_partial_string_bytes = missing_closing_quote_bytes[:-1] # b'"abc\xe2\x82' - missing last byte of €
179+
180+
# This should fail by default (incomplete UTF-8 sequence)...
181+
with pytest.raises(ValueError, match='EOF while parsing a string'):
182+
jiter.from_json(non_unicode_partial_string_bytes)
183+
184+
# ...but succeed in partial mode by truncating to valid UTF-8 boundary
185+
result = jiter.from_json(non_unicode_partial_string_bytes, partial_mode='trailing-strings')
186+
assert result == 'abc'
187+
188+
# However, truly invalid UTF-8 (not just incomplete) should always raise an error
189+
# Byte \xff is always invalid in UTF-8
190+
for invalid_utf8_bytes in (b'"abc\xff', b'"abc\xffdef'):
191+
# This should fail by default (invalid UTF-8 sequence)...
192+
with pytest.raises(ValueError, match='EOF while parsing a string'):
193+
jiter.from_json(invalid_utf8_bytes)
194+
195+
# ...and it should ALSO fail in partial mode
196+
with pytest.raises(ValueError, match='invalid unicode code point'):
197+
jiter.from_json(invalid_utf8_bytes, partial_mode='trailing-strings')
198+
199+
# It should not truncate valid content
200+
valid_utf8_bytes = b'"abc\xe2\x82\xac"'
201+
result = jiter.from_json(valid_utf8_bytes, partial_mode='trailing-strings')
202+
assert result == 'abc€'
203+
204+
171205
def test_partial_nested():
172206
json = b'{"a": 1, "b": 2, "c": [1, 2, {"d": 1, '
173207
parsed = jiter.from_json(json, partial_mode=True)

crates/jiter/src/string_decoder.rs

Lines changed: 17 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -122,7 +122,7 @@ where
122122

123123
match decode_chunk(data, start, true, allow_partial)? {
124124
(StringChunk::StringEnd, ascii_only, index) => {
125-
let s = to_str(&data[start..index], ascii_only, start)?;
125+
let s = to_str(&data[start..index], ascii_only, start, allow_partial)?;
126126
Ok((unsafe { StringOutput::data(s, ascii_only) }, index + 1))
127127
}
128128
(StringChunk::Backslash, ascii_only, index) => {
@@ -162,7 +162,7 @@ fn decode_to_tape<'t, 'j>(
162162
}
163163
Err(e) => {
164164
if allow_partial && e.error_type == JsonErrorType::EofWhileParsingString {
165-
let s = to_str(tape, ascii_only, start)?;
165+
let s = to_str(tape, ascii_only, start, allow_partial)?;
166166
return Ok((unsafe { StringOutput::tape(s, ascii_only) }, e.index));
167167
}
168168
return Err(e);
@@ -173,7 +173,7 @@ fn decode_to_tape<'t, 'j>(
173173
index += 1;
174174
} else {
175175
if allow_partial {
176-
let s = to_str(tape, ascii_only, start)?;
176+
let s = to_str(tape, ascii_only, start, allow_partial)?;
177177
return Ok((unsafe { StringOutput::tape(s, ascii_only) }, index));
178178
}
179179
return json_err!(EofWhileParsingString, index);
@@ -183,7 +183,7 @@ fn decode_to_tape<'t, 'j>(
183183
(StringChunk::StringEnd, ascii_only, new_index) => {
184184
tape.extend_from_slice(&data[index..new_index]);
185185
index = new_index + 1;
186-
let s = to_str(tape, ascii_only, start)?;
186+
let s = to_str(tape, ascii_only, start, allow_partial)?;
187187
return Ok((unsafe { StringOutput::tape(s, ascii_only) }, index));
188188
}
189189
(StringChunk::Backslash, ascii_only_new, index_new) => {
@@ -344,13 +344,24 @@ static CHAR_TYPE: [CharType; 256] = {
344344
]
345345
};
346346

347-
fn to_str(bytes: &[u8], ascii_only: bool, start: usize) -> JsonResult<&str> {
347+
fn to_str(bytes: &[u8], ascii_only: bool, start: usize, allow_partial: bool) -> JsonResult<&str> {
348348
if ascii_only {
349349
// safety: in this case we've already confirmed that all characters are ascii, we can safely
350350
// transmute from bytes to str
351351
Ok(unsafe { from_utf8_unchecked(bytes) })
352352
} else {
353-
from_utf8(bytes).map_err(|e| json_error!(InvalidUnicodeCodePoint, start + e.valid_up_to() + 1))
353+
match from_utf8(bytes) {
354+
Ok(s) => Ok(s),
355+
Err(e) if allow_partial && e.error_len().is_none() => {
356+
// In partial mode, we handle incomplete (not invalid) UTF-8 sequences
357+
// by truncating to the last valid UTF-8 boundary
358+
// (`error_len()` is `None` for incomplete sequences)
359+
let valid_up_to = e.valid_up_to();
360+
// SAFETY: `valid_up_to()` returns the byte index up to which the input is valid UTF-8
361+
Ok(unsafe { from_utf8_unchecked(&bytes[..valid_up_to]) })
362+
}
363+
Err(e) => Err(json_error!(InvalidUnicodeCodePoint, start + e.valid_up_to() + 1)),
364+
}
354365
}
355366
}
356367

0 commit comments

Comments
 (0)