|
| 1 | +"""Tests for sanitize_json_control_chars helper function. |
| 2 | +
|
| 3 | +This module tests the sanitize_json_control_chars helper that escapes raw |
| 4 | +control characters (U+0000–U+001F) in JSON strings produced by LLMs. Some |
| 5 | +models (e.g. kimi-k2.5, minimax-m2.5) emit literal control bytes instead of |
| 6 | +legal two-character JSON escape sequences, which causes json.loads() to fail. |
| 7 | +""" |
| 8 | + |
| 9 | +import json |
| 10 | + |
| 11 | +from openhands.sdk.agent.utils import sanitize_json_control_chars |
| 12 | + |
| 13 | + |
def test_valid_json_unchanged():
    """Input that contains no raw control characters comes back as given."""
    payload = '{"command": "echo hello", "path": "/tmp"}'
    result = sanitize_json_control_chars(payload)
    assert result == payload
| 18 | + |
| 19 | + |
def test_literal_newline_escaped():
    """A raw line feed (0x0A) inside a JSON string value gets escaped."""
    payload = '{"command": "line1\nline2"}'
    cleaned = sanitize_json_control_chars(payload)
    # No raw line feed may survive sanitization.
    assert "\n" not in cleaned
    # Decoding the sanitized text must give back the original character.
    assert json.loads(cleaned)["command"] == "line1\nline2"
| 27 | + |
| 28 | + |
def test_literal_tab_escaped():
    """A raw horizontal tab (0x09) inside a JSON string value gets escaped."""
    payload = '{"indent": "col1\tcol2"}'
    cleaned = sanitize_json_control_chars(payload)
    # No raw tab may survive sanitization.
    assert "\t" not in cleaned
    # Decoding the sanitized text must give back the original character.
    assert json.loads(cleaned)["indent"] == "col1\tcol2"
| 36 | + |
| 37 | + |
def test_multiple_control_chars():
    """Tab, line feed and carriage return in one value all round-trip."""
    payload = '{"text": "a\tb\nc\rd"}'
    decoded = json.loads(sanitize_json_control_chars(payload))
    assert decoded["text"] == "a\tb\nc\rd"
| 44 | + |
| 45 | + |
def test_null_byte_escaped():
    """A raw NUL (0x00) is emitted as the six-character \\u0000 escape."""
    payload = '{"data": "before\x00after"}'
    cleaned = sanitize_json_control_chars(payload)
    # NUL has no short alias, so the generic \uXXXX form is expected.
    assert "\\u0000" in cleaned
    assert json.loads(cleaned)["data"] == "before\x00after"
| 53 | + |
| 54 | + |
def test_form_feed_and_backspace():
    """Backspace and form feed are written with their short aliases."""
    payload = '{"x": "a\x08b\x0cc"}'
    cleaned = sanitize_json_control_chars(payload)
    # Backspace (0x08) is checked first, then form feed (0x0C).
    for short_alias in ("\\b", "\\f"):
        assert short_alias in cleaned
    assert json.loads(cleaned)["x"] == "a\x08b\x0cc"
| 63 | + |
| 64 | + |
def test_already_escaped_sequences_preserved():
    """Existing backslash escapes must not be escaped a second time."""
    # Raw string: the JSON text itself carries \" and \\n escape sequences.
    payload = r'{"command": "echo \"hello\\nworld\""}'
    cleaned = sanitize_json_control_chars(payload)
    decoded = json.loads(cleaned)
    # The decoded value holds one literal backslash followed by "n";
    # double-escaping would add a second backslash and break this match.
    assert "hello\\nworld" in decoded["command"]
| 72 | + |
| 73 | + |
def test_empty_string():
    """Sanitizing the empty string yields the empty string."""
    result = sanitize_json_control_chars("")
    assert result == ""
| 77 | + |
| 78 | + |
def test_realistic_tool_call_arguments():
    """A malformed tool_call.arguments payload parses after sanitizing."""
    # The "command" value contains a backslash followed by a raw line feed,
    # i.e. a shell line continuation emitted without JSON escaping.
    payload = '{"command": "cd /workspace && \\\npython test.py", "path": "/workspace"}'
    decoded = json.loads(sanitize_json_control_chars(payload))
    assert "python test.py" in decoded["command"]
    assert decoded["path"] == "/workspace"
0 commit comments