Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 19 additions & 1 deletion aider/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1006,7 +1006,7 @@ def send_completion(self, messages, functions, stream, temperature=None):
kwargs["timeout"] = request_timeout
if self.verbose:
dump(kwargs)
kwargs["messages"] = messages
kwargs["messages"] = sanitize_for_utf8(messages)

# Are we using github copilot?
if "GITHUB_COPILOT_TOKEN" in os.environ:
Expand Down Expand Up @@ -1067,6 +1067,24 @@ def simple_send_with_retries(self, messages):
return None


def sanitize_for_utf8(obj):
    """Return *obj* with every string made safe to encode as UTF-8.

    On some Windows systems (e.g. Chinese locale), file content or console
    input can contain surrogate code points (U+D800-U+DFFF). These cannot
    be encoded as UTF-8 and cause UnicodeEncodeError when the outgoing LLM
    request is JSON-encoded.

    Strings have each surrogate replaced with U+FFFD, the Unicode
    replacement character. Lists and dict values are walked recursively and
    rebuilt as new containers; dict keys and objects of any other type are
    returned unchanged. The input structure is never mutated.
    """
    if isinstance(obj, str):
        try:
            # Fast path: an already-encodable string is returned untouched,
            # avoiding a full copy of what may be large file content.
            obj.encode("utf-8")
        except UnicodeEncodeError:
            # str.encode(errors="replace") would substitute an ASCII "?",
            # which cannot be told apart from a genuine question mark.
            # Surrogates are the only code points a str can hold that UTF-8
            # cannot encode, so swap exactly those for U+FFFD.
            return "".join(
                "\ufffd" if "\ud800" <= ch <= "\udfff" else ch for ch in obj
            )
        return obj
    if isinstance(obj, dict):
        return {key: sanitize_for_utf8(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [sanitize_for_utf8(item) for item in obj]
    return obj


def register_models(model_settings_fnames):
files_loaded = []
for model_settings_fname in model_settings_fnames:
Expand Down
33 changes: 33 additions & 0 deletions tests/basic/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
Model,
ModelInfoManager,
register_models,
sanitize_for_utf8,
sanity_check_model,
sanity_check_models,
)
Expand Down Expand Up @@ -556,5 +557,37 @@ def test_use_temperature_in_send_completion(self, mock_completion):
)


class TestSanitizeForUtf8(unittest.TestCase):
    """Tests for sanitize_for_utf8: surrogate removal and pass-through cases."""

    def test_replaces_surrogates_in_string(self):
        # The surrogate must be gone while the surrounding text survives.
        text = "hello \udcb0 world"
        result = sanitize_for_utf8(text)
        self.assertNotIn("\udcb0", result)
        self.assertIn("hello", result)
        self.assertIn("world", result)

    def test_handles_nested_messages(self):
        import json

        messages = [
            {"role": "user", "content": "test \udcb0 content"},
            {"role": "assistant", "content": "clean content"},
        ]
        result = sanitize_for_utf8(messages)

        # ensure_ascii=False keeps non-ASCII characters raw in the output, so
        # the UTF-8 encode below raises if a surrogate survived. With the
        # default ensure_ascii=True every surrogate is escaped to ASCII and
        # this check could never fail.
        json.dumps(result, ensure_ascii=False).encode("utf-8")
        self.assertNotIn("\udcb0", result[0]["content"])
        self.assertEqual(result[1]["content"], "clean content")

    def test_preserves_non_surrogate_unicode(self):
        text = "hello \u4e16\u754c"  # Chinese characters
        result = sanitize_for_utf8(text)
        self.assertEqual(result, text)

    def test_passes_through_non_string_types(self):
        self.assertEqual(sanitize_for_utf8(42), 42)
        self.assertIsNone(sanitize_for_utf8(None))
        # assertIs rather than assertEqual: 1 == True would also pass.
        self.assertIs(sanitize_for_utf8(True), True)


# Run this module's tests when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()