Skip to content

Commit 9f1c37b

Browse files
anatolykoptev and claude committed
fix: Add Unicode sanitization for cloud embedders
- Add _sanitize_unicode() function to remove surrogates
- Apply sanitization before all embedding API calls
- Add comprehensive tests for Unicode handling

Fixes production crashes with VoyageAI/OpenAI when texts contain emoji or Unicode surrogates (U+D800-U+DFFF).

Tested with:
- Emoji: '👋 🔥'
- Surrogates: '\ud800'
- International text: 中文, العربية, Тест

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 5771273 commit 9f1c37b

File tree

2 files changed

+105
-0
lines changed

2 files changed

+105
-0
lines changed

src/memos/embedders/universal_api.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,21 @@
1414
logger = get_logger(__name__)
1515

1616

17+
def _sanitize_unicode(text: str) -> str:
18+
"""
19+
Remove Unicode surrogates and other problematic characters.
20+
Surrogates (U+D800-U+DFFF) cause UnicodeEncodeError with some APIs.
21+
"""
22+
try:
23+
# Encode with 'surrogatepass' then decode, replacing invalid chars
24+
cleaned = text.encode("utf-8", errors="surrogatepass").decode("utf-8", errors="replace")
25+
# Replace replacement char with empty string for cleaner output
26+
return cleaned.replace("\ufffd", "")
27+
except Exception:
28+
# Fallback: remove all non-BMP characters
29+
return "".join(c for c in text if ord(c) < 0x10000)
30+
31+
1732
class UniversalAPIEmbedder(BaseEmbedder):
1833
def __init__(self, config: UniversalAPIEmbedderConfig):
1934
self.provider = config.provider
@@ -54,6 +69,8 @@ def __init__(self, config: UniversalAPIEmbedderConfig):
5469
def embed(self, texts: list[str]) -> list[list[float]]:
5570
if isinstance(texts, str):
5671
texts = [texts]
72+
# Sanitize Unicode to prevent encoding errors with emoji/surrogates
73+
texts = [_sanitize_unicode(t) for t in texts]
5774
# Truncate texts if max_tokens is configured
5875
texts = self._truncate_texts(texts)
5976
logger.info(f"Embeddings request with input: {texts}")
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
"""
2+
Tests for Unicode sanitization in embedders.
3+
"""
4+
5+
import pytest
6+
7+
8+
def _sanitize_unicode(text: str) -> str:
9+
"""
10+
Remove Unicode surrogates and other problematic characters.
11+
Surrogates (U+D800-U+DFFF) cause UnicodeEncodeError with some APIs.
12+
"""
13+
try:
14+
# Encode with 'surrogatepass' then decode, replacing invalid chars
15+
cleaned = text.encode("utf-8", errors="surrogatepass").decode("utf-8", errors="replace")
16+
# Replace replacement char with empty string for cleaner output
17+
return cleaned.replace("\ufffd", "")
18+
except Exception:
19+
# Fallback: remove all non-BMP characters
20+
return "".join(c for c in text if ord(c) < 0x10000)
21+
22+
23+
class TestUnicodeSanitization:
    """Behavioural tests for ``_sanitize_unicode``.

    Each test pins the property named in its docstring: ordinary text
    (ASCII, emoji, non-Latin scripts) must pass through untouched, and the
    output must never contain a surrogate code point, so it can always be
    encoded as UTF-8.
    """

    @staticmethod
    def _has_surrogate(text: str) -> bool:
        """Return True if *text* contains any code point in U+D800-U+DFFF."""
        return any(0xD800 <= ord(ch) <= 0xDFFF for ch in text)

    def test_emoji_handling(self):
        """Test that emoji are preserved."""
        text = "Hello 👋 world 🌍"
        result = _sanitize_unicode(text)
        # Real (non-surrogate) emoji are valid Unicode and must survive intact.
        assert result == text

    def test_surrogate_removal(self):
        """Test that surrogates are removed."""
        text = "Hello\ud800world"  # Lone surrogate in the middle
        result = _sanitize_unicode(text)
        # Only the surrogate is dropped; the surrounding text is untouched.
        assert result == "Helloworld"
        assert "\ud800" not in result

    def test_mixed_unicode(self):
        """Test mixed Unicode characters."""
        text = "Test 中文 العربية Тест"
        result = _sanitize_unicode(text)
        # International characters should be preserved exactly.
        assert result == text

    def test_empty_string(self):
        """Test empty string handling."""
        assert _sanitize_unicode("") == ""

    def test_ascii_only(self):
        """Test that ASCII text is unchanged."""
        text = "Hello World 123"
        assert _sanitize_unicode(text) == text

    def test_multiple_surrogates(self):
        """Test multiple surrogates are handled."""
        text = "\ud800\udc00test\ud83d\ude00"
        result = _sanitize_unicode(text)
        assert "test" in result
        # No surrogate may remain, whether the pairs are merged or dropped.
        assert not self._has_surrogate(result)
        # Should not raise UnicodeEncodeError
        result.encode("utf-8")

    def test_list_of_texts(self):
        """Test sanitizing a list of texts."""
        texts = ["Normal text", "Emoji 👋", "Surrogate\ud800test", "Mixed 中文 🔥"]
        results = [_sanitize_unicode(t) for t in texts]
        assert len(results) == 4
        assert all(isinstance(r, str) for r in results)
        # Only the entry with a lone surrogate may change.
        assert results == ["Normal text", "Emoji 👋", "Surrogatetest", "Mixed 中文 🔥"]

    def test_encoding_to_utf8(self):
        """Test that result can be encoded to UTF-8."""
        problematic_texts = [
            "Hello\ud800world",
            "Test\ud83dEmoji",
            "\ud800\udc00\ud83d\ude00",
        ]
        for text in problematic_texts:
            result = _sanitize_unicode(text)
            assert not self._has_surrogate(result)
            # Should not raise UnicodeEncodeError
            encoded = result.encode("utf-8")
            assert isinstance(encoded, bytes)
85+
86+
87+
if __name__ == "__main__":
    # pytest.main() returns the run's exit status instead of exiting; pass it
    # on so running this file as a script fails when a test fails.
    raise SystemExit(pytest.main([__file__, "-v"]))

0 commit comments

Comments
 (0)