Skip to content

Commit c450584

Browse files
committed
[owl] Fix Thai language missing tone marks (#903)
* Update Zalgo text validation
* Allow 6 consecutive tone marks
* Update to Claude Sonnet 4.6
1 parent 605a345 commit c450584

File tree

3 files changed

+17
-10
lines changed

3 files changed

+17
-10
lines changed

clients/python/src/jamaibase/types/common.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,6 @@ def _is_bad_char(char: str, *, allow_newline: bool) -> bool:
7878
return True
7979

8080
# 3. Check for specific disallowed Unicode categories and blocks
81-
category = unicodedata.category(char)
82-
# Combining marks (e.g., for Zalgo text)
83-
if category.startswith("M"):
84-
return True
8581
# Box drawing
8682
if "\u2500" <= char <= "\u257f":
8783
return True
@@ -104,13 +100,19 @@ def _str_pre_validator(
104100
if disallow_empty_string and len(value) == 0:
105101
raise ValueError("Text is empty.")
106102

107-
# --- Simplified and Consolidated Character Validation ---
108-
# The generator expression is efficient as `any()` will short-circuit
109-
# on the first bad character found.
110-
value = "".join(char for char in value if not unicodedata.category(char).startswith("M"))
103+
# Reject excessive consecutive combining marks (Zalgo text) while
104+
# preserving valid scripts (Thai, Arabic, Hindi, Vietnamese, etc.)
105+
consecutive = 0
106+
for char in value:
107+
if unicodedata.category(char).startswith("M"):
108+
consecutive += 1
109+
if consecutive > 6:
110+
raise ValueError("Text contains excessive combining marks.")
111+
else:
112+
consecutive = 0
113+
111114
if any(_is_bad_char(char, allow_newline=allow_newline) for char in value):
112115
raise ValueError("Text contains disallowed or non-printable characters.")
113-
114116
return value
115117

116118

services/api/tests/gen_table/test_row_ops_v2.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1258,6 +1258,11 @@ def _check(rows: list[RowCompletionResponse], base: str, exc: list[str] = None):
12581258
"code": "row['result_column'] = int(row['input']) + int(row['input'])",
12591259
"expected": "4",
12601260
},
1261+
{
1262+
"input": "ยืนยัน",
1263+
"code": "msg='แจ้งซ่อม'; row['result_column']=f'{row['input']}{msg}'",
1264+
"expected": "ยืนยันแจ้งซ่อม",
1265+
},
12611266
# Test error handling:
12621267
{
12631268
"input": "DUMMY",

services/api/tests/routers/test_serving.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -818,7 +818,7 @@ def test_chat_reasoning_anthropic(setup: ServingContext, stream: bool):
818818
# Test default params
819819
response = _test_chat_reasoning_cloud(**kwargs)
820820
assert len(response.content) > 0
821-
kwargs["routing_id"] = "claude-3-7-sonnet-latest"
821+
kwargs["routing_id"] = "claude-4-6-sonnet-latest"
822822
response = _test_chat_reasoning_cloud(**kwargs)
823823
assert len(response.content) > 0
824824
# Test disabling reasoning

0 commit comments

Comments
 (0)