[owl] Fix Thai language missing tone marks (#903)

haoshan98 · haoshan98 · commit c45058446e33 · 2026-03-05T04:34:48.000Z
* Update Zalgo text validation: reject excessive runs of combining marks instead of stripping all combining marks

* Allow up to 6 consecutive combining marks (e.g. Thai tone marks)

* Update the Anthropic reasoning serving test to Claude Sonnet 4.6
diff --git a/clients/python/src/jamaibase/types/common.py b/clients/python/src/jamaibase/types/common.py
@@ -78,10 +78,6 @@ def _is_bad_char(char: str, *, allow_newline: bool) -> bool:
         return True
 
     # 3. Check for specific disallowed Unicode categories and blocks
-    category = unicodedata.category(char)
-    # Combining marks (e.g., for Zalgo text)
-    if category.startswith("M"):
-        return True
     # Box drawing
     if "\u2500" <= char <= "\u257f":
         return True
@@ -104,13 +100,19 @@ def _str_pre_validator(
     if disallow_empty_string and len(value) == 0:
         raise ValueError("Text is empty.")
 
-    # --- Simplified and Consolidated Character Validation ---
-    # The generator expression is efficient as `any()` will short-circuit
-    # on the first bad character found.
-    value = "".join(char for char in value if not unicodedata.category(char).startswith("M"))
+    # Reject excessive consecutive combining marks (Zalgo text) while
+    # preserving valid scripts (Thai, Arabic, Hindi, Vietnamese, etc.)
+    consecutive = 0
+    for char in value:
+        if unicodedata.category(char).startswith("M"):
+            consecutive += 1
+            if consecutive > 6:
+                raise ValueError("Text contains excessive combining marks.")
+        else:
+            consecutive = 0
+
     if any(_is_bad_char(char, allow_newline=allow_newline) for char in value):
         raise ValueError("Text contains disallowed or non-printable characters.")
-
     return value
 
 
diff --git a/services/api/tests/gen_table/test_row_ops_v2.py b/services/api/tests/gen_table/test_row_ops_v2.py
@@ -1258,6 +1258,11 @@ def _check(rows: list[RowCompletionResponse], base: str, exc: list[str] = None):
             "code": "row['result_column'] = int(row['input']) + int(row['input'])",
             "expected": "4",
         },
+        {
+            "input": "ยืนยัน",
+            "code": "msg='แจ้งซ่อม'; row['result_column']=f'{row['input']}{msg}'",
+            "expected": "ยืนยันแจ้งซ่อม",
+        },
         # Test error handling:
         {
             "input": "DUMMY",
diff --git a/services/api/tests/routers/test_serving.py b/services/api/tests/routers/test_serving.py
@@ -818,7 +818,7 @@ def test_chat_reasoning_anthropic(setup: ServingContext, stream: bool):
     # Test default params
     response = _test_chat_reasoning_cloud(**kwargs)
     assert len(response.content) > 0
-    kwargs["routing_id"] = "claude-3-7-sonnet-latest"
+    kwargs["routing_id"] = "claude-4-6-sonnet-latest"
     response = _test_chat_reasoning_cloud(**kwargs)
     assert len(response.content) > 0
     # Test disabling reasoning