22 changes: 7 additions & 15 deletions api/src/services/text_processing/text_processor.py

@@ -5,13 +5,16 @@
 from typing import AsyncGenerator, Dict, List, Tuple, Optional
 
 from loguru import logger
+from unicode_segment.sentence import SentenceSegmenter
 
 from ...core.config import settings
 from ...structures.schemas import NormalizationOptions
 from .normalizer import normalize_text
 from .phonemizer import phonemize
 from .vocabulary import tokenize
 
+sentence_segmenter = SentenceSegmenter()
+
 # Pre-compiled regex patterns for performance
 # Updated regex to be more strict and avoid matching isolated brackets
 # Only matches complete patterns like [word](/ipa/) and prevents catastrophic backtracking
@@ -100,24 +103,13 @@ def process_text(text: str, language: str = "a") -> List[int]:
 
 
 def get_sentence_info(
-    text: str, lang_code: str = "a"
+    text: str,
+    # currently unused, as the Unicode sentence segmentation algorithm is language-agnostic
+    lang_code: str = "a",
 ) -> List[Tuple[str, List[int], int]]:
     """Process all sentences and return info"""
-    # Detect Chinese text
-    is_chinese = lang_code.startswith("z") or re.search(r"[\u4e00-\u9fff]", text)
-    if is_chinese:
-        # Split using Chinese punctuation
-        sentences = re.split(r"([,。!?;])+", text)
-    else:
-        sentences = re.split(r"([.!?;:])(?=\s|$)", text)
-
     results = []
-    for i in range(0, len(sentences), 2):
-        sentence = sentences[i].strip()
-        punct = sentences[i + 1] if i + 1 < len(sentences) else ""
-        if not sentence:
-            continue
-        full = sentence + punct
+    for _, full in sentence_segmenter.segment(text):
         # Strip the full sentence to remove any leading/trailing spaces before processing
         full = full.strip()
         if not full:  # Skip if empty after stripping
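Reviewer note: a minimal before/after sketch of the splitting change, runnable standalone. The regex reconstruction mirrors the removed lines above; the expected segmenter output follows the new test in api/tests/test_text_processor.py. The only unicode-segment API assumed is the one this diff itself uses: SentenceSegmenter().segment(text) yielding pairs whose second element is the sentence text.

import re

from unicode_segment.sentence import SentenceSegmenter

text = "D, e, f, etc. and more. One, i. e. two."

# Old approach (removed above): split on terminal punctuation followed by
# whitespace or end-of-string, then re-attach the captured punctuation.
parts = re.split(r"([.!?;:])(?=\s|$)", text)
old = [
    parts[i].strip() + (parts[i + 1] if i + 1 < len(parts) else "")
    for i in range(0, len(parts), 2)
    if parts[i].strip()
]
print(old)  # ['D, e, f, etc.', 'and more.', 'One, i.', 'e.', 'two.']

# New approach: Unicode sentence segmentation (UAX #29), which does not break
# after a period that is followed by a lowercase letter.
new = [full.strip() for _, full in SentenceSegmenter().segment(text)]
print(new)  # expected, per the new test below: ['D, e, f, etc. and more.', 'One, i. e. two.']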
24 changes: 23 additions & 1 deletion api/tests/test_text_processor.py

@@ -44,6 +44,28 @@ def test_get_sentence_info():
     assert count == len(tokens)
     assert count > 0
 
+def test_get_sentence_info_with_unicode_sentence_segmenting():
+    text = "".join([
+        "This, that, the other thing, etc. Another sentence... A, b, c, etc., and ",
+        "more. D, e, f, etc. and more. One, i. e. two. Three, i. e., four. Five, ",
+        "i.e. six. You have 4.2 messages. Property access: `a.b.c`.",
+    ])
+
+    info = get_sentence_info(text)
+    sentences = [s for (s, _, _) in info]
+
+    assert sentences == [
+        "This, that, the other thing, etc.",
+        "Another sentence...",
+        "A, b, c, etc., and more.",
+        "D, e, f, etc. and more.",
+        "One, i. e. two.",
+        "Three, i. e., four.",
+        "Five, i.e. six.",
+        "You have 4.2 messages.",
+        "Property access: `a.b.c`.",
+    ]
+
 @pytest.mark.asyncio
 async def test_smart_split_short_text():
     """Test smart splitting with text under max tokens."""
@@ -230,4 +252,4 @@ async def test_smart_split_with_two_pause():
     # Third chunk: text
     assert chunks[2][2] is None  # No pause
     assert "zero point five" in chunks[2][0]
-    assert len(chunks[2][1]) > 0
\ No newline at end of file
+    assert len(chunks[2][1]) > 0
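Reviewer note: the removed is_chinese branch has no replacement test above. A hypothetical spot-check, assuming the segmenter implements UAX #29, under which 。, ！, and ？ are sentence terminators, so Chinese text needs no language-specific branch:

from unicode_segment.sentence import SentenceSegmenter

segmenter = SentenceSegmenter()
text = "你好世界。今天天气很好！你去哪里？"
print([full for _, full in segmenter.segment(text)])
# expected if UAX #29 is followed: ['你好世界。', '今天天气很好！', '你去哪里？']

One behavioral difference worth flagging: the old Chinese branch also split on commas and semicolons, which UAX #29 treats as non-terminal, so Chinese chunks may come out longer than before.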
3 changes: 2 additions & 1 deletion docs/requirements.in

@@ -18,7 +18,8 @@ soundfile==0.13.0
 
 # Text processing
 phonemizer==3.3.0
-regex==2024.11.6
+regex==2025.11.3
+unicode-segment==0.4.2
 
 # Utilities
 aiofiles==23.2.1 # Last version before Windows path handling changes
5 changes: 4 additions & 1 deletion docs/requirements.txt

@@ -169,12 +169,13 @@ referencing==0.35.1
     # via
     #   jsonschema
     #   jsonschema-specifications
-regex==2024.11.6
+regex==2025.11.3
     # via
     #   -r docs/requirements.in
     #   segments
     #   tiktoken
     #   transformers
+    #   unicode-segment
 requests==2.32.3
     # via
     #   -r docs/requirements.in
@@ -233,6 +234,8 @@ typing-extensions==4.12.2
     #   pydantic-core
     #   sqlalchemy
     #   uvicorn
+unicode-segment==0.4.2
+    # via -r docs/requirements.in
 uritemplate==4.1.1
     # via csvw
 urllib3==2.3.0
3 changes: 2 additions & 1 deletion pyproject.toml

@@ -18,7 +18,7 @@ dependencies = [
     "scipy==1.14.1",
     # Audio processing
     "soundfile==0.13.0",
-    "regex==2024.11.6",
+    "regex==2025.11.3",
     # Utilities
     "aiofiles==23.2.1",
     "tqdm==4.67.1",
@@ -40,6 +40,7 @@
     "phonemizer-fork>=3.3.2",
     "av>=14.2.0",
     "text2num>=2.5.1",
+    "unicode-segment==0.4.2",
 ]
 
 [project.optional-dependencies]