diff --git a/api/src/services/text_processing/text_processor.py b/api/src/services/text_processing/text_processor.py index 483618f9..d6402e35 100644 --- a/api/src/services/text_processing/text_processor.py +++ b/api/src/services/text_processing/text_processor.py @@ -5,6 +5,7 @@ from typing import AsyncGenerator, Dict, List, Tuple, Optional from loguru import logger +from unicode_segment.sentence import SentenceSegmenter from ...core.config import settings from ...structures.schemas import NormalizationOptions @@ -12,6 +13,8 @@ from .phonemizer import phonemize from .vocabulary import tokenize +sentence_segmenter = SentenceSegmenter() + # Pre-compiled regex patterns for performance # Updated regex to be more strict and avoid matching isolated brackets # Only matches complete patterns like [word](/ipa/) and prevents catastrophic backtracking @@ -100,24 +103,14 @@ def process_text(text: str, language: str = "a") -> List[int]: def get_sentence_info( - text: str, lang_code: str = "a" + text: str, + # currently unused, as the Unicode sentence segmentation algorithm is language-agnostic + lang_code: str = "a", ) -> List[Tuple[str, List[int], int]]: """Process all sentences and return info""" - # Detect Chinese text - is_chinese = lang_code.startswith("z") or re.search(r"[\u4e00-\u9fff]", text) - if is_chinese: - # Split using Chinese punctuation - sentences = re.split(r"([,。!?;])+", text) - else: - sentences = re.split(r"([.!?;:])(?=\s|$)", text) - results = [] - for i in range(0, len(sentences), 2): - sentence = sentences[i].strip() - punct = sentences[i + 1] if i + 1 < len(sentences) else "" - if not sentence: - continue - full = sentence + punct + results = [] + for _, full in sentence_segmenter.segment(text): # Strip the full sentence to remove any leading/trailing spaces before processing full = full.strip() if not full: # Skip if empty after stripping diff --git a/api/tests/test_text_processor.py b/api/tests/test_text_processor.py index 3fc8a87c..9e282503 100644 ---
a/api/tests/test_text_processor.py +++ b/api/tests/test_text_processor.py @@ -44,6 +44,28 @@ def test_get_sentence_info(): assert count == len(tokens) assert count > 0 +def test_get_sentence_info_with_unicode_sentence_segmenting(): + text = "".join([ + "This, that, the other thing, etc. Another sentence... A, b, c, etc., and ", + "more. D, e, f, etc. and more. One, i. e. two. Three, i. e., four. Five, ", + "i.e. six. You have 4.2 messages. Property access: `a.b.c`.", + ]) + + info = get_sentence_info(text) + sentences = [s for (s, _, _) in info] + + assert sentences == [ + "This, that, the other thing, etc.", + "Another sentence...", + "A, b, c, etc., and more.", + "D, e, f, etc. and more.", + "One, i. e. two.", + "Three, i. e., four.", + "Five, i.e. six.", + "You have 4.2 messages.", + "Property access: `a.b.c`.", + ] + @pytest.mark.asyncio async def test_smart_split_short_text(): """Test smart splitting with text under max tokens.""" @@ -230,4 +252,4 @@ async def test_smart_split_with_two_pause(): # Third chunk: text assert chunks[2][2] is None # No pause assert "zero point five" in chunks[2][0] - assert len(chunks[2][1]) > 0 \ No newline at end of file + assert len(chunks[2][1]) > 0 diff --git a/docs/requirements.in b/docs/requirements.in index e0fb8269..93147804 100644 --- a/docs/requirements.in +++ b/docs/requirements.in @@ -18,7 +18,8 @@ soundfile==0.13.0 # Text processing phonemizer==3.3.0 -regex==2024.11.6 +regex==2025.11.3 +unicode-segment==0.4.2 # Utilities aiofiles==23.2.1 # Last version before Windows path handling changes diff --git a/docs/requirements.txt b/docs/requirements.txt index e47e4d92..3f2aaba7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -169,12 +169,13 @@ referencing==0.35.1 # via # jsonschema # jsonschema-specifications -regex==2024.11.6 +regex==2025.11.3 # via # -r docs/requirements.in # segments # tiktoken # transformers + # unicode-segment requests==2.32.3 # via # -r docs/requirements.in @@ -233,6 +234,8 @@ 
typing-extensions==4.12.2 # pydantic-core # sqlalchemy # uvicorn +unicode-segment==0.4.2 + # via -r docs/requirements.in uritemplate==4.1.1 # via csvw urllib3==2.3.0 diff --git a/pyproject.toml b/pyproject.toml index 13447b57..302303eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 +18,7 @@ dependencies = [ "scipy==1.14.1", # Audio processing "soundfile==0.13.0", - "regex==2024.11.6", + "regex==2025.11.3", # Utilities "aiofiles==23.2.1", "tqdm==4.67.1", @@ -40,6 +40,7 @@ dependencies = [ "phonemizer-fork>=3.3.2", "av>=14.2.0", "text2num>=2.5.1", + "unicode-segment==0.4.2", ] [project.optional-dependencies]