22 changes: 7 additions & 15 deletions api/src/services/text_processing/text_processor.py

@@ -5,13 +5,16 @@
 from typing import AsyncGenerator, Dict, List, Tuple, Optional
 
 from loguru import logger
+from unicode_segment.sentence import SentenceSegmenter
 
 from ...core.config import settings
 from ...structures.schemas import NormalizationOptions
 from .normalizer import normalize_text
 from .phonemizer import phonemize
 from .vocabulary import tokenize
 
+sentence_segmenter = SentenceSegmenter()
+
 # Pre-compiled regex patterns for performance
 # Updated regex to be more strict and avoid matching isolated brackets
 # Only matches complete patterns like [word](/ipa/) and prevents catastrophic backtracking
@@ -100,24 +103,13 @@ def process_text(text: str, language: str = "a") -> List[int]:
 
 
 def get_sentence_info(
-    text: str, lang_code: str = "a"
+    text: str,
+    # currently unused, as the Unicode sentence segmentation algorithm is language-agnostic
+    lang_code: str = "a",
 ) -> List[Tuple[str, List[int], int]]:
     """Process all sentences and return info"""
-    # Detect Chinese text
-    is_chinese = lang_code.startswith("z") or re.search(r"[\u4e00-\u9fff]", text)
-    if is_chinese:
-        # Split using Chinese punctuation
-        sentences = re.split(r"([,。!?;])+", text)
-    else:
-        sentences = re.split(r"([.!?;:])(?=\s|$)", text)
-
     results = []
-    for i in range(0, len(sentences), 2):
-        sentence = sentences[i].strip()
-        punct = sentences[i + 1] if i + 1 < len(sentences) else ""
-        if not sentence:
-            continue
-        full = sentence + punct
+    for _, full in sentence_segmenter.segment(text):
         # Strip the full sentence to remove any leading/trailing spaces before processing
         full = full.strip()
         if not full:  # Skip if empty after stripping
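Reviewer note: a minimal before/after sketch of the splitting change, runnable standalone. The regex reconstruction mirrors the removed lines above; the expected segmenter output follows the new test in api/tests/test_text_processor.py. The only unicode-segment API assumed is the one this diff itself uses: SentenceSegmenter().segment(text) yielding pairs whose second element is the sentence text.

import re

from unicode_segment.sentence import SentenceSegmenter

text = "D, e, f, etc. and more. One, i. e. two."

# Old approach (removed above): split on terminal punctuation followed by
# whitespace or end-of-string, then re-attach the captured punctuation.
parts = re.split(r"([.!?;:])(?=\s|$)", text)
old = [
    parts[i].strip() + (parts[i + 1] if i + 1 < len(parts) else "")
    for i in range(0, len(parts), 2)
    if parts[i].strip()
]
print(old)  # ['D, e, f, etc.', 'and more.', 'One, i.', 'e.', 'two.']

# New approach: Unicode sentence segmentation (UAX #29), which does not break
# after a period that is followed by a lowercase letter.
new = [full.strip() for _, full in SentenceSegmenter().segment(text)]
print(new)  # expected, per the new test below: ['D, e, f, etc. and more.', 'One, i. e. two.']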
24 changes: 23 additions & 1 deletion api/tests/test_text_processor.py

@@ -44,6 +44,28 @@ def test_get_sentence_info():
     assert count == len(tokens)
     assert count > 0
 
+def test_get_sentence_info_with_unicode_sentence_segmenting():
+    text = "".join([
+        "This, that, the other thing, etc. Another sentence... A, b, c, etc., and ",
+        "more. D, e, f, etc. and more. One, i. e. two. Three, i. e., four. Five, ",
+        "i.e. six. You have 4.2 messages. Property access: `a.b.c`.",
+    ])
+
+    info = get_sentence_info(text)
+    sentences = [s for (s, _, _) in info]
+
+    assert sentences == [
+        "This, that, the other thing, etc.",
+        "Another sentence...",
+        "A, b, c, etc., and more.",
+        "D, e, f, etc. and more.",
+        "One, i. e. two.",
+        "Three, i. e., four.",
+        "Five, i.e. six.",
+        "You have 4.2 messages.",
+        "Property access: `a.b.c`.",
+    ]
+
 @pytest.mark.asyncio
 async def test_smart_split_short_text():
     """Test smart splitting with text under max tokens."""
@@ -230,4 +252,4 @@ async def test_smart_split_with_two_pause():
     # Third chunk: text
     assert chunks[2][2] is None  # No pause
     assert "zero point five" in chunks[2][0]
-    assert len(chunks[2][1]) > 0
\ No newline at end of file
+    assert len(chunks[2][1]) > 0
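Reviewer note: the removed is_chinese branch has no replacement test above. A hypothetical spot-check, assuming the segmenter implements UAX #29, under which 。, ！, and ？ are sentence terminators, so Chinese text needs no language-specific branch:

from unicode_segment.sentence import SentenceSegmenter

segmenter = SentenceSegmenter()
text = "你好世界。今天天气很好！你去哪里？"
print([full for _, full in segmenter.segment(text)])
# expected if UAX #29 is followed: ['你好世界。', '今天天气很好！', '你去哪里？']

One behavioral difference worth flagging: the old Chinese branch also split on commas and semicolons, which UAX #29 treats as non-terminal, so Chinese chunks may come out longer than before.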
3 changes: 2 additions & 1 deletion docs/requirements.in

@@ -18,7 +18,8 @@ soundfile==0.13.0
 
 # Text processing
 phonemizer==3.3.0
-regex==2024.11.6
+regex==2025.11.3
+unicode-segment==0.4.2
 
 # Utilities
 aiofiles==23.2.1 # Last version before Windows path handling changes
5 changes: 4 additions & 1 deletion docs/requirements.txt

@@ -169,12 +169,13 @@ referencing==0.35.1
     # via
     #   jsonschema
     #   jsonschema-specifications
-regex==2024.11.6
+regex==2025.11.3
     # via
     #   -r docs/requirements.in
     #   segments
     #   tiktoken
     #   transformers
+    #   unicode-segment
 requests==2.32.3
     # via
     #   -r docs/requirements.in
@@ -233,6 +234,8 @@ typing-extensions==4.12.2
     #   pydantic-core
     #   sqlalchemy
     #   uvicorn
+unicode-segment==0.4.2
+    # via -r docs/requirements.in
 uritemplate==4.1.1
     # via csvw
 urllib3==2.3.0
3 changes: 2 additions & 1 deletion pyproject.toml

@@ -18,7 +18,7 @@ dependencies = [
     "scipy==1.14.1",
     # Audio processing
     "soundfile==0.13.0",
-    "regex==2024.11.6",
+    "regex==2025.11.3",
     # Utilities
     "aiofiles==23.2.1",
     "tqdm==4.67.1",
@@ -40,6 +40,7 @@
     "phonemizer-fork>=3.3.2",
     "av>=14.2.0",
     "text2num>=2.5.1",
+    "unicode-segment==0.4.2",
 ]
 
 [project.optional-dependencies]