Skip to content

Commit 9c279f2

Browse files
committed
feat(text): add Chinese punctuation-based sentence splitting for better TTS
1 parent ce22f60 commit 9c279f2

File tree

2 files changed

+41
-25
lines changed

2 files changed

+41
-25
lines changed

api/src/services/text_processing/normalizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
import inflect
1313
from numpy import number
14-
from text_to_num import text2num
14+
# from text_to_num import text2num
1515
from torch import mul
1616

1717
from ...structures.schemas import NormalizationOptions

api/src/services/text_processing/text_processor.py

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -88,32 +88,48 @@ def process_text(text: str, language: str = "a") -> List[int]:
8888

8989

9090
def get_sentence_info(
    text: str, custom_phenomes_list: Dict[str, str], lang_code: str = "a"
) -> List[Tuple[str, List[int], int]]:
    """Split *text* into sentences and tokenize each one.

    Supports Chinese punctuation-based splitting in addition to the
    Western-punctuation default, so Chinese input yields natural TTS
    chunk boundaries.

    Args:
        text: Input text to split.
        custom_phenomes_list: Mapping from custom-phoneme marker ids
            (``</|custom_phonemes_N|/>``) to their replacement strings.
            Entries are popped as their markers are consumed.
        lang_code: Language code. A ``"zh"`` prefix — or the presence of
            CJK ideographs in *text* — selects Chinese splitting.

    Returns:
        A list of ``(sentence, tokens, token_count)`` tuples; empty
        sentences are dropped.
    """
    # Chinese mode if explicitly requested or CJK ideographs are present.
    is_chinese = lang_code.startswith("zh") or re.search(r"[\u4e00-\u9fff]", text)

    phoneme_length, min_value = len(custom_phenomes_list), 0
    results = []

    def restore_phonemes(sentence: str) -> str:
        # Swap any custom-phoneme markers back to their original strings.
        # min_value advances past already-consumed marker ids so each id
        # is only searched for once across the whole text.
        nonlocal min_value
        for replaced in range(min_value, phoneme_length):
            current_id = f"</|custom_phonemes_{replaced}|/>"
            if current_id in sentence:
                sentence = sentence.replace(
                    current_id, custom_phenomes_list.pop(current_id)
                )
                min_value += 1
        return sentence

    if is_chinese:
        # Split on Chinese punctuation, keeping each mark (via the capture
        # group) attached to the sentence it terminates.
        parts = re.split(r"([,。!?;])", text)
        merged = [parts[i] + parts[i + 1] for i in range(0, len(parts) - 1, 2)]
        # re.split with one capture group yields an odd-length list; the
        # final element is any unterminated trailing fragment.
        if len(parts) % 2 == 1 and parts[-1]:
            merged.append(parts[-1])
        for raw_sentence in merged:
            # Fix: restore custom phonemes here too — the Western branch
            # already did this, and skipping it would feed literal marker
            # text to the tokenizer for Chinese input.
            sentence = restore_phonemes(raw_sentence.strip())
            if not sentence:
                continue
            tokens = process_text_chunk(sentence)
            results.append((sentence, tokens, len(tokens)))
    else:
        # Western punctuation: split only when the mark is followed by
        # whitespace or end-of-string, so "3.14" stays intact.
        parts = re.split(r"([.!?;:])(?=\s|$)", text)
        for i in range(0, len(parts), 2):
            sentence = restore_phonemes(parts[i].strip())
            punct = parts[i + 1] if i + 1 < len(parts) else ""
            if not sentence:
                continue
            full = sentence + punct
            tokens = process_text_chunk(full)
            results.append((full, tokens, len(tokens)))

    return results
118134

119135

@@ -150,7 +166,7 @@ async def smart_split(
150166
)
151167

152168
# Process all sentences
153-
sentences = get_sentence_info(text, custom_phoneme_list)
169+
sentences = get_sentence_info(text, custom_phoneme_list, lang_code=lang_code)
154170

155171
current_chunk = []
156172
current_tokens = []

0 commit comments

Comments
 (0)