@@ -88,32 +88,48 @@ def process_text(text: str, language: str = "a") -> List[int]:
8888
8989
# Any character in the CJK Unified Ideographs block marks the text as Chinese.
_CJK_CHAR_RE = re.compile(r"[\u4e00-\u9fff]")

# Sentence terminators for Chinese text: the ASCII forms plus the ideographic
# full stop (U+3002) and the full-width comma, exclamation mark, question mark
# and semicolon that native Chinese text actually uses.
_CHINESE_SPLIT_RE = re.compile(r"([,!?;\u3002\uff0c\uff01\uff1f\uff1b])")

# Default terminators: only split when followed by whitespace or end of text,
# so that e.g. "3.14" is not broken in two.
_DEFAULT_SPLIT_RE = re.compile(r"([.!?;:])(?=\s|$)")


def _restore_custom_phonemes(
    sentence: str,
    custom_phenomes_list: Dict[str, str],
    first_id: int,
    id_count: int,
) -> Tuple[str, int]:
    """Replace custom-phoneme placeholders found in ``sentence``.

    Placeholders numbered ``first_id`` .. ``id_count - 1`` are checked. Each
    one found is replaced by its value, which is popped from
    ``custom_phenomes_list`` (the mapping is mutated in place).

    Returns:
        The rewritten sentence and the number of placeholders consumed.
    """
    consumed = 0
    for replaced in range(first_id, id_count):
        current_id = f"</|custom_phonemes_{replaced}|/>"
        if current_id in sentence:
            sentence = sentence.replace(
                current_id, custom_phenomes_list.pop(current_id)
            )
            consumed += 1
    return sentence, consumed


def get_sentence_info(
    text: str, custom_phenomes_list: Dict[str, str], lang_code: str = "a"
) -> List[Tuple[str, List[int], int]]:
    """Split ``text`` into sentences and tokenize each one.

    Text is treated as Chinese when ``lang_code`` starts with ``"zh"`` or the
    text contains at least one CJK ideograph; it is then split on Chinese
    (and ASCII) punctuation. Otherwise it is split on ``. ! ? ; :`` followed
    by whitespace or end of text.

    Custom-phoneme placeholders are substituted in both modes.

    Args:
        text: Input text, possibly containing
            ``</|custom_phonemes_N|/>`` placeholders.
        custom_phenomes_list: Mapping of placeholder -> replacement text.
            Entries are popped as they are used, so the caller's dict is
            mutated.
        lang_code: Language code used to select the splitting rules.

    Returns:
        One ``(sentence, tokens, token_count)`` tuple per non-empty sentence,
        in text order, with the terminating punctuation kept on the sentence.
    """
    # NOTE(review): a bare "z" language code is only recognised through the
    # CJK character scan below - confirm which codes callers actually pass.
    is_chinese = lang_code.startswith("zh") or bool(_CJK_CHAR_RE.search(text))
    splitter = _CHINESE_SPLIT_RE if is_chinese else _DEFAULT_SPLIT_RE

    # A split on a single capturing group yields
    # [body, punct, body, punct, ..., body]: bodies sit at even indices.
    parts = splitter.split(text)

    phoneme_length, min_value = len(custom_phenomes_list), 0
    results = []
    for i in range(0, len(parts), 2):
        punct = parts[i + 1] if i + 1 < len(parts) else ""
        if is_chinese:
            # Glue the terminator back on before stripping, so a segment that
            # is only punctuation is still emitted (existing behaviour).
            sentence = (parts[i] + punct).strip()
            punct = ""
        else:
            sentence = parts[i].strip()

        sentence, consumed = _restore_custom_phonemes(
            sentence, custom_phenomes_list, min_value, phoneme_length
        )
        # Placeholders already consumed are not searched for again.
        min_value += consumed

        if not sentence:
            continue

        full = sentence + punct
        tokens = process_text_chunk(full)
        results.append((full, tokens, len(tokens)))
    return results
118134
119135
@@ -150,7 +166,7 @@ async def smart_split(
150166 )
151167
152168 # Process all sentences
153- sentences = get_sentence_info (text , custom_phoneme_list )
169+ sentences = get_sentence_info (text , custom_phoneme_list , lang_code = lang_code )
154170
155171 current_chunk = []
156172 current_tokens = []
0 commit comments