@@ -274,6 +274,57 @@ def post_handler_paragraph(content: str, limit: int):
274274 return reduce (lambda x , y : [* x , * y ], map (lambda row : re .findall (pattern , row ), result ), [])
275275
276276
def smart_split_paragraph(content: str, limit: int):
    """
    Split text into chunks of at most ``limit`` characters, cutting at a
    natural break point (sentence punctuation, newline, space) when one exists.

    For every window of ``limit`` characters the function scans backwards from
    the end of the window, but only through its second half, so each chunk
    keeps more than half of ``limit`` characters. The delimiter closest to the
    end of the window wins; the delimiter itself stays at the end of the chunk
    it terminates, so a sentence never starts with the previous sentence's
    punctuation. If the scanned range contains no delimiter, the window is cut
    at exactly ``limit`` characters.

    :param content: text to split
    :param limit: maximum number of characters per chunk
    :return: list of chunks in original order; whitespace-only chunks are
             dropped. Text that already fits within ``limit`` is returned
             unchanged as a single-element list (even if it is blank).
    :raises ValueError: if ``limit`` is not positive while ``content`` is
                        longer than ``limit`` (no progress would be possible)
    """
    if len(content) <= limit:
        return [content]
    if limit <= 0:
        # Without this guard the cursor below never advances and the loop hangs.
        raise ValueError(f"limit must be a positive integer, got {limit!r}")

    # Characters after which a chunk may end: sentence terminators, newline,
    # clause punctuation and the plain space.
    delimiters = '。!?\n;, '
    total = len(content)
    # Never look for a delimiter in the first half of the window, otherwise a
    # chunk could shrink to a handful of characters.
    min_keep = limit // 2

    chunks = []
    start = 0
    while start < total:
        end = start + limit
        if end >= total:
            # The remainder fits into a single chunk.
            chunks.append(content[start:])
            break

        # Scan backwards for the delimiter nearest to the window end and cut
        # right after it; fall back to a hard cut at ``end`` if none is found.
        split_at = next(
            (i + 1 for i in range(end - 1, start + min_keep, -1) if content[i] in delimiters),
            end,
        )
        chunks.append(content[start:split_at])
        start = split_at

    return [chunk for chunk in chunks if chunk.strip()]
326+
327+
277328replace_map = {
278329 re .compile ('\n +' ): '\n ' ,
279330 re .compile (' +' ): ' ' ,
@@ -316,7 +367,7 @@ def parse_to_tree(self, text: str, index=0):
316367 """
317368 level_content_list = parse_title_level (text , self .content_level_pattern , index )
318369 if len (level_content_list ) == 0 :
319- return [to_tree_obj (row , 'block' ) for row in post_handler_paragraph (text , limit = self .limit )]
370+ return [to_tree_obj (row , 'block' ) for row in smart_split_paragraph (text , limit = self .limit )]
320371 if index == 0 and text .lstrip ().index (level_content_list [0 ]["content" ].lstrip ()) != 0 :
321372 level_content_list .insert (0 , to_tree_obj ("" ))
322373
@@ -325,7 +376,7 @@ def parse_to_tree(self, text: str, index=0):
325376 for i in range (len (level_title_content_list )):
326377 start_content : str = level_title_content_list [i ].get ('content' )
327378 if cursor < text .index (start_content , cursor ):
328- for row in post_handler_paragraph (text [cursor : text .index (start_content , cursor )], limit = self .limit ):
379+ for row in smart_split_paragraph (text [cursor : text .index (start_content , cursor )], limit = self .limit ):
329380 level_content_list .insert (0 , to_tree_obj (row , 'block' ))
330381
331382 block , cursor = get_level_block (text , level_title_content_list , i , cursor )
0 commit comments