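chore: replace split_text with smart_split_paragraph in pdf_split_handle.py; restrict smart_split_paragraph split points to sentence-ending punctuation and newlines, keeping the delimiter in the current segment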

liuruibin · liuruibin · commit d147b794ce22 · 2025-10-27T14:23:42.000+08:00
diff --git a/apps/common/handle/impl/text/pdf_split_handle.py b/apps/common/handle/impl/text/pdf_split_handle.py
@@ -19,7 +19,7 @@
 
 from common.handle.base_split_handle import BaseSplitHandle
 from common.utils.logger import maxkb_logger
-from common.utils.split_model import SplitModel
+from common.utils.split_model import SplitModel, smart_split_paragraph
 
 default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
                         re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
@@ -183,7 +183,7 @@ def handle_toc(doc, limit):
             real_chapter_title = chapter_title[:256]
             # 限制章节内容长度
             if 0 < limit < len(chapter_text):
-                split_text = PdfSplitHandle.split_text(chapter_text, limit)
+                split_text = smart_split_paragraph(chapter_text, limit)
                 for text in split_text:
                     chapters.append({"title": real_chapter_title, "content": text})
             else:
@@ -262,7 +262,7 @@ def handle_links(doc, pattern_list, with_filter, limit):
 
                     # 限制章节内容长度
                     if 0 < limit < len(chapter_text):
-                        split_text = PdfSplitHandle.split_text(chapter_text, limit)
+                        split_text = smart_split_paragraph(chapter_text, limit)
                         for text in split_text:
                             chapters.append({"title": link_title, "content": text})
                     else:
@@ -296,29 +296,6 @@ def handle_links(doc, pattern_list, with_filter, limit):
             chapters = pre_toc + chapters
         return chapters
 
-    @staticmethod
-    def split_text(text, length):
-        segments = []
-        current_segment = ""
-
-        for char in text:
-            current_segment += char
-            if len(current_segment) >= length:
-                # 查找最近的句号
-                last_period_index = current_segment.rfind('.')
-                if last_period_index != -1:
-                    segments.append(current_segment[:last_period_index + 1])
-                    current_segment = current_segment[last_period_index + 1:]  # 更新当前段落
-                else:
-                    segments.append(current_segment)
-                    current_segment = ""
-
-        # 处理剩余的部分
-        if current_segment:
-            segments.append(current_segment)
-
-        return segments
-
     @staticmethod
     def handle_chapter_title(title):
         title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)
diff --git a/apps/common/utils/split_model.py b/apps/common/utils/split_model.py
@@ -276,7 +276,7 @@ def post_handler_paragraph(content: str, limit: int):
 
 def smart_split_paragraph(content: str, limit: int):
     """
-    智能分段：在limit前找到合适的分割点（句号、回车等）
+    智能分段:在limit前找到合适的分割点(句号、回车等)
     :param content: 需要分段的文本
     :param limit: 最大字符限制
     :return: 分段后的文本列表
@@ -291,31 +291,29 @@ def smart_split_paragraph(content: str, limit: int):
         end = start + limit
 
         if end >= len(content):
-            # 剩余文本不超过限制，直接添加
+            # 剩余文本不超过限制,直接添加
             result.append(content[start:])
             break
 
         # 在limit范围内寻找最佳分割点
         best_split = end
 
-        # 优先级：句号 > 感叹号/问号 > 回车 > 分号/逗号 > 空格
+        # 优先级:句号 > 感叹号/问号 > 回车
         split_chars = [
-            ('。', -1), ('！', -1), ('？', -1),  # 句子结束符
+            ('。', 0), ('!', 0), ('?', 0),  # 句子结束符,包含在当前段
             ('\n', 0),  # 回车符
-            ('；', -1), ('，', -1),  # 标点符号
-            (' ', -1)  # 空格
         ]
 
         # 从后往前找分割点
         for i in range(end - 1, start + limit // 2, -1):  # 至少保留一半内容
             for char, offset in split_chars:
                 if content[i] == char:
-                    best_split = i + 1 + offset
+                    best_split = i + 1  # 包含分隔符在当前段
                     break
             if best_split != end:
                 break
 
-        # 如果找不到合适分割点，使用原始limit
+        # 如果找不到合适分割点,使用原始limit
         if best_split == end and end < len(content):
             best_split = end