refactor: PDF分段强制按字数限制

liuruibin · liuruibin · commit 834ccaa35bdd · 2024-10-29T11:44:37.000+08:00
--bug=1047568 --user=刘瑞斌 【github#1363】pdf 文件高级分段默认分段长度为500，但生成的段落长度超过29000字符 https://www.tapd.cn/57709429/s/1600183
diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py
@@ -42,7 +42,7 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
         pdf_document = fitz.open(temp_file_path)
         try:
             # 处理有目录的pdf
-            result = self.handle_toc(pdf_document)
+            result = self.handle_toc(pdf_document, limit)
             if result is not None:
                 return {'name': file.name, 'content': result}
 
@@ -110,7 +110,7 @@ def handle_pdf_content(file, pdf_document):
         return content
 
     @staticmethod
-    def handle_toc(doc):
+    def handle_toc(doc, limit):
         # 找到目录
         toc = doc.get_toc()
         if toc is None or len(toc) == 0:
@@ -155,17 +155,16 @@ def handle_toc(doc):
                         text = text[:idx]
 
                 chapter_text += text  # 提取文本
-
+            # 限制章节内容长度
+            if 0 < limit < len(chapter_text):
+                split_text = PdfSplitHandle.split_text(chapter_text, limit)
+                for text in split_text:
+                    chapters.append({"title": chapter_title, "content": text})
+            else:
+                chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title})
             # 保存章节内容和章节标题
-            chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title})
         return chapters
 
-    @staticmethod
-    def handle_chapter_title(title):
-        title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)
-        title = re.sub(r'第[一二三四五六七八九十]章\s*', '', title)
-        return title
-
     @staticmethod
     def handle_links(doc, pattern_list, with_filter, limit):
         # 创建存储章节内容的数组
@@ -228,11 +227,14 @@ def handle_links(doc, pattern_list, with_filter, limit):
                                 text = text[:idx]
                         chapter_text += text
 
-                    # 保存章节信息
-                    chapters.append({
-                        "title": link_title,
-                        "content": chapter_text
-                    })
+                    # 限制章节内容长度
+                    if 0 < limit < len(chapter_text):
+                        split_text = PdfSplitHandle.split_text(chapter_text, limit)
+                        for text in split_text:
+                            chapters.append({"title": link_title, "content": text})
+                    else:
+                        # 保存章节信息
+                        chapters.append({"title": link_title, "content": chapter_text})
 
         # 目录中没有前言部分，手动处理
         if handle_pre_toc:
@@ -261,6 +263,35 @@ def handle_links(doc, pattern_list, with_filter, limit):
             chapters = pre_toc + chapters
         return chapters
 
+    @staticmethod
+    def split_text(text, length):
+        """Split ``text`` into consecutive segments of at most ``length`` characters.
+
+        Each full window is cut just after its last sentence terminator ('.' or
+        the full-width '。') if it has one, otherwise at exactly ``length``
+        characters. Concatenating the returned list reproduces ``text``.
+        """
+        window = max(length, 1)  # non-positive length: one character per segment
+        segments = []
+        start = 0
+        while len(text) - start >= window:
+            chunk = text[start:start + window]
+            # Position of the last terminator in this window; -1 when there is none.
+            cut = max(chunk.rfind('.'), chunk.rfind('。'))
+            if cut != -1:
+                chunk = chunk[:cut + 1]
+            segments.append(chunk)
+            start += len(chunk)
+        if start < len(text):
+            segments.append(text[start:])  # tail shorter than one full window
+        return segments
+
+    @staticmethod
+    def handle_chapter_title(title):
+        """Drop "一、"-style list markers and "第一章"-style chapter markers from ``title``."""
+        without_list_marker = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)
+        return re.sub(r'第[一二三四五六七八九十]章\s*', '', without_list_marker)
+
     def support(self, file, get_buffer):
         file_name: str = file.name.lower()
         if file_name.endswith(".pdf") or file_name.endswith(".PDF"):