Skip to content

Commit 834ccaa

Browse files
committed
refactor: PDF分段强制按字数限制
--bug=1047568 --user=刘瑞斌 【github#1363】pdf 文件高级分段默认分段长度为500,但生成的段落长度超过29000字符 https://www.tapd.cn/57709429/s/1600183
1 parent 2cb8d26 commit 834ccaa

File tree

1 file changed

+46
-15
lines changed

1 file changed

+46
-15
lines changed

apps/common/handle/impl/pdf_split_handle.py

Lines changed: 46 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
4242
pdf_document = fitz.open(temp_file_path)
4343
try:
4444
# 处理有目录的pdf
45-
result = self.handle_toc(pdf_document)
45+
result = self.handle_toc(pdf_document, limit)
4646
if result is not None:
4747
return {'name': file.name, 'content': result}
4848

@@ -110,7 +110,7 @@ def handle_pdf_content(file, pdf_document):
110110
return content
111111

112112
@staticmethod
113-
def handle_toc(doc):
113+
def handle_toc(doc, limit):
114114
# 找到目录
115115
toc = doc.get_toc()
116116
if toc is None or len(toc) == 0:
@@ -155,17 +155,16 @@ def handle_toc(doc):
155155
text = text[:idx]
156156

157157
chapter_text += text # 提取文本
158-
158+
# 限制章节内容长度
159+
if 0 < limit < len(chapter_text):
160+
split_text = PdfSplitHandle.split_text(chapter_text, limit)
161+
for text in split_text:
162+
chapters.append({"title": chapter_title, "content": text})
163+
else:
164+
chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title})
159165
# 保存章节内容和章节标题
160-
chapters.append({"title": chapter_title, "content": chapter_text if chapter_text else chapter_title})
161166
return chapters
162167

163-
@staticmethod
164-
def handle_chapter_title(title):
165-
title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)
166-
title = re.sub(r'第[一二三四五六七八九十]章\s*', '', title)
167-
return title
168-
169168
@staticmethod
170169
def handle_links(doc, pattern_list, with_filter, limit):
171170
# 创建存储章节内容的数组
@@ -228,11 +227,14 @@ def handle_links(doc, pattern_list, with_filter, limit):
228227
text = text[:idx]
229228
chapter_text += text
230229

231-
# 保存章节信息
232-
chapters.append({
233-
"title": link_title,
234-
"content": chapter_text
235-
})
230+
# 限制章节内容长度
231+
if 0 < limit < len(chapter_text):
232+
split_text = PdfSplitHandle.split_text(chapter_text, limit)
233+
for text in split_text:
234+
chapters.append({"title": link_title, "content": text})
235+
else:
236+
# 保存章节信息
237+
chapters.append({"title": link_title, "content": chapter_text})
236238

237239
# 目录中没有前言部分,手动处理
238240
if handle_pre_toc:
@@ -261,6 +263,35 @@ def handle_links(doc, pattern_list, with_filter, limit):
261263
chapters = pre_toc + chapters
262264
return chapters
263265

266+
@staticmethod
267+
def split_text(text, length):
    """Split *text* into consecutive segments of at most *length* characters.

    Each full-size window of ``length`` characters is cut just after the
    last sentence-ending period found inside it (ASCII ``.`` or full-width
    ``。``); a window that contains no period is emitted whole. Whatever is
    left over after the last full window becomes the final segment.
    Concatenating the returned segments always reproduces *text* exactly.

    :param text: the text to split.
    :param length: maximum number of characters per segment. A value of
        zero or less means "no limit", matching how the callers treat
        ``limit``, and returns the text unsplit.
    :return: list of non-empty string segments (empty list for empty text).
    """
    if not text:
        return []
    if length <= 0:
        # Callers only split when 0 < limit; treat a non-positive limit as
        # "do not split" instead of degenerating into one-char segments.
        return [text]

    # Sentence terminators: ASCII period plus the CJK full-width period,
    # since PDF content handled here is frequently Chinese.
    sentence_ends = ('.', '。')

    segments = []
    total = len(text)
    start = 0
    # Work on index windows instead of growing a string char by char.
    while total - start >= length:
        window_end = start + length
        # rfind returns an absolute index (or -1) within [start, window_end).
        cut = max(text.rfind(mark, start, window_end) for mark in sentence_ends)
        end = cut + 1 if cut != -1 else window_end
        segments.append(text[start:end])
        # end > start always holds, so the loop is guaranteed to advance.
        start = end

    # Handle the remaining tail that is shorter than one full window.
    if start < total:
        segments.append(text[start:])

    return segments
288+
289+
@staticmethod
290+
def handle_chapter_title(title):
    """Strip Chinese ordinal prefixes from a chapter title.

    Two kinds of prefix are removed wherever they occur in *title*:

    * an enumeration marker such as ``一、`` or ``十一、`` (one or more
      Chinese numerals followed by the enumeration comma), and
    * a chapter marker such as ``第一章`` or ``第十一章``.

    Whitespace following either marker is removed as well.

    :param title: the raw chapter title.
    :return: the title with the ordinal prefixes removed.
    """
    # Accept one or more numerals so multi-character ordinals ("十一、")
    # are stripped completely instead of leaving a stray leading numeral.
    # The single whitespace/asterisk alternative before the comma is kept
    # so inputs that matched previously still match.
    title = re.sub(r'(?:[一二三四五六七八九十]+|[\s*])、\s*', '', title)
    # Same for chapter markers: "第十一章" must match as well as "第一章".
    title = re.sub(r'第[一二三四五六七八九十]+章\s*', '', title)
    return title
294+
264295
def support(self, file, get_buffer):
265296
file_name: str = file.name.lower()
266297
if file_name.endswith(".pdf") or file_name.endswith(".PDF"):

0 commit comments

Comments
 (0)