Skip to content

Commit d147b79

Browse files
committed
chore: replace split_text with smart_split_paragraph in pdf_split_handle.py
1 parent 8da6f74 commit d147b79

File tree

2 files changed

+9
-34
lines changed

2 files changed

+9
-34
lines changed

apps/common/handle/impl/text/pdf_split_handle.py

Lines changed: 3 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919

2020
from common.handle.base_split_handle import BaseSplitHandle
2121
from common.utils.logger import maxkb_logger
22-
from common.utils.split_model import SplitModel
22+
from common.utils.split_model import SplitModel, smart_split_paragraph
2323

2424
default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'),
2525
re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'),
@@ -183,7 +183,7 @@ def handle_toc(doc, limit):
183183
real_chapter_title = chapter_title[:256]
184184
# 限制章节内容长度
185185
if 0 < limit < len(chapter_text):
186-
split_text = PdfSplitHandle.split_text(chapter_text, limit)
186+
split_text = smart_split_paragraph(chapter_text, limit)
187187
for text in split_text:
188188
chapters.append({"title": real_chapter_title, "content": text})
189189
else:
@@ -262,7 +262,7 @@ def handle_links(doc, pattern_list, with_filter, limit):
262262

263263
# 限制章节内容长度
264264
if 0 < limit < len(chapter_text):
265-
split_text = PdfSplitHandle.split_text(chapter_text, limit)
265+
split_text = smart_split_paragraph(chapter_text, limit)
266266
for text in split_text:
267267
chapters.append({"title": link_title, "content": text})
268268
else:
@@ -296,29 +296,6 @@ def handle_links(doc, pattern_list, with_filter, limit):
296296
chapters = pre_toc + chapters
297297
return chapters
298298

299-
@staticmethod
300-
def split_text(text, length):
301-
segments = []
302-
current_segment = ""
303-
304-
for char in text:
305-
current_segment += char
306-
if len(current_segment) >= length:
307-
# 查找最近的句号
308-
last_period_index = current_segment.rfind('.')
309-
if last_period_index != -1:
310-
segments.append(current_segment[:last_period_index + 1])
311-
current_segment = current_segment[last_period_index + 1:] # 更新当前段落
312-
else:
313-
segments.append(current_segment)
314-
current_segment = ""
315-
316-
# 处理剩余的部分
317-
if current_segment:
318-
segments.append(current_segment)
319-
320-
return segments
321-
322299
@staticmethod
323300
def handle_chapter_title(title):
324301
title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title)

apps/common/utils/split_model.py

Lines changed: 6 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -276,7 +276,7 @@ def post_handler_paragraph(content: str, limit: int):
276276

277277
def smart_split_paragraph(content: str, limit: int):
278278
"""
279-
智能分段在limit前找到合适的分割点句号、回车等
279+
智能分段:在limit前找到合适的分割点(句号、回车等)
280280
:param content: 需要分段的文本
281281
:param limit: 最大字符限制
282282
:return: 分段后的文本列表
@@ -291,31 +291,29 @@ def smart_split_paragraph(content: str, limit: int):
291291
end = start + limit
292292

293293
if end >= len(content):
294-
# 剩余文本不超过限制直接添加
294+
# 剩余文本不超过限制,直接添加
295295
result.append(content[start:])
296296
break
297297

298298
# 在limit范围内寻找最佳分割点
299299
best_split = end
300300

301-
# 优先级句号 > 感叹号/问号 > 回车 > 分号/逗号 > 空格
301+
# 优先级:句号 > 感叹号/问号 > 回车
302302
split_chars = [
303-
('。', -1), ('', -1), ('', -1), # 句子结束符
303+
('。', 0), ('!', 0), ('?', 0), # 句子结束符,包含在当前段
304304
('\n', 0), # 回车符
305-
(';', -1), (',', -1), # 标点符号
306-
(' ', -1) # 空格
307305
]
308306

309307
# 从后往前找分割点
310308
for i in range(end - 1, start + limit // 2, -1): # 至少保留一半内容
311309
for char, offset in split_chars:
312310
if content[i] == char:
313-
best_split = i + 1 + offset
311+
best_split = i + 1 # 包含分隔符在当前段
314312
break
315313
if best_split != end:
316314
break
317315

318-
# 如果找不到合适分割点使用原始limit
316+
# 如果找不到合适分割点,使用原始limit
319317
if best_split == end and end < len(content):
320318
best_split = end
321319

0 commit comments

Comments
 (0)