|
19 | 19 |
|
20 | 20 | from common.handle.base_split_handle import BaseSplitHandle |
21 | 21 | from common.utils.logger import maxkb_logger |
22 | | -from common.utils.split_model import SplitModel |
| 22 | +from common.utils.split_model import SplitModel, smart_split_paragraph |
23 | 23 |
|
24 | 24 | default_pattern_list = [re.compile('(?<=^)# .*|(?<=\\n)# .*'), |
25 | 25 | re.compile('(?<=\\n)(?<!#)## (?!#).*|(?<=^)(?<!#)## (?!#).*'), |
@@ -183,7 +183,7 @@ def handle_toc(doc, limit): |
183 | 183 | real_chapter_title = chapter_title[:256] |
184 | 184 | # 限制章节内容长度 |
185 | 185 | if 0 < limit < len(chapter_text): |
186 | | - split_text = PdfSplitHandle.split_text(chapter_text, limit) |
| 186 | + split_text = smart_split_paragraph(chapter_text, limit) |
187 | 187 | for text in split_text: |
188 | 188 | chapters.append({"title": real_chapter_title, "content": text}) |
189 | 189 | else: |
@@ -262,7 +262,7 @@ def handle_links(doc, pattern_list, with_filter, limit): |
262 | 262 |
|
263 | 263 | # 限制章节内容长度 |
264 | 264 | if 0 < limit < len(chapter_text): |
265 | | - split_text = PdfSplitHandle.split_text(chapter_text, limit) |
| 265 | + split_text = smart_split_paragraph(chapter_text, limit) |
266 | 266 | for text in split_text: |
267 | 267 | chapters.append({"title": link_title, "content": text}) |
268 | 268 | else: |
@@ -296,29 +296,6 @@ def handle_links(doc, pattern_list, with_filter, limit): |
296 | 296 | chapters = pre_toc + chapters |
297 | 297 | return chapters |
298 | 298 |
|
299 | | - @staticmethod |
300 | | - def split_text(text, length): |
301 | | - segments = [] |
302 | | - current_segment = "" |
303 | | - |
304 | | - for char in text: |
305 | | - current_segment += char |
306 | | - if len(current_segment) >= length: |
307 | | - # 查找最近的句号 |
308 | | - last_period_index = current_segment.rfind('.') |
309 | | - if last_period_index != -1: |
310 | | - segments.append(current_segment[:last_period_index + 1]) |
311 | | - current_segment = current_segment[last_period_index + 1:] # 更新当前段落 |
312 | | - else: |
313 | | - segments.append(current_segment) |
314 | | - current_segment = "" |
315 | | - |
316 | | - # 处理剩余的部分 |
317 | | - if current_segment: |
318 | | - segments.append(current_segment) |
319 | | - |
320 | | - return segments |
321 | | - |
322 | 299 | @staticmethod |
323 | 300 | def handle_chapter_title(title): |
324 | 301 | title = re.sub(r'[一二三四五六七八九十\s*]、\s*', '', title) |
|
0 commit comments