@@ -42,7 +42,7 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
4242 pdf_document = fitz .open (temp_file_path )
4343 try :
4444 # 处理有目录的pdf
45- result = self .handle_toc (pdf_document )
45+ result = self .handle_toc (pdf_document , limit )
4646 if result is not None :
4747 return {'name' : file .name , 'content' : result }
4848
@@ -110,7 +110,7 @@ def handle_pdf_content(file, pdf_document):
110110 return content
111111
112112 @staticmethod
113- def handle_toc (doc ):
113+ def handle_toc (doc , limit ):
114114 # 找到目录
115115 toc = doc .get_toc ()
116116 if toc is None or len (toc ) == 0 :
@@ -155,17 +155,16 @@ def handle_toc(doc):
155155 text = text [:idx ]
156156
157157 chapter_text += text # 提取文本
158-
158+ # 限制章节内容长度
159+ if 0 < limit < len (chapter_text ):
160+ split_text = PdfSplitHandle .split_text (chapter_text , limit )
161+ for text in split_text :
162+ chapters .append ({"title" : chapter_title , "content" : text })
163+ else :
164+ chapters .append ({"title" : chapter_title , "content" : chapter_text if chapter_text else chapter_title })
159165 # 保存章节内容和章节标题
160- chapters .append ({"title" : chapter_title , "content" : chapter_text if chapter_text else chapter_title })
161166 return chapters
162167
163- @staticmethod
164- def handle_chapter_title (title ):
165- title = re .sub (r'[一二三四五六七八九十\s*]、\s*' , '' , title )
166- title = re .sub (r'第[一二三四五六七八九十]章\s*' , '' , title )
167- return title
168-
169168 @staticmethod
170169 def handle_links (doc , pattern_list , with_filter , limit ):
171170 # 创建存储章节内容的数组
@@ -228,11 +227,14 @@ def handle_links(doc, pattern_list, with_filter, limit):
228227 text = text [:idx ]
229228 chapter_text += text
230229
231- # 保存章节信息
232- chapters .append ({
233- "title" : link_title ,
234- "content" : chapter_text
235- })
230+ # 限制章节内容长度
231+ if 0 < limit < len (chapter_text ):
232+ split_text = PdfSplitHandle .split_text (chapter_text , limit )
233+ for text in split_text :
234+ chapters .append ({"title" : link_title , "content" : text })
235+ else :
236+ # 保存章节信息
237+ chapters .append ({"title" : link_title , "content" : chapter_text })
236238
237239 # 目录中没有前言部分,手动处理
238240 if handle_pre_toc :
@@ -261,6 +263,35 @@ def handle_links(doc, pattern_list, with_filter, limit):
261263 chapters = pre_toc + chapters
262264 return chapters
263265
266+ @staticmethod
267+ def split_text (text , length ):
268+ segments = []
269+ current_segment = ""
270+
271+ for char in text :
272+ current_segment += char
273+ if len (current_segment ) >= length :
274+ # 查找最近的句号
275+ last_period_index = current_segment .rfind ('.' )
276+ if last_period_index != - 1 :
277+ segments .append (current_segment [:last_period_index + 1 ])
278+ current_segment = current_segment [last_period_index + 1 :] # 更新当前段落
279+ else :
280+ segments .append (current_segment )
281+ current_segment = ""
282+
283+ # 处理剩余的部分
284+ if current_segment :
285+ segments .append (current_segment )
286+
287+ return segments
288+
289+ @staticmethod
290+ def handle_chapter_title (title ):
291+ title = re .sub (r'[一二三四五六七八九十\s*]、\s*' , '' , title )
292+ title = re .sub (r'第[一二三四五六七八九十]章\s*' , '' , title )
293+ return title
294+
264295 def support (self , file , get_buffer ):
265296 file_name : str = file .name .lower ()
266297 if file_name .endswith (".pdf" ) or file_name .endswith (".PDF" ):
0 commit comments