@@ -91,42 +91,59 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
9191
@staticmethod
def handle_pdf_content(file, pdf_document):
    """Convert the text layer of a PDF into Markdown-flavoured plain text.

    Every text line is emitted on its own line.  Lines whose font size is
    clearly larger than the document's body font become ``##`` headings,
    slightly larger ones become ``###`` headings.  Image blocks are replaced
    by a blank line.  NUL characters are stripped from the result.

    Args:
        file: Uploaded file object; only ``file.name`` is read (for logging).
        pdf_document: Opened document supporting ``len()`` and
            ``load_page(index)``, whose pages provide ``get_text("dict")``.

    Returns:
        The extracted content as a single string (empty if the document has
        no text or image blocks).
    """
    # Local import so the method does not depend on a module-level import.
    from collections import Counter

    default_body_size = 12  # Fallback when the document has no visible text.
    heading_delta = 2       # More than this above body size -> "##".
    subheading_delta = 0.5  # More than this above body size -> "###".

    # Single pass over the document: each page is parsed once, and only the
    # light-weight (text, size) pairs are kept.  An entry with text None
    # marks an image block.
    entries = []
    size_weights = Counter()  # font size -> number of visible characters

    for page_num in range(len(pdf_document)):
        start_time = time.time()
        page = pdf_document.load_page(page_num)

        for block in page.get_text("dict")["blocks"]:
            block_type = block.get("type")
            if block_type == 1:  # image block
                entries.append((None, None))
                continue
            if block_type != 0:  # not a text block
                continue

            for line in block.get("lines", []):
                spans = line.get("spans")
                if not spans:
                    continue

                # Weight sizes by visible characters so that a leading
                # bullet, drop cap or stray whitespace span cannot decide
                # the size of the whole line.  Sizes are rounded so float
                # jitter (11.99 vs 12.0) does not split the histogram.
                line_weights = Counter()
                for span in spans:
                    visible = len(span["text"].strip())
                    if span["size"] > 0 and visible:
                        line_weights[round(span["size"], 1)] += visible

                size_weights.update(line_weights)
                line_size = (
                    line_weights.most_common(1)[0][0] if line_weights else None
                )
                text = "".join(span["text"] for span in spans)
                entries.append((text, line_size))

        elapsed_time = time.time() - start_time
        maxkb_logger.debug(
            f"File: {file.name}, Page: {page_num + 1}, Time: {elapsed_time:.3f}s")

    # Body font size = the size that covers the most visible characters.
    if size_weights:
        body_font_size = size_weights.most_common(1)[0][0]
    else:
        body_font_size = default_body_size

    parts = []
    for text, line_size in entries:
        if text is None:
            parts.append("\n\n")
            continue

        # Lines without visible text (line_size is None) are never headings,
        # which avoids emitting empty "## " markers.
        size_diff = 0 if line_size is None else line_size - body_font_size

        if size_diff > heading_delta:
            parts.append(f"## {text}\n\n")
        elif size_diff > subheading_delta:
            parts.append(f"### {text}\n\n")
        else:
            parts.append(f"{text}\n")

    # Null characters are not allowed; strip them once on the final string
    # instead of re-scanning the accumulated content after every page.
    return "".join(parts).replace('\0', '')
132149
0 commit comments