11"""
2- PaddleX 文档解析器
2+ PP-StructureV3 文档解析器
33
4- 使用 PaddleX PP-StructureV3 进行文档版面解析和内容提取
4+ 使用 PP-StructureV3 进行文档版面解析和内容提取
55"""
66
77import base64
1717
1818
1919class PaddleXDocumentParser (BaseDocumentProcessor ):
20- """PaddleX 文档解析器 - 使用 PP-StructureV3 进行版面解析"""
20+ """PP-StructureV3 文档解析器 - 使用 PP-StructureV3 进行版面解析"""
2121
2222 def __init__ (self , server_url : str | None = None ):
2323 self .server_url = server_url or os .getenv ("PADDLEX_URI" ) or "http://localhost:8080"
@@ -28,7 +28,7 @@ def get_service_name(self) -> str:
2828 return "paddlex_ocr"
2929
def get_supported_extensions(self) -> list[str]:
    """Return the file extensions accepted by the PP-StructureV3 parser.

    Returns:
        A new list of extensions, each with its leading dot: PDF plus
        several image formats.
    """
    return [".pdf", ".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif"]
3333
3434 def _encode_file_to_base64 (self , file_path : str ) -> str :
@@ -64,7 +64,7 @@ def _call_layout_api(
6464 use_seal_recognition : bool = False ,
6565 ** kwargs ,
6666 ) -> dict [str , Any ]:
67- """调用PaddleX版面解析API """
67+ """调用PP-StructureV3版面解析API """
6868 # 处理文件输入
6969 processed_file_input = self ._process_file_input (file_input )
7070 payload = {"file" : processed_file_input }
@@ -92,7 +92,7 @@ def _call_layout_api(
9292 if response .status_code == 200 :
9393 return response .json ()
9494 else :
95- error_msg = f"PaddleX API请求失败: { response .status_code } "
95+ error_msg = f"PP-StructureV3 API请求失败: { response .status_code } "
9696 try :
9797 error_result = response .json ()
9898 raise DocumentParserException (f"{ error_msg } : { error_result } " , self .get_service_name (), "api_error" )
@@ -157,45 +157,45 @@ def _parse_api_result(self, api_result: dict[str, Any], file_path: str) -> dict[
157157 return parsed_result
158158
def check_health(self) -> dict:
    """Check whether the PP-StructureV3 service is healthy.

    Sends ``GET {self.base_url}/health`` with a 5-second timeout and turns
    the outcome into a status dictionary. Exceptions derived from
    ``Exception`` are not propagated; they are reported through the
    returned dictionary instead.

    Returns:
        A dict with the keys ``status``, ``message`` and ``details``.
        ``status`` is one of:

        * ``"healthy"``     - the endpoint answered with HTTP 200
        * ``"unhealthy"``   - the endpoint answered with any other code
        * ``"unavailable"`` - ``requests.exceptions.ConnectionError``
        * ``"timeout"``     - ``requests.exceptions.Timeout``
        * ``"error"``       - any other exception

        ``details`` always contains ``server_url``; in the ``"error"`` case
        it also contains ``error`` with the exception text.
    """
    # NOTE(review): the request goes to self.base_url while the report shows
    # self.server_url; base_url is not assigned in the visible part of the
    # file - presumably derived from server_url, confirm.
    try:
        response = requests.get(f"{self.base_url}/health", timeout=5)

        if response.status_code == 200:
            return {
                "status": "healthy",
                "message": "PP-StructureV3 服务运行正常",
                "details": {"server_url": self.server_url},
            }
        else:
            return {
                "status": "unhealthy",
                "message": f"PP-StructureV3 服务响应异常: {response.status_code}",
                "details": {"server_url": self.server_url},
            }

    except requests.exceptions.ConnectionError:
        # requests' ConnectTimeout derives from both ConnectionError and
        # Timeout, so a connect-phase timeout is classified here as
        # "unavailable"; the Timeout branch below only sees the remaining
        # timeout types (e.g. read timeouts).
        return {
            "status": "unavailable",
            "message": "PP-StructureV3 服务无法连接,请检查服务是否启动",
            "details": {"server_url": self.server_url},
        }
    except requests.exceptions.Timeout:
        return {
            "status": "timeout",
            "message": "PP-StructureV3 服务连接超时",
            "details": {"server_url": self.server_url},
        }
    except Exception as e:
        # Deliberate catch-all: a health probe reports failure, it never raises.
        return {
            "status": "error",
            "message": f"PP-StructureV3 健康检查失败: {str(e)}",
            "details": {"server_url": self.server_url, "error": str(e)},
        }
195195
196196 def process_file (self , file_path : str , params : dict | None = None ) -> str :
197197 """
198- 使用 PaddleX 处理文档
198+ 使用 PP-StructureV3 处理文档
199199
200200 Args:
201201 file_path: 文件路径
@@ -220,7 +220,7 @@ def process_file(self, file_path: str, params: dict | None = None) -> str:
220220 health = self .check_health ()
221221 if health ["status" ] != "healthy" :
222222 raise DocumentParserException (
223- f"PaddleX 服务不可用: { health ['message' ]} " , self .get_service_name (), health ["status" ]
223+ f"PP-StructureV3 服务不可用: { health ['message' ]} " , self .get_service_name (), health ["status" ]
224224 )
225225
226226 try :
@@ -230,7 +230,7 @@ def process_file(self, file_path: str, params: dict | None = None) -> str:
230230 # 判断文件类型
231231 file_type = 0 if file_ext == ".pdf" else 1
232232
233- logger .info (f"PaddleX 开始处理: { os .path .basename (file_path )} " )
233+ logger .info (f"PP-StructureV3 开始处理: { os .path .basename (file_path )} " )
234234
235235 # 调用API
236236 api_result = self ._call_layout_api (
@@ -244,15 +244,15 @@ def process_file(self, file_path: str, params: dict | None = None) -> str:
244244 # 检查API调用是否成功
245245 if api_result .get ("errorCode" ) != 0 :
246246 raise DocumentParserException (
247- f"PaddleX API错误: { api_result .get ('errorMsg' , '未知错误' )} " , self .get_service_name (), "api_error"
247+ f"PP-StructureV3 API错误: { api_result .get ('errorMsg' , '未知错误' )} " , self .get_service_name (), "api_error"
248248 )
249249
250250 # 解析结果
251251 result = self ._parse_api_result (api_result , file_path )
252252 text = result .get ("full_text" , "" )
253253
254254 processing_time = time .time () - start_time
255- logger .info (f"PaddleX 处理成功: { os .path .basename (file_path )} - { len (text )} 字符 ({ processing_time :.2f} s)" )
255+ logger .info (f"PP-StructureV3 处理成功: { os .path .basename (file_path )} - { len (text )} 字符 ({ processing_time :.2f} s)" )
256256
257257 # 记录统计信息
258258 summary = result .get ("summary" , {})
@@ -265,6 +265,6 @@ def process_file(self, file_path: str, params: dict | None = None) -> str:
265265 raise
266266 except Exception as e :
267267 processing_time = time .time () - start_time
268- error_msg = f"PaddleX 处理失败: { str (e )} "
268+ error_msg = f"PP-StructureV3 处理失败: { str (e )} "
269269 logger .error (f"{ error_msg } ({ processing_time :.2f} s)" )
270270 raise DocumentParserException (error_msg , self .get_service_name (), "processing_failed" )
0 commit comments