@@ -26,6 +26,7 @@ def __init__(self, *args, **kwargs):
2626 self .server_url = "http://datamate-mineru:8000"
2727 self .backend = "vlm-http-client"
2828 self .output_dir = "/dataset/outputs"
29+ self .max_retries = 3
2930
3031 def execute (self , sample : Dict [str , Any ]) -> Dict [str , Any ]:
3132 start = time .time ()
@@ -51,16 +52,29 @@ async def async_process_file(self, sample):
5152 content = ""
5253 for page in range (0 , total_page , 10 ):
5354 logger .info (f"fileName: { filename } , total_page: { total_page } , page: { page } ." )
54- await aio_do_parse (
55- output_dir = self .output_dir ,
56- pdf_file_names = [filename_without_ext ],
57- pdf_bytes_list = [pdf_bytes ],
58- p_lang_list = ["ch" ],
59- backend = self .backend ,
60- server_url = self .server_url ,
61- start_page_id = page ,
62- end_page_id = min (page + 9 , total_page - 1 ),
63- )
55+ for attempt in range (self .max_retries ):
56+ try :
57+ await aio_do_parse (
58+ output_dir = self .output_dir ,
59+ pdf_file_names = [filename_without_ext ],
60+ pdf_bytes_list = [pdf_bytes ],
61+ p_lang_list = ["ch" ],
62+ backend = self .backend ,
63+ server_url = self .server_url ,
64+ start_page_id = page ,
65+ end_page_id = min (page + 9 , total_page - 1 ),
66+ )
67+ break # 成功则跳出重试循环
68+ except Exception as e :
69+ logger .warning (
70+ f"Extract { filename } [{ page } -{ page + 9 } ] failed (attempt { attempt + 1 } /{ self .max_retries } ). "
71+ f"Error: { e } . Retrying in 5s..."
72+ )
73+ if attempt < self .max_retries - 1 :
74+ await asyncio .sleep (5 )
75+ else :
76+ logger .error (f"aio_do_parse failed after { self .max_retries } attempts." )
77+ raise # 耗尽次数后抛出异常,交给上层 execute 处理
6478 if os .path .exists (parse_dir ):
6579 content += get_infer_result (".md" , filename_without_ext , parse_dir )
6680 shutil .rmtree (parse_dir )
0 commit comments