11import logging
22import re
3- from typing import Dict
3+ import time
4+ from typing import Dict , Optional
45
56import markdownify
67import requests
@@ -24,9 +25,10 @@ class StdMinerUParser(BaseParser):
2425 """
2526
2627 def __init__ (
27- self ,
28- enable_markdownify : bool = True ,
29- ** kwargs ,
28+ self ,
29+ enable_markdownify : bool = True ,
30+ mineru_endpoint : Optional [str ] = None , # Added: 支持传入自定义 endpoint
31+ ** kwargs ,
3032 ):
3133 """
3234 Initialize MinerU parser.
@@ -38,7 +40,10 @@ def __init__(
3840 """
3941 super ().__init__ (** kwargs )
4042 # Get MinerU endpoint from environment variable or parameter
41- self .minerU = CONFIG .mineru_endpoint
43+ # Modified: prefer the endpoint passed as an argument; otherwise fall back to CONFIG
44+ base_url = mineru_endpoint if mineru_endpoint else CONFIG .mineru_endpoint
45+ self .minerU = base_url .rstrip ("/" ) if base_url else ""
46+
4247 self .enable_markdownify = enable_markdownify
4348 # Helper for processing markdown images
4449 self .image_helper = MarkdownImageUtil ()
@@ -162,6 +167,130 @@ def parse_into_text(self, content: bytes) -> Document:
162167 return Document (content = text , images = images )
163168
164169
170+ # Added: new MinerUCloudParser class that supports asynchronous task submission
class MinerUCloudParser(StdMinerUParser):
    """
    MinerU Parser for REMOTE/CLOUD API (Asynchronous).

    Uses the /submit -> /status -> /result workflow: the document is posted
    to ``/submit``, the returned task id is polled on ``/status/<task_id>``
    until the task reports ``done``/``success`` (or ``failed``), and the
    parsed markdown plus images are then fetched from ``/result/<task_id>``.

    Class attributes (all in seconds):
        SUBMIT_TIMEOUT: HTTP timeout for the submit request.
        POLL_INTERVAL: Pause between two status checks.
        MAX_WAIT_TIME: Upper bound on the total time spent polling.
        STATUS_TIMEOUT: HTTP timeout for a single status request.
        RESULT_TIMEOUT: HTTP timeout for the result request.

    NOTE(review): ``self.enable``, ``self.storage`` and
    ``self.base64_pattern`` are read here but assigned outside this class;
    presumably by a parent class -- confirm.
    """

    SUBMIT_TIMEOUT = 30
    POLL_INTERVAL = 2
    MAX_WAIT_TIME = 600
    STATUS_TIMEOUT = 10
    RESULT_TIMEOUT = 30

    def parse_into_text(self, content: bytes) -> Document:
        """
        Parse document content using Cloud MinerU API (Async/Polling).

        Args:
            content: Raw bytes of the document to parse.

        Returns:
            A Document holding the markdown text and a mapping of uploaded
            image URL -> base64 payload. An empty Document is returned when
            the parser is disabled, the endpoint is not configured, or any
            step of the workflow fails (the failure is logged, not raised).
        """
        if not self.enable:
            return Document()

        # Fail early with a clear message instead of letting the HTTP client
        # reject the schemeless URL "/submit".
        if not self.minerU:
            logger.error("Cloud MinerU parsing failed: MinerU endpoint is not configured")
            return Document()

        logger.info(f"Parsing PDF via Cloud MinerU API (size: {len(content)} bytes)")

        try:
            # --- Step 1: Submit Task ---
            task_id = self._submit_task(content)
            logger.info(f"Task submitted, ID: {task_id}, waiting for completion...")

            # --- Step 2: Poll Status ---
            self._wait_for_completion(task_id)

            # --- Step 3: Get Result ---
            md_content, images_b64 = self._fetch_result(task_id)

            # Convert HTML tables
            if self.enable_markdownify:
                md_content = markdownify.markdownify(md_content)

            images, image_replace = self._upload_images(md_content, images_b64)
            if image_replace:
                md_content = self.image_helper.replace_path(md_content, image_replace)

            return Document(content=md_content, images=images)

        except Exception as e:
            # Top-level boundary: parsing is best-effort, so every failure is
            # logged with its traceback and reported as an empty Document.
            logger.error(f"Cloud MinerU parsing failed: {e}", exc_info=True)
            return Document()

    def _submit_task(self, content: bytes) -> str:
        """Submit the document to ``/submit`` and return the task id.

        Raises:
            requests.RequestException: On transport or HTTP errors.
            ValueError: If the response carries no task id.
        """
        submit_url = f"{self.minerU}/submit"
        logger.info(f"Submitting task to {submit_url}")

        response = requests.post(
            url=submit_url,
            files={"files": content},
            data={
                "enable_formula": "true",
                "enable_table": "true",
                "layout_model": "doclayout_yolo",
                "backend": "pipeline",
            },
            timeout=self.SUBMIT_TIMEOUT,
        )
        response.raise_for_status()

        # The task id may sit at the top level or under "data". Guard each
        # level with isinstance so a null/non-dict "data" falls through to the
        # explicit ValueError below instead of an AttributeError.
        resp_data = response.json()
        task_id = None
        if isinstance(resp_data, dict):
            task_id = resp_data.get("task_id")
            if not task_id:
                nested = resp_data.get("data")
                if isinstance(nested, dict):
                    task_id = nested.get("task_id")

        if not task_id:
            raise ValueError(f"No task_id in response: {resp_data}")
        return task_id

    def _wait_for_completion(self, task_id: str) -> None:
        """Poll ``/status/<task_id>`` until the task is done.

        Raises:
            TimeoutError: If the task is not done within MAX_WAIT_TIME.
            RuntimeError: If the service reports the task as failed.
        """
        # Monotonic clock: the deadline must not move when the system clock
        # is adjusted while we are waiting.
        deadline = time.monotonic() + self.MAX_WAIT_TIME

        while True:
            if time.monotonic() > deadline:
                raise TimeoutError(f"Task {task_id} timed out after {self.MAX_WAIT_TIME}s")

            try:
                status_resp = requests.get(
                    f"{self.minerU}/status/{task_id}",
                    timeout=self.STATUS_TIMEOUT,
                )
                status_resp.raise_for_status()
                status_data = status_resp.json()
                if not isinstance(status_data, dict):
                    raise ValueError(f"Unexpected status payload: {status_data!r}")
            except (requests.RequestException, ValueError) as e:
                # Transient failures (network error, HTTP error, malformed
                # body) are retried until the overall deadline expires.
                logger.warning(f"Status check failed for {task_id}: {e}. Retrying...")
                time.sleep(self.POLL_INTERVAL)
                continue

            state = status_data.get("status") or status_data.get("state")

            if state in ("done", "success"):
                return
            if state == "failed":
                error_msg = status_data.get("error") or "Unknown error"
                raise RuntimeError(f"Task {task_id} failed: {error_msg}")

            # Any other state is treated as "still running".
            time.sleep(self.POLL_INTERVAL)

    def _fetch_result(self, task_id: str) -> tuple:
        """Fetch ``/result/<task_id>`` and return ``(md_content, images_b64)``."""
        result_resp = requests.get(
            f"{self.minerU}/result/{task_id}",
            timeout=self.RESULT_TIMEOUT,
        )
        result_resp.raise_for_status()
        result_json = result_resp.json()

        # Normalize result data: the payload may be wrapped in "result" or be
        # the top-level object itself. Null values are treated as missing.
        result_data = result_json.get("result") or result_json

        md_content = result_data.get("md_content") or ""
        images_b64 = result_data.get("images") or {}
        return md_content, images_b64

    def _upload_images(self, md_content: str, images_b64: Dict[str, str]) -> tuple:
        """Upload the images referenced by the markdown.

        Returns:
            ``(images, image_replace)`` where ``images`` maps uploaded URL ->
            base64 payload and ``image_replace`` maps the markdown path
            ``images/<name>`` -> uploaded URL.
        """
        images = {}
        image_replace = {}

        for ipath, b64_str in images_b64.items():
            md_path = f"images/{ipath}"
            # Skip images the markdown never references.
            if md_path not in md_content:
                continue

            match = self.base64_pattern.match(b64_str)
            if not match:
                continue

            file_ext = match.group(1)
            b64_str_clean = match.group(2)
            image_bytes = endecode.encode_image(b64_str_clean, errors="ignore")
            if not image_bytes:
                continue

            # Without a storage backend there is no URL to point at, so the
            # image is neither recorded nor replaced in the markdown.
            if self.storage:
                image_url = self.storage.upload_bytes(image_bytes, file_ext=f".{file_ext}")
                images[image_url] = b64_str_clean
                image_replace[md_path] = image_url

        return images, image_replace
292+
293+
165294class MinerUParser (PipelineParser ):
166295 """
167296 MinerU Parser with pipeline processing.
@@ -181,13 +310,20 @@ class MinerUParser(PipelineParser):
181310
182311 # Configure your file path and MinerU endpoint
183312 your_file = "/path/to/your/file.pdf"
184- os .environ ["MINERU_ENDPOINT" ] = "http://host.docker.internal:9987"
313+
314+ # Added: switched to localhost for easier testing
315+ test_endpoint = "http://localhost:9987"
316+ os .environ ["MINERU_ENDPOINT" ] = test_endpoint
185317
186318 # Create parser instance
187- parser = MinerUParser ()
319+ # Modified: pass the endpoint in explicitly
320+ parser = MinerUParser (mineru_endpoint = test_endpoint )
188321
189322 # Parse PDF file
190- with open (your_file , "rb" ) as f :
191- content = f .read ()
192- document = parser .parse_into_text (content )
193- logger .error (document .content )
323+ if os .path .exists (your_file ):
324+ with open (your_file , "rb" ) as f :
325+ content = f .read ()
326+ document = parser .parse_into_text (content )
327+ logger .error (document .content )
328+ else :
329+ print (f"File not found: { your_file } " )
0 commit comments