fix(parser): separate StdMinerUParser and MinerUCloudParser implementation

cc · begoniezhao · commit 5c7f05189ed8 · 2026-01-23T10:28:38.000+08:00
diff --git a/docreader/parser/mineru_parser.py b/docreader/parser/mineru_parser.py
@@ -1,6 +1,7 @@
 import logging
 import re
-from typing import Dict
+import time
+from typing import Dict, Optional
 
 import markdownify
 import requests
@@ -24,9 +25,10 @@ class StdMinerUParser(BaseParser):
     """
 
     def __init__(
-        self,
-        enable_markdownify: bool = True,
-        **kwargs,
+            self,
+            enable_markdownify: bool = True,
+            mineru_endpoint: Optional[str] = None,  # Added: allow passing in a custom endpoint
+            **kwargs,
     ):
         """
         Initialize MinerU parser.
@@ -38,7 +40,10 @@ def __init__(
         """
         super().__init__(**kwargs)
         # Get MinerU endpoint from environment variable or parameter
-        self.minerU = CONFIG.mineru_endpoint
+        # Modified: prefer the endpoint passed as an argument; otherwise fall back to CONFIG
+        base_url = mineru_endpoint if mineru_endpoint else CONFIG.mineru_endpoint
+        self.minerU = base_url.rstrip("/") if base_url else ""
+
         self.enable_markdownify = enable_markdownify
         # Helper for processing markdown images
         self.image_helper = MarkdownImageUtil()
@@ -162,6 +167,130 @@ def parse_into_text(self, content: bytes) -> Document:
         return Document(content=text, images=images)
 
 
+# Added: new MinerUCloudParser class, which supports asynchronous task submission
+class MinerUCloudParser(StdMinerUParser):
+    """
+    MinerU Parser for REMOTE/CLOUD API (Asynchronous).
+    Uses the /submit -> /status -> /result workflow.
+    """
+
+    SUBMIT_TIMEOUT = 30  # timeout passed to the submit POST request
+    POLL_INTERVAL = 2  # seconds slept between status checks
+    MAX_WAIT_TIME = 600  # seconds of polling before TimeoutError is raised
+
+    def parse_into_text(self, content: bytes) -> Document:
+        """
+        Parse document content using Cloud MinerU API (Async/Polling).
+        """
+        if not self.enable:  # NOTE(review): `enable` is not set in this class; presumably a base class provides it - confirm
+            return Document()
+
+        logger.info(f"Parsing PDF via Cloud MinerU API (size: {len(content)} bytes)")
+
+        try:
+            # --- Step 1: Submit Task ---
+            submit_url = f"{self.minerU}/submit"
+            logger.info(f"Submitting task to {submit_url}")
+
+            response = requests.post(
+                url=submit_url,
+                files={"files": content},
+                data={
+                    "enable_formula": "true",
+                    "enable_table": "true",
+                    "layout_model": "doclayout_yolo",
+                    "backend": "pipeline",
+                },
+                timeout=self.SUBMIT_TIMEOUT,
+            )
+            response.raise_for_status()
+
+            # task_id is read from the top level first, then from under "data"
+            resp_data = response.json()
+            task_id = resp_data.get("task_id") or resp_data.get("data", {}).get("task_id")
+
+            if not task_id:
+                raise ValueError(f"No task_id in response: {resp_data}")
+
+            logger.info(f"Task submitted, ID: {task_id}, waiting for completion...")
+
+            # --- Step 2: Poll Status ---
+            start_time = time.time()
+
+            while True:  # exits via break on success, or by raising on failure/timeout
+                if time.time() - start_time > self.MAX_WAIT_TIME:
+                    raise TimeoutError(f"Task {task_id} timed out after {self.MAX_WAIT_TIME}s")
+
+                try:
+                    status_resp = requests.get(
+                        f"{self.minerU}/status/{task_id}",
+                        timeout=10
+                    )
+                    status_resp.raise_for_status()
+                    status_data = status_resp.json()
+                except requests.RequestException as e:  # request-level error: log, wait, and poll again
+                    logger.warning(f"Status check failed for {task_id}: {e}. Retrying...")
+                    time.sleep(self.POLL_INTERVAL)
+                    continue
+
+                state = status_data.get("status") or status_data.get("state")
+
+                if state in ["done", "success"]:
+                    break
+                elif state == "failed":
+                    error_msg = status_data.get("error") or "Unknown error"
+                    raise RuntimeError(f"Task {task_id} failed: {error_msg}")
+                else:  # any other state: keep waiting
+                    time.sleep(self.POLL_INTERVAL)
+
+            # --- Step 3: Get Result ---
+            result_resp = requests.get(
+                f"{self.minerU}/result/{task_id}",
+                timeout=30
+            )
+            result_resp.raise_for_status()
+            result_json = result_resp.json()
+
+            # Payload may be wrapped under "result"; otherwise the response body itself is used
+            result_data = result_json.get("result", result_json)
+
+            md_content = result_data.get("md_content", "")
+            images_b64 = result_data.get("images", {})
+
+            # Image handling and Markdown conversion; the steps are written out inline below rather than delegated to a parent-class method
+
+            # Convert HTML tables
+            if self.enable_markdownify:
+                md_content = markdownify.markdownify(md_content)
+
+            images = {}
+            image_replace = {}
+
+            for ipath, b64_str in images_b64.items():
+                if f"images/{ipath}" not in md_content:
+                    continue  # skip images the Markdown never references
+                match = self.base64_pattern.match(b64_str)
+                if match:  # group(1) is used as the file extension, group(2) as the base64 payload
+                    file_ext = match.group(1)
+                    b64_str_clean = match.group(2)
+                    image_bytes = endecode.encode_image(b64_str_clean, errors="ignore")
+                    if not image_bytes: continue
+
+                    if self.storage:  # when storage is falsy, the image is skipped and its path is not rewritten
+                        image_url = self.storage.upload_bytes(image_bytes, file_ext=f".{file_ext}")
+                        images[image_url] = b64_str_clean
+                        image_replace[f"images/{ipath}"] = image_url
+
+            if image_replace:
+                md_content = self.image_helper.replace_path(md_content, image_replace)
+
+            return Document(content=md_content, images=images)
+
+        except Exception as e:  # best-effort: any failure is logged and an empty Document is returned
+            logger.error(f"Cloud MinerU parsing failed: {e}", exc_info=True)
+            return Document()
+
+
 class MinerUParser(PipelineParser):
     """
     MinerU Parser with pipeline processing.
@@ -181,13 +310,20 @@ class MinerUParser(PipelineParser):
 
     # Configure your file path and MinerU endpoint
     your_file = "/path/to/your/file.pdf"
-    os.environ["MINERU_ENDPOINT"] = "http://host.docker.internal:9987"
+
+    # Added: switched to localhost for easier testing
+    test_endpoint = "http://localhost:9987"
+    os.environ["MINERU_ENDPOINT"] = test_endpoint
 
     # Create parser instance
-    parser = MinerUParser()
+    # Modified: pass in the endpoint
+    parser = MinerUParser(mineru_endpoint=test_endpoint)
 
     # Parse PDF file
-    with open(your_file, "rb") as f:
-        content = f.read()
-        document = parser.parse_into_text(content)
-        logger.error(document.content)
+    if os.path.exists(your_file):
+        with open(your_file, "rb") as f:
+            content = f.read()
+            document = parser.parse_into_text(content)
+            logger.error(document.content)
+    else:
+        print(f"File not found: {your_file}")