Skip to content

Commit 5c7f051

Browse files
ccbegoniezhao
authored and committed
fix(parser): separate StdMinerUParser and MinerUCloudParser implementation
1 parent fe6f84b commit 5c7f051

File tree

1 file changed

+147
-11
lines changed

1 file changed

+147
-11
lines changed

docreader/parser/mineru_parser.py

Lines changed: 147 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import logging
22
import re
3-
from typing import Dict
3+
import time
4+
from typing import Dict, Optional
45

56
import markdownify
67
import requests
@@ -24,9 +25,10 @@ class StdMinerUParser(BaseParser):
2425
"""
2526

2627
def __init__(
27-
self,
28-
enable_markdownify: bool = True,
29-
**kwargs,
28+
self,
29+
enable_markdownify: bool = True,
30+
mineru_endpoint: Optional[str] = None, # Added: 支持传入自定义 endpoint
31+
**kwargs,
3032
):
3133
"""
3234
Initialize MinerU parser.
@@ -38,7 +40,10 @@ def __init__(
3840
"""
3941
super().__init__(**kwargs)
4042
# Get MinerU endpoint from environment variable or parameter
41-
self.minerU = CONFIG.mineru_endpoint
43+
# Modified: prefer the endpoint passed as an argument; otherwise fall back to CONFIG
44+
base_url = mineru_endpoint if mineru_endpoint else CONFIG.mineru_endpoint
45+
self.minerU = base_url.rstrip("/") if base_url else ""
46+
4247
self.enable_markdownify = enable_markdownify
4348
# Helper for processing markdown images
4449
self.image_helper = MarkdownImageUtil()
@@ -162,6 +167,130 @@ def parse_into_text(self, content: bytes) -> Document:
162167
return Document(content=text, images=images)
163168

164169

170+
# Added: new MinerUCloudParser class that supports asynchronous task submission
171+
class MinerUCloudParser(StdMinerUParser):
    """
    MinerU parser for a remote/cloud API that processes documents asynchronously.

    Uses the /submit -> /status -> /result workflow: the document is uploaded
    to ``{endpoint}/submit``, ``{endpoint}/status/{task_id}`` is polled until
    the task finishes, and the Markdown plus images are then fetched from
    ``{endpoint}/result/{task_id}``.
    """

    # Timeout (seconds) of the upload request to /submit.
    SUBMIT_TIMEOUT = 30
    # Pause (seconds) between two consecutive /status checks.
    POLL_INTERVAL = 2
    # Upper bound (seconds) on the total time spent polling /status.
    MAX_WAIT_TIME = 600
    # Timeout (seconds) of a single /status request.
    STATUS_TIMEOUT = 10
    # Timeout (seconds) of the /result request.
    RESULT_TIMEOUT = 30

    def parse_into_text(self, content: bytes) -> Document:
        """
        Parse document content using the cloud MinerU API (submit, then poll).

        Args:
            content: Raw bytes of the document to parse.

        Returns:
            A Document holding the Markdown text and a mapping of uploaded
            image URL -> base64 payload. An empty Document is returned when
            the parser is disabled or when any step of the workflow fails
            (the failure is logged, never raised to the caller).
        """
        if not self.enable:
            return Document()

        logger.info(f"Parsing PDF via Cloud MinerU API (size: {len(content)} bytes)")

        try:
            task_id = self._submit_task(content)
            logger.info(f"Task submitted, ID: {task_id}, waiting for completion...")

            self._wait_for_task(task_id)
            md_content, images_b64 = self._fetch_result(task_id)

            # Convert HTML tables (and any other embedded HTML) to Markdown.
            if self.enable_markdownify:
                md_content = markdownify.markdownify(md_content)

            md_content, images = self._upload_images(md_content, images_b64)
            return Document(content=md_content, images=images)

        except Exception as e:
            # Top-level boundary: parsing is best-effort, so log with the
            # traceback and hand back an empty document instead of raising.
            logger.error(f"Cloud MinerU parsing failed: {e}", exc_info=True)
            return Document()

    def _submit_task(self, content: bytes):
        """Upload the document to /submit and return the task id from the reply.

        Raises:
            requests.RequestException: On transport errors or a non-2xx status.
            ValueError: If the reply carries no task id.
        """
        submit_url = f"{self.minerU}/submit"
        logger.info(f"Submitting task to {submit_url}")

        response = requests.post(
            url=submit_url,
            files={"files": content},
            data={
                "enable_formula": "true",
                "enable_table": "true",
                "layout_model": "doclayout_yolo",
                "backend": "pipeline",
            },
            timeout=self.SUBMIT_TIMEOUT,
        )
        response.raise_for_status()

        resp_data = response.json()
        # The id is accepted either at the top level or nested under "data".
        # "data" may be present but null, so check its type before reading it.
        task_id = resp_data.get("task_id")
        if not task_id:
            nested = resp_data.get("data")
            if isinstance(nested, dict):
                task_id = nested.get("task_id")

        if not task_id:
            raise ValueError(f"No task_id in response: {resp_data}")
        return task_id

    def _wait_for_task(self, task_id) -> None:
        """Poll /status/{task_id} until the task reports completion.

        Failed status requests are logged and retried after POLL_INTERVAL.

        Raises:
            TimeoutError: If the task is not finished within MAX_WAIT_TIME.
            RuntimeError: If the service reports the task as "failed".
        """
        status_url = f"{self.minerU}/status/{task_id}"
        # Monotonic clock: the deadline must not move when the system
        # wall-clock is adjusted (NTP, DST, manual change).
        deadline = time.monotonic() + self.MAX_WAIT_TIME

        while True:
            if time.monotonic() > deadline:
                raise TimeoutError(f"Task {task_id} timed out after {self.MAX_WAIT_TIME}s")

            try:
                status_resp = requests.get(status_url, timeout=self.STATUS_TIMEOUT)
                status_resp.raise_for_status()
                status_data = status_resp.json()
            except requests.RequestException as e:
                logger.warning(f"Status check failed for {task_id}: {e}. Retrying...")
                time.sleep(self.POLL_INTERVAL)
                continue

            # The state is accepted under either the "status" or "state" key.
            state = status_data.get("status") or status_data.get("state")

            if state in ("done", "success"):
                return
            if state == "failed":
                error_msg = status_data.get("error") or "Unknown error"
                raise RuntimeError(f"Task {task_id} failed: {error_msg}")

            # Any other state is treated as "still running".
            time.sleep(self.POLL_INTERVAL)

    def _fetch_result(self, task_id):
        """Fetch /result/{task_id} and return ``(md_content, images_b64)``.

        Missing or null fields degrade to an empty string / empty dict so
        that a partially filled reply still yields a usable document.

        Raises:
            requests.RequestException: On transport errors or a non-2xx status.
        """
        result_resp = requests.get(
            f"{self.minerU}/result/{task_id}",
            timeout=self.RESULT_TIMEOUT,
        )
        result_resp.raise_for_status()
        result_json = result_resp.json()

        # The payload is either wrapped in a "result" object or is the
        # top-level object itself.
        result_data = result_json.get("result")
        if not isinstance(result_data, dict):
            result_data = result_json

        md_content = result_data.get("md_content") or ""
        images_b64 = result_data.get("images") or {}
        return md_content, images_b64

    def _upload_images(self, md_content: str, images_b64: dict):
        """Upload images referenced by the Markdown and rewrite their paths.

        Args:
            md_content: Markdown text that refers to images as ``images/<key>``.
            images_b64: Mapping of image key -> encoded image string; each
                value is matched against ``self.base64_pattern``, whose
                group 1 is used as the file extension and group 2 as the
                base64 payload.

        Returns:
            ``(md_content, images)`` where ``images`` maps each uploaded
            image URL to its base64 payload.
        """
        images = {}
        image_replace = {}

        # Without a storage backend nothing can be uploaded, so the Markdown
        # keeps its original image paths.
        if not self.storage:
            return md_content, images

        for ipath, b64_str in images_b64.items():
            md_ref = f"images/{ipath}"
            # Skip images the Markdown never references.
            if md_ref not in md_content:
                continue

            match = self.base64_pattern.match(b64_str)
            if not match:
                continue

            file_ext = match.group(1)
            b64_str_clean = match.group(2)
            image_bytes = endecode.encode_image(b64_str_clean, errors="ignore")
            if not image_bytes:
                continue

            image_url = self.storage.upload_bytes(image_bytes, file_ext=f".{file_ext}")
            images[image_url] = b64_str_clean
            image_replace[md_ref] = image_url

        if image_replace:
            md_content = self.image_helper.replace_path(md_content, image_replace)

        return md_content, images
292+
293+
165294
class MinerUParser(PipelineParser):
166295
"""
167296
MinerU Parser with pipeline processing.
@@ -181,13 +310,20 @@ class MinerUParser(PipelineParser):
181310

182311
# Configure your file path and MinerU endpoint
183312
your_file = "/path/to/your/file.pdf"
184-
os.environ["MINERU_ENDPOINT"] = "http://host.docker.internal:9987"
313+
314+
# Added: switched to localhost to make local testing easier
315+
test_endpoint = "http://localhost:9987"
316+
os.environ["MINERU_ENDPOINT"] = test_endpoint
185317

186318
# Create parser instance
187-
parser = MinerUParser()
319+
# Modified: pass the endpoint in explicitly
320+
parser = MinerUParser(mineru_endpoint=test_endpoint)
188321

189322
# Parse PDF file
190-
with open(your_file, "rb") as f:
191-
content = f.read()
192-
document = parser.parse_into_text(content)
193-
logger.error(document.content)
323+
if os.path.exists(your_file):
324+
with open(your_file, "rb") as f:
325+
content = f.read()
326+
document = parser.parse_into_text(content)
327+
logger.error(document.content)
328+
else:
329+
print(f"File not found: {your_file}")

0 commit comments

Comments
 (0)