|
5 | 5 | Description: MinerU PDF文本抽取 |
6 | 6 | Create: 2025/10/29 17:24 |
7 | 7 | """ |
8 | | -import json |
| 8 | +import os |
| 9 | +import shutil |
9 | 10 | import time |
10 | | -from loguru import logger |
11 | 11 | from typing import Dict, Any |
12 | 12 |
|
13 | | -from datamate.core.base_op import Mapper |
14 | 13 | from datamate.common.utils.rest_client import http_request |
| 14 | +from datamate.core.base_op import Mapper |
| 15 | +from loguru import logger |
| 16 | +from mineru.cli.common import do_parse, read_fn |
| 17 | +from mineru.cli.fast_api import get_infer_result |
| 18 | +from pypdf import PdfReader |
15 | 19 |
|
16 | 20 |
|
class MineruFormatter(Mapper):
    """Extract text from PDF and image files through a MinerU HTTP backend.

    Each supported file is parsed in fixed-size page batches with
    ``do_parse``; the Markdown produced for every batch is concatenated and
    stored under ``self.text_key`` of the sample.
    """

    # Suffixes handled by execute(); any other file is returned untouched.
    _SUPPORTED_SUFFIXES = (".png", ".jpeg", ".jpg", ".webp", ".gif", ".pdf")
    # Number of pages passed to a single do_parse() call.
    _PAGES_PER_CALL = 10

    def __init__(self, *args, **kwargs):
        """Initialise the operator with the MinerU server, backend and output directory."""
        super(MineruFormatter, self).__init__(*args, **kwargs)
        self.server_url = "http://datamate-mineru:8000"
        self.backend = "vlm-http-client"
        self.output_dir = "/dataset/outputs"

    def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
        """Parse the file referenced by *sample* and store the extracted text.

        Args:
            sample: Record holding at least ``self.filename_key`` and
                ``self.filepath_key``.

        Returns:
            The same ``sample``; ``sample[self.text_key]`` is set when the
            file has a supported suffix, otherwise the sample is unchanged.

        Raises:
            Exception: Any error raised while reading or parsing is logged
                and re-raised unchanged.
        """
        import io  # local import: only needed to wrap the in-memory PDF bytes

        start = time.time()
        filename = sample[self.filename_key]
        if not filename.lower().endswith(self._SUPPORTED_SUFFIXES):
            return sample
        filename_without_ext = os.path.splitext(filename)[0]
        try:
            filepath = sample[self.filepath_key]
            parse_dir = os.path.join(self.output_dir, filename_without_ext, "vlm")
            pdf_bytes = read_fn(filepath)
            # Count pages from the bytes handed to do_parse rather than from
            # the original path: the path may point at an image, which
            # PdfReader cannot open.
            # NOTE(review): assumes read_fn returns PDF bytes (converting
            # images to PDF) - confirm against the installed mineru version.
            total_page = len(PdfReader(io.BytesIO(pdf_bytes)).pages)
            parts = []
            for page in range(0, total_page, self._PAGES_PER_CALL):
                try:
                    do_parse(
                        output_dir=self.output_dir,
                        pdf_file_names=[filename_without_ext],
                        pdf_bytes_list=[pdf_bytes],
                        p_lang_list=["ch"],
                        backend=self.backend,
                        server_url=self.server_url,
                        start_page_id=page,
                        # end_page_id is passed as the last page index of the batch.
                        end_page_id=min(page + self._PAGES_PER_CALL - 1, total_page - 1),
                    )
                    if os.path.exists(parse_dir):
                        # Guard against a missing result (None) so one empty
                        # batch does not abort the whole file with a TypeError.
                        parts.append(get_infer_result(".md", filename_without_ext, parse_dir) or "")
                finally:
                    # Remove the batch output even when parsing fails, so no
                    # stale files are left behind for a later batch or run.
                    shutil.rmtree(parse_dir, ignore_errors=True)
            sample[self.text_key] = "".join(parts)
            logger.info(
                f"fileName: {filename}, method: MineruFormatter costs {(time.time() - start):6f} s")
        except Exception as e:
            logger.exception(f"fileName: {filename}, method: MineruFormatter causes error: {e}")
            raise
        return sample
0 commit comments