支持mineru npu处理 (#174)

hhhhsc701 · web-flow · commit 924d977d6f2f · 2025-12-17T16:31:06.000+08:00
* feature: unstructured支持简单pdf处理

* feature: update values.yaml to enhance ray-cluster configuration with security context, environment variables, and resource limits

* feature: update deploy.yaml and process.py for mineru server configuration and PDF processing enhancements

* feature: update deploy.yaml and process.py for mineru server configuration and PDF processing enhancements

* feature: improve PDF processing logic and update dependencies in process.py and pyproject.toml

* feature: improve PDF processing logic and update dependencies in process.py and pyproject.toml

* feature: update Dockerfile for improved package source mirrors and add mineru-npu to build targets
diff --git a/Makefile b/Makefile
@@ -155,7 +155,7 @@ endef
 # ========== Build Targets ==========
 
 # Valid build targets
-VALID_BUILD_TARGETS := backend database frontend runtime backend-python deer-flow mineru
+VALID_BUILD_TARGETS := backend database frontend runtime backend-python deer-flow mineru mineru-npu
 
 # Generic docker build target with service name as parameter
 # Automatically prefixes image names with "datamate-" unless it's deer-flow
diff --git a/deployment/docker/datamate/docker-compose.yml b/deployment/docker/datamate/docker-compose.yml
@@ -99,19 +99,32 @@ services:
     restart: on-failure
     environment:
       MINERU_MODEL_SOURCE: local
-      MINERU_DEVICE_MODE: cpu  # cpu|cuda|npu|mps
-      MINERU_BACKEND_MODE: pipeline
+      MINERU_DEVICE_MODE: npu  # cpu|cuda|npu|mps
+      VLLM_WORKER_MULTIPROC_METHOD: spawn
     privileged: true
     command:
-      - python
-      - /opt/runtime/datamate/mineru/mineru_api.py
-      - --port
-      - "9001"
+      # One list item per argv token: a flag and its value in the same item reach the CLI as a single argument.
+      - mineru-openai-server
+      - --engine
+      - vllm
+      - --host
+      - 0.0.0.0
+      - --port
+      - "8000"
     volumes:
       - dataset_volume:/dataset
       - mineru_log_volume:/var/log/datamate/mineru
+      - /var/log/npu/:/usr/slog
+      - /usr/local/dcmi:/usr/local/dcmi
+      - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
+      - /usr/local/Ascend/driver:/usr/local/Ascend/driver
     networks: [ datamate ]
     profiles: [ mineru ]
+    devices:
+      - /dev/davinci0
+      - /dev/davinci_manager
+      - /dev/devmm_svm
+      - /dev/hisi_hdc
 
   # 5) redis
   datamate-redis:
diff --git a/deployment/helm/datamate/values.yaml b/deployment/helm/datamate/values.yaml
@@ -175,7 +175,6 @@ ray-cluster:
   enabled: true
   head:
     rayStartParams:
-      object-store-memory: '78643200'
       num-cpus: '0'
     containerEnv:
       - name: RAY_DEDUP_LOGS
@@ -194,8 +193,8 @@ ray-cluster:
         value: "datamate"
     resources:
       limits:
-        cpu: "2"
-        memory: "8G"
+        cpu: "4"
+        memory: "16G"
       requests:
         cpu: "1"
         memory: "2G"
diff --git a/deployment/kubernetes/mineru/deploy.yaml b/deployment/kubernetes/mineru/deploy.yaml
@@ -22,26 +22,32 @@ spec:
           image: datamate-mineru
           imagePullPolicy: IfNotPresent
           command:
-            - python
-            - /opt/runtime/datamate/mineru/mineru_api.py
+            - mineru-openai-server
+          args:
+            - --engine
+            - vllm
+            - --host
+            - 0.0.0.0
             - --port
-            - "9001"
+            - "8000"
           env:
             - name: MINERU_MODEL_SOURCE
               value: local
             - name: MINERU_DEVICE_MODE
-              value: cpu
-            - name: MINERU_BACKEND_MODE
-              value: pipeline
+              value: npu
+            - name: VLLM_WORKER_MULTIPROC_METHOD
+              value: spawn
           ports:
-            - containerPort: 9001
+            - containerPort: 8000
           resources:
             limits:
-              cpu: 16
+              cpu: 8
               memory: 32Gi
+              huawei.com/Ascend910: 1
             requests:
               cpu: 100m
               memory: 100Mi
+              huawei.com/Ascend910: 1
           volumeMounts:
             - name: dataset-volume
               mountPath: /dataset
@@ -67,8 +73,8 @@ metadata:
 spec:
   type: ClusterIP
   ports:
-    - port: 9001
-      targetPort: 9001
+    - port: 8000
+      targetPort: 8000
       protocol: TCP
   selector:
     app: datamate
diff --git a/runtime/ops/formatter/mineru_formatter/process.py b/runtime/ops/formatter/mineru_formatter/process.py
@@ -5,35 +5,81 @@
 Description: MinerU PDF文本抽取
 Create: 2025/10/29 17:24
 """
-import json
+import io
+import os
+import shutil
 import time
-from loguru import logger
 from typing import Dict, Any
 
-from datamate.core.base_op import Mapper
 from datamate.common.utils.rest_client import http_request
+from datamate.core.base_op import Mapper
+from loguru import logger
+from mineru.cli.common import do_parse, read_fn
+from mineru.cli.fast_api import get_infer_result
+from pypdf import PdfReader
 
 
 class MineruFormatter(Mapper):
-    """基于外部API，抽取PDF中的文本"""
+    """Extract text from PDF and image files through an external MinerU server."""
 
     def __init__(self, *args, **kwargs):
         super(MineruFormatter, self).__init__(*args, **kwargs)
-        self.base_url = "http://datamate-mineru:9001"
-        self.pdf_extract_url = f"{self.base_url}/api/pdf-extract"
+        # Endpoint handed to do_parse as server_url.
+        self.server_url = "http://datamate-mineru:8000"
+        self.backend = "vlm-http-client"
+        # Directory handed to do_parse; results are read back from <output_dir>/<name>/vlm.
+        self.output_dir = "/dataset/outputs"
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
+        """Parse one PDF or image sample and store the extracted Markdown.
+
+        Samples whose file name lacks a supported extension are returned
+        unchanged. Other files are parsed in batches of ten pages and the
+        concatenated result is written to ``sample[self.text_key]``.
+
+        Args:
+            sample: Record holding the file name and path under
+                ``self.filename_key`` and ``self.filepath_key``.
+
+        Returns:
+            The same ``sample`` dict, updated in place when parsed.
+
+        Raises:
+            Exception: Any failure during parsing is logged and re-raised.
+        """
         start = time.time()
         filename = sample[self.filename_key]
-        if not filename.lower().endswith(".pdf"):
+        filename_without_ext = os.path.splitext(filename)[0]
+        if not filename.lower().endswith((".png", ".jpeg", ".jpg", ".webp", ".gif", ".pdf")):
             return sample
         try:
-            data = {"source_path": sample[self.filepath_key], "export_path": sample[self.export_path_key]}
-            response = http_request(method="POST", url=self.pdf_extract_url, data=data)
-            sample[self.text_key] = json.loads(response.text).get("result")
+            filepath = sample[self.filepath_key]
+            parse_dir = os.path.join(self.output_dir, filename_without_ext, "vlm")
+            # Count pages from the bytes returned by read_fn, not from the file on disk:
+            # image inputs pass the filter above, and PdfReader cannot open a raw image file.
+            pdf_bytes = read_fn(filepath)
+            total_page = len(PdfReader(io.BytesIO(pdf_bytes)).pages)
+            parts = []
+            # Ten pages per request; end_page_id is the index of the batch's last page.
+            for page in range(0, total_page, 10):
+                do_parse(
+                    output_dir=self.output_dir,
+                    pdf_file_names=[filename_without_ext],
+                    pdf_bytes_list=[pdf_bytes],
+                    p_lang_list=["ch"],
+                    backend=self.backend,
+                    server_url=self.server_url,
+                    start_page_id=page,
+                    end_page_id=min(page + 9, total_page - 1),
+                )
+                if os.path.exists(parse_dir):
+                    # Fall back to "" so a missing .md result (None) does not break concatenation.
+                    parts.append(get_infer_result(".md", filename_without_ext, parse_dir) or "")
+                    shutil.rmtree(parse_dir)
+            sample[self.text_key] = "".join(parts)
             logger.info(
                 f"fileName: {filename}, method: MineruFormatter costs {(time.time() - start):6f} s")
-        except UnicodeDecodeError as err:
-            logger.exception(f"fileName: {filename}, method: MineruFormatter causes decode error: {err}")
+        except Exception as e:
+            logger.exception(f"fileName: {filename}, method: MineruFormatter causes error: {e}")
             raise
         return sample
diff --git a/runtime/ops/pyproject.toml b/runtime/ops/pyproject.toml
@@ -11,7 +11,9 @@ dependencies = [
     "emoji>=2.15.0",
     "jieba>=0.42.1",
     "loguru>=0.7.3",
-    "numpy==1.23.3",
+    "mineru>=2.6.5",
+    "numpy==1.24.3",
+    "python-multipart>=0.0.20",
     "opencv-contrib-python-headless==4.7.0.72",
     "opencv-python-headless==4.7.0.72",
     "openslide-python>=1.4.3",
@@ -29,4 +31,4 @@ dependencies = [
     "sqlalchemy>=2.0.44",
     "xmltodict>=1.0.2",
     "zhconv>=1.4.3",
-]
+]
diff --git a/scripts/images/backend/Dockerfile b/scripts/images/backend/Dockerfile
@@ -1,6 +1,8 @@
 FROM maven:3-eclipse-temurin-8 AS datax-builder
 
-RUN apt-get update && \
+RUN sed -i "s@http://.*archive.ubuntu.com@http://mirrors.huaweicloud.com@g" /etc/apt/sources.list && \
+    sed -i "s@http://.*security.ubuntu.com@http://mirrors.huaweicloud.com@g" /etc/apt/sources.list && \
+    apt-get update && \
     apt-get install -y git && \
     git clone https://github.com/alibaba/DataX.git
 
@@ -21,7 +23,9 @@ RUN cd /opt/backend && \
 
 FROM eclipse-temurin:21-jdk
 
-RUN apt-get update && \
+RUN sed -i "s@http://.*archive.ubuntu.com@http://mirrors.huaweicloud.com@g" /etc/apt/sources.list && \
+    sed -i "s@http://.*security.ubuntu.com@http://mirrors.huaweicloud.com@g" /etc/apt/sources.list && \
+    apt-get update && \
     apt-get install -y vim wget curl nfs-common rsync python3 python3-pip python-is-python3 dos2unix && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
diff --git a/scripts/images/mineru-npu/Dockerfile b/scripts/images/mineru-npu/Dockerfile
@@ -0,0 +1,32 @@
+# Base image: vLLM or LMDeploy; pick one according to your needs. Both require ARM (AArch64) CPU + Ascend NPU.
+# Base image containing the vLLM inference environment, requiring ARM(AArch64) CPU + Ascend NPU.
+FROM quay.io/ascend/vllm-ascend:v0.11.0rc2
+# Base image containing the LMDeploy inference environment, requiring ARM(AArch64) CPU + Ascend NPU.
+# FROM crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:mineru-a2
+
+
+# Install libgl for opencv support & Noto fonts for Chinese characters
+RUN apt-get update && \
+    apt-get install -y \
+        fonts-noto-core \
+        fonts-noto-cjk \
+        fontconfig \
+        libgl1 \
+        libglib2.0-0 && \
+    fc-cache -fv && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install mineru latest
+RUN python3 -m pip install -U pip -i https://mirrors.aliyun.com/pypi/simple && \
+    python3 -m pip install 'mineru[core]>=2.6.5' \
+                            numpy==1.26.4 \
+                            opencv-python==4.11.0.86 \
+                            -i https://mirrors.aliyun.com/pypi/simple && \
+    python3 -m pip cache purge
+
+# Download models and update the configuration file
+RUN TORCH_DEVICE_BACKEND_AUTOLOAD=0 /bin/bash -c "mineru-models-download -s modelscope -m all"
+
+# Set the entry point to export MINERU_MODEL_SOURCE=local and then exec the command passed to the container
+ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]