Merge branch 'main' into dev

Dallas98 · Dallas98 · commit b58d561eb773 · 2025-12-17T16:58:29.000+08:00
diff --git a/Makefile b/Makefile
@@ -155,7 +155,7 @@ endef
 # ========== Build Targets ==========
 
 # Valid build targets
-VALID_BUILD_TARGETS := backend database frontend runtime backend-python deer-flow mineru
+VALID_BUILD_TARGETS := backend database frontend runtime backend-python deer-flow mineru mineru-npu
 
 # Generic docker build target with service name as parameter
 # Automatically prefixes image names with "datamate-" unless it's deer-flow
diff --git a/deployment/docker/datamate/docker-compose.yml b/deployment/docker/datamate/docker-compose.yml
@@ -99,19 +99,32 @@ services:
     restart: on-failure
     environment:
       MINERU_MODEL_SOURCE: local
-      MINERU_DEVICE_MODE: cpu  # cpu|cuda|npu|mps
-      MINERU_BACKEND_MODE: pipeline
+      MINERU_DEVICE_MODE: npu  # cpu|cuda|npu|mps
+      VLLM_WORKER_MULTIPROC_METHOD: spawn
     privileged: true
     command:
-      - python
-      - /opt/runtime/datamate/mineru/mineru_api.py
-      - --port
-      - "9001"
+      # List-form command: each item is exactly one argv token, so options and their values are separate items.
+      - mineru-openai-server
+      - --engine
+      - vllm
+      - --host
+      - 0.0.0.0
+      - --port
+      - "8000"
     volumes:
       - dataset_volume:/dataset
       - mineru_log_volume:/var/log/datamate/mineru
+      - /var/log/npu/:/usr/slog
+      - /usr/local/dcmi:/usr/local/dcmi
+      - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
+      - /usr/local/Ascend/driver:/usr/local/Ascend/driver
     networks: [ datamate ]
     profiles: [ mineru ]
+    devices:
+      - /dev/davinci0
+      - /dev/davinci_manager
+      - /dev/devmm_svm
+      - /dev/hisi_hdc
 
   # 5) redis
   datamate-redis:
diff --git a/deployment/helm/datamate/charts/ray-cluster/values.yaml b/deployment/helm/datamate/charts/ray-cluster/values.yaml
@@ -58,25 +58,10 @@ head:
   # in the headGroupSpec. See https://github.com/ray-project/kuberay/pull/1128 for more details.
   serviceAccountName: ""
   restartPolicy: ""
-  rayStartParams:
-    object-store-memory: '78643200'
+  rayStartParams: {}
   # containerEnv specifies environment variables for the Ray container,
   # Follows standard K8s container env schema.
-  containerEnv:
-    - name: RAY_DEDUP_LOGS
-      value: "0"
-    - name: RAY_TQDM_PATCH_PRINT
-      value: "0"
-    - name: MYSQL_HOST
-      value: "datamate-database"
-    - name: MYSQL_PORT
-      value: "3306"
-    - name: MYSQL_USER
-      value: "root"
-    - name: MYSQL_PASSWORD
-      value: "password"
-    - name: MYSQL_DATABASE
-      value: "datamate"
+  containerEnv: []
   # - name: EXAMPLE_ENV
   #   value: "1"
   envFrom: []
@@ -93,14 +78,7 @@ head:
   # It is usually best to set requests equal to limits.
   # See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources
   # for further guidance.
-  resources:
-    limits:
-      cpu: "2"
-      # To avoid out-of-memory issues, never allocate less than 2G memory for the Ray head.
-      memory: "8G"
-    requests:
-      cpu: "1"
-      memory: "2G"
+  resources: {}
   annotations: {}
   nodeSelector: {}
   tolerations: []
@@ -156,21 +134,7 @@ worker:
   initContainers: []
   # containerEnv specifies environment variables for the Ray container,
   # Follows standard K8s container env schema.
-  containerEnv:
-    - name: RAY_DEDUP_LOGS
-      value: "0"
-    - name: RAY_TQDM_PATCH_PRINT
-      value: "0"
-    - name: MYSQL_HOST
-      value: "datamate-database"
-    - name: MYSQL_PORT
-      value: "3306"
-    - name: MYSQL_USER
-      value: "root"
-    - name: MYSQL_PASSWORD
-      value: "password"
-    - name: MYSQL_DATABASE
-      value: "datamate"
+  containerEnv: []
   # - name: EXAMPLE_ENV
   #   value: "1"
   envFrom: []
@@ -187,13 +151,7 @@ worker:
   # It is usually best to set requests equal to limits.
   # See https://docs.ray.io/en/latest/cluster/kubernetes/user-guides/config.html#resources
   # for further guidance.
-  resources:
-    limits:
-      cpu: "4"
-      memory: "8G"
-    requests:
-      cpu: "1"
-      memory: "1G"
+  resources: {}
   annotations: {}
   nodeSelector: {}
   tolerations: []
diff --git a/deployment/helm/datamate/values.yaml b/deployment/helm/datamate/values.yaml
@@ -77,6 +77,10 @@ database:
       subPath: database
 
 backend:
+  securityContext:
+    capabilities:
+      add:
+        - SYS_ADMIN
   env:
     - name: DB_PASSWORD
       value: *dbPass
@@ -170,6 +174,30 @@ runtime:
 ray-cluster:
   enabled: true
   head:
+    rayStartParams:
+      num-cpus: '0'
+    containerEnv:
+      - name: RAY_DEDUP_LOGS
+        value: "0"
+      - name: RAY_TQDM_PATCH_PRINT
+        value: "0"
+      - name: MYSQL_HOST
+        value: "datamate-database"
+      - name: MYSQL_PORT
+        value: "3306"
+      - name: MYSQL_USER
+        value: "root"
+      - name: MYSQL_PASSWORD
+        value: *dbPass
+      - name: MYSQL_DATABASE
+        value: "datamate"
+    resources:
+      limits:
+        cpu: "4"
+        memory: "16G"
+      requests:
+        cpu: "1"
+        memory: "2G"
     volumes:
       - *datasetVolume
       - *flowVolume
@@ -196,6 +224,28 @@ ray-cluster:
           - containerPort: 8081
         volumeMounts: *runtimeVolumeMounts
   worker:
+    containerEnv:
+      - name: RAY_DEDUP_LOGS
+        value: "0"
+      - name: RAY_TQDM_PATCH_PRINT
+        value: "0"
+      - name: MYSQL_HOST
+        value: "datamate-database"
+      - name: MYSQL_PORT
+        value: "3306"
+      - name: MYSQL_USER
+        value: "root"
+      - name: MYSQL_PASSWORD
+        value: *dbPass
+      - name: MYSQL_DATABASE
+        value: "datamate"
+    resources:
+      limits:
+        cpu: "8"
+        memory: "64G"
+      requests:
+        cpu: "1"
+        memory: "2G"
     volumes:
       - *datasetVolume
       - *flowVolume
diff --git a/deployment/kubernetes/mineru/deploy.yaml b/deployment/kubernetes/mineru/deploy.yaml
@@ -22,26 +22,32 @@ spec:
           image: datamate-mineru
           imagePullPolicy: IfNotPresent
           command:
-            - python
-            - /opt/runtime/datamate/mineru/mineru_api.py
+            - mineru-openai-server
+          args:
+            - --engine
+            - vllm
+            - --host
+            - 0.0.0.0
             - --port
-            - "9001"
+            - "8000"
           env:
             - name: MINERU_MODEL_SOURCE
               value: local
             - name: MINERU_DEVICE_MODE
-              value: cpu
-            - name: MINERU_BACKEND_MODE
-              value: pipeline
+              value: npu
+            - name: VLLM_WORKER_MULTIPROC_METHOD
+              value: spawn
           ports:
-            - containerPort: 9001
+            - containerPort: 8000
           resources:
             limits:
-              cpu: 16
+              cpu: 8
               memory: 32Gi
+              huawei.com/Ascend910: 1
             requests:
               cpu: 100m
               memory: 100Mi
+              huawei.com/Ascend910: 1
           volumeMounts:
             - name: dataset-volume
               mountPath: /dataset
@@ -67,8 +73,8 @@ metadata:
 spec:
   type: ClusterIP
   ports:
-    - port: 9001
-      targetPort: 9001
+    - port: 8000
+      targetPort: 8000
       protocol: TCP
   selector:
     app: datamate
diff --git a/runtime/ops/formatter/mineru_formatter/process.py b/runtime/ops/formatter/mineru_formatter/process.py
@@ -5,35 +5,62 @@
 Description: MinerU PDF文本抽取
 Create: 2025/10/29 17:24
 """
-import json
+import io
+import os
+import shutil
 import time
-from loguru import logger
 from typing import Dict, Any
 
-from datamate.core.base_op import Mapper
 from datamate.common.utils.rest_client import http_request
+from datamate.core.base_op import Mapper
+from loguru import logger
+from mineru.cli.common import do_parse, read_fn
+from mineru.cli.fast_api import get_infer_result
+from pypdf import PdfReader
 
 
 class MineruFormatter(Mapper):
     """基于外部API，抽取PDF中的文本"""
 
     def __init__(self, *args, **kwargs):
         super(MineruFormatter, self).__init__(*args, **kwargs)
-        self.base_url = "http://datamate-mineru:9001"
-        self.pdf_extract_url = f"{self.base_url}/api/pdf-extract"
+        self.server_url = "http://datamate-mineru:8000"
+        self.backend = "vlm-http-client"
+        self.output_dir = "/dataset/outputs"
 
     def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
         start = time.time()
         filename = sample[self.filename_key]
-        if not filename.lower().endswith(".pdf"):
+        filename_without_ext = os.path.splitext(filename)[0]
+        if not filename.lower().endswith((".png", ".jpeg", ".jpg", ".webp", ".gif", ".pdf")):
             return sample
         try:
-            data = {"source_path": sample[self.filepath_key], "export_path": sample[self.export_path_key]}
-            response = http_request(method="POST", url=self.pdf_extract_url, data=data)
-            sample[self.text_key] = json.loads(response.text).get("result")
+            filepath = sample[self.filepath_key]
+            parse_dir = os.path.join(self.output_dir, filename_without_ext, "vlm")
+            pdf_bytes = read_fn(filepath)
+            # Count pages from the bytes handed to do_parse, not from the source file: the filter above
+            # also admits images, which PdfReader cannot open (assumes read_fn yields PDF bytes - confirm).
+            total_page = len(PdfReader(io.BytesIO(pdf_bytes)).pages)
+            content = ""
+            for page in range(0, total_page, 10):
+                do_parse(
+                    output_dir=self.output_dir,
+                    pdf_file_names=[filename_without_ext],
+                    pdf_bytes_list=[pdf_bytes],
+                    p_lang_list=["ch"],
+                    backend=self.backend,
+                    server_url=self.server_url,
+                    start_page_id=page,
+                    end_page_id=min(page + 9, total_page - 1),
+                )
+                if os.path.exists(parse_dir):
+                    # Guard against a missing .md result (get_infer_result presumably returns None then - confirm).
+                    content += get_infer_result(".md", filename_without_ext, parse_dir) or ""
+                    shutil.rmtree(parse_dir)
+            sample[self.text_key] = content
             logger.info(
                 f"fileName: {filename}, method: MineruFormatter costs {(time.time() - start):6f} s")
-        except UnicodeDecodeError as err:
-            logger.exception(f"fileName: {filename}, method: MineruFormatter causes decode error: {err}")
+        except Exception as e:
+            logger.exception(f"fileName: {filename}, method: MineruFormatter causes error: {e}")
             raise
         return sample
diff --git a/runtime/ops/pyproject.toml b/runtime/ops/pyproject.toml
@@ -11,7 +11,9 @@ dependencies = [
     "emoji>=2.15.0",
     "jieba>=0.42.1",
     "loguru>=0.7.3",
-    "numpy==1.23.3",
+    "mineru>=2.6.5",
+    "numpy==1.24.3",
+    "python-multipart>=0.0.20",
     "opencv-contrib-python-headless==4.7.0.72",
     "opencv-python-headless==4.7.0.72",
     "openslide-python>=1.4.3",
@@ -29,4 +31,4 @@ dependencies = [
     "sqlalchemy>=2.0.44",
     "xmltodict>=1.0.2",
     "zhconv>=1.4.3",
-]
+]
diff --git a/runtime/python-executor/datamate/core/base_op.py b/runtime/python-executor/datamate/core/base_op.py
@@ -146,10 +146,10 @@ def create_failure_sample(self, sample: Dict[str, Any], op_name, excp: BaseExcep
     def read_file(self, sample):
         filepath = sample[self.filepath_key]
         filetype = sample[self.filetype_key]
-        if filetype in ["ppt", "pptx", "docx", "doc", "xlsx"]:
+        if filetype in ["ppt", "pptx", "docx", "doc", "xlsx", "csv", "md", "pdf"]:
             elements = partition(filename=filepath)
             sample[self.text_key] = "\n\n".join([str(el) for el in elements])
-        elif filetype in ["txt", "md", "markdown", "xml", "html", "csv", "json", "jsonl"]:
+        elif filetype in ["txt", "md", "markdown", "xml", "html", "json", "jsonl"]:
             with open(filepath, 'rb') as f:
                 content = f.read()
                 sample[self.text_key] = content.decode("utf-8-sig").replace("\r\n", "\n")
diff --git a/runtime/python-executor/pyproject.toml b/runtime/python-executor/pyproject.toml
@@ -21,7 +21,7 @@ dependencies = [
     "loguru>=0.7.3",
     "opencv-python-headless>=4.12.0.88",
     "ray[data,default]==2.52.1",
-    "unstructured[csv,docx,pptx,xlsx]==0.18.15",
+    "unstructured[csv,docx,pptx,xlsx,pdf,md]==0.18.15",
     "uvicorn[standard]>=0.38.0",
 ]
 
diff --git a/scripts/images/mineru-npu/Dockerfile b/scripts/images/mineru-npu/Dockerfile
@@ -0,0 +1,32 @@
+# Base image: choose either vLLM or LMDeploy as needed; both require an ARM (AArch64) CPU + Ascend NPU.
+# Base image containing the vLLM inference environment, requiring ARM(AArch64) CPU + Ascend NPU.
+FROM quay.io/ascend/vllm-ascend:v0.11.0rc2
+# Base image containing the LMDeploy inference environment, requiring ARM(AArch64) CPU + Ascend NPU.
+# FROM crpi-4crprmm5baj1v8iv.cn-hangzhou.personal.cr.aliyuncs.com/lmdeploy_dlinfer/ascend:mineru-a2
+
+
+# Install libgl for opencv support & Noto fonts for Chinese characters
+RUN apt-get update && \
+    apt-get install -y \
+        fonts-noto-core \
+        fonts-noto-cjk \
+        fontconfig \
+        libgl1 \
+        libglib2.0-0 && \
+    fc-cache -fv && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install mineru latest
+RUN python3 -m pip install -U pip -i https://mirrors.aliyun.com/pypi/simple && \
+    python3 -m pip install 'mineru[core]>=2.6.5' \
+                            numpy==1.26.4 \
+                            opencv-python==4.11.0.86 \
+                            -i https://mirrors.aliyun.com/pypi/simple && \
+    python3 -m pip cache purge
+
+# Download models and update the configuration file
+RUN TORCH_DEVICE_BACKEND_AUTOLOAD=0 /bin/bash -c "mineru-models-download -s modelscope -m all"
+
+# Set the entry point to export MINERU_MODEL_SOURCE=local and then exec the supplied command
+ENTRYPOINT ["/bin/bash", "-c", "export MINERU_MODEL_SOURCE=local && exec \"$@\"", "--"]
diff --git a/scripts/images/runtime/Dockerfile b/scripts/images/runtime/Dockerfile

Original file line number	Diff line number	Diff line change
`@@ -21,7 +21,7 @@ dependencies = [`
`21`	`21`	`"loguru>=0.7.3",`
`22`	`22`	`"opencv-python-headless>=4.12.0.88",`
`23`	`23`	`"ray[data,default]==2.52.1",`
`24`		`- "unstructured[csv,docx,pptx,xlsx]==0.18.15",`
	`24`	`+ "unstructured[csv,docx,pptx,xlsx,pdf,md]==0.18.15",`
`25`	`25`	`"uvicorn[standard]>=0.38.0",`
`26`	`26`	`]`
`27`	`27`