refactor(parser): 将同步zip文件处理改为异步实现 Fixes: #377

xerrors · xerrors · commit 7a24179913fc · 2025-12-18T20:58:48.000+08:00
修改mineru_parser.py和mineru_official_parser.py中的zip处理逻辑，使用asyncio.run调用异步_process_zip_file函数
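将indexing.py中的_process_zip_file改为异步函数（直接await _process_images和calculate_content_hash，不再嵌套调用asyncio.run），并在_process_images中改用非阻塞的MinIO调用（asyncio.to_thread、aupload_file）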
调整docker-compose.yml中的容器命名（mineru → mineru-vllm-server），移除30001端口服务的GPU设备预留（deploy.resources）及已注释的vllm参数说明，并更新文档中的服务启动命令
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -186,7 +186,7 @@ services:
       context: .
       dockerfile: docker/mineru.Dockerfile
     image: mineru-vllm:latest
-    container_name: mineru
+    container_name: mineru-vllm-server
     profiles:
       - all
     env_file:
@@ -237,20 +237,10 @@ services:
     command:
       --host 0.0.0.0
       --port 30001
-      # parameters for vllm-engine
-      # --data-parallel-size 2  # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode
-      # --gpu-memory-utilization 0.5  # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
     ulimits:
       memlock: -1
       stack: 67108864
     ipc: host
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              device_ids: [ "0" ]
-              capabilities: [ gpu ]
     restart: unless-stopped
 
   paddlex:
diff --git a/docs/latest/advanced/document-processing.md b/docs/latest/advanced/document-processing.md
@@ -48,18 +48,18 @@ docker compose up -d api
 需要在 `.env` 文件中配置：
 
 ```bash
-MINERU_VL_SERVER=http://localhost:30000
-MINERU_API_URI=http://localhost:30001
+MINERU_VL_SERVER=http://localhost:30000  # 对应 docker compose 中的 mineru-vllm-server 服务
+MINERU_API_URI=http://localhost:30001  # 对应 docker compose 中的 mineru-api 服务
 ```
 
 然后启动相关服务
 
 ```bash
 # 需要 GPU，启动 MinerU 服务
-docker compose up -d mineru-vllm-server mineru-api
+docker compose up mineru-vllm-server mineru-api -d
 
 # 启动主服务
-docker compose up -d api
+docker compose up api -d
 ```
 
 ### 3. 官方云服务 (MinerU Official)
@@ -73,7 +73,7 @@ API 密钥可以从 [MinerU 官网](https://mineru.net) 申请。
 MINERU_API_KEY="your-api-key-here"
 ```
 
-然后使用 `docker compose up -d api` 重启后端服务。
+然后使用 `docker compose up api -d` 重启后端服务。
 
 ### 4. 结构化解析 (PaddleX)
 
diff --git a/src/knowledge/indexing.py b/src/knowledge/indexing.py
@@ -590,7 +590,7 @@ async def process_file_to_markdown(file_path: str, params: dict | None = None) -
             if not params or "db_id" not in params:
                 raise ValueError("ZIP文件处理需要在params中提供db_id参数")
 
-            zip_result = await asyncio.to_thread(_process_zip_file, str(file_path_obj), params["db_id"])
+            zip_result = await _process_zip_file(str(file_path_obj), params["db_id"])
 
             # 将处理结果保存到params中供调用方使用
             params["_zip_images_info"] = zip_result["images_info"]
@@ -624,7 +624,7 @@ async def process_file_to_markdown(file_path: str, params: dict | None = None) -
     return result
 
 
-def _process_zip_file(zip_path: str, db_id: str) -> dict:
+async def _process_zip_file(zip_path: str, db_id: str) -> dict:
     """
     处理ZIP文件，提取markdown内容和图片（内部函数）
 
@@ -673,11 +673,11 @@ def _process_zip_file(zip_path: str, db_id: str) -> dict:
         images_dir = _find_images_directory(zf, md_file)
 
         if images_dir:
-            images_info = _process_images(zf, images_dir, db_id, md_file)
+            images_info = await _process_images(zf, images_dir, db_id, md_file)
             markdown_content = _replace_image_links(markdown_content, images_info)
 
     # 4. 生成结果
-    content_hash = asyncio.run(calculate_content_hash(markdown_content.encode("utf-8")))
+    content_hash = await calculate_content_hash(markdown_content.encode("utf-8"))
 
     return {
         "markdown_content": markdown_content,
@@ -722,9 +722,9 @@ async def _process_images(zip_file: zipfile.ZipFile, images_dir: str, db_id: str
     image_names = [n for n in zip_file.namelist() if n.startswith(images_dir + "/")]
 
     # 上传图片到MinIO
-    minio_client = await get_minio_client()
+    minio_client = get_minio_client()
     bucket_name = "kb-images"
-    minio_client.ensure_bucket_exists(bucket_name)
+    await asyncio.to_thread(minio_client.ensure_bucket_exists, bucket_name)
 
     file_id = hashstr(Path(md_file_path).name, length=16)
 
@@ -742,7 +742,7 @@ async def _process_images(zip_file: zipfile.ZipFile, images_dir: str, db_id: str
             object_name = f"{db_id}/{file_id}/images/{Path(img_name).name}"
             content_type = CONTENT_TYPE_MAP.get(suffix, "image/jpeg")
 
-            result = minio_client.upload_file(
+            result = await minio_client.aupload_file(
                 bucket_name=bucket_name,
                 object_name=object_name,
                 data=data,
diff --git a/src/plugins/mineru_official_parser.py b/src/plugins/mineru_official_parser.py
@@ -154,10 +154,11 @@ def process_file(self, file_path: str, params: dict[str, Any] | None = None) ->
                 )
                 return text
 
+            import asyncio
             from src.knowledge.indexing import _process_zip_file
 
             try:
-                processed = _process_zip_file(zip_path, params.get("db_id") or "ocr-test")
+                processed = asyncio.run(_process_zip_file(zip_path, params.get("db_id") or "ocr-test"))
                 text = processed["markdown_content"]
             except Exception:
                 import zipfile
diff --git a/src/plugins/mineru_parser.py b/src/plugins/mineru_parser.py
@@ -189,7 +189,8 @@ def process_file(self, file_path: str, params: dict | None = None) -> str:
                     tmp_zip.flush()
 
                     try:
-                        processed = _process_zip_file(tmp_zip.name, params.get("db_id"))
+                        import asyncio
+                        processed = asyncio.run(_process_zip_file(tmp_zip.name, params.get("db_id")))
                         text = processed["markdown_content"]
                     finally:
                         os.unlink(tmp_zip.name)