Skip to content

Commit 7a24179

Browse files
committed
refactor(parser): 将同步zip文件处理改为异步实现 Fixes: #377
修改 mineru_parser.py 和 mineru_official_parser.py 中的 zip 处理逻辑,使用 asyncio.run 调用异步 _process_zip_file 函数;更新 indexing.py 中的 _process_zip_file 和相关辅助函数为异步实现;调整 docker-compose.yml 中的容器命名和文档中的服务启动命令。
1 parent fd9dc40 commit 7a24179

File tree

5 files changed

+17
-25
lines changed

5 files changed

+17
-25
lines changed

docker-compose.yml

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -186,7 +186,7 @@ services:
186186
context: .
187187
dockerfile: docker/mineru.Dockerfile
188188
image: mineru-vllm:latest
189-
container_name: mineru
189+
container_name: mineru-vllm-server
190190
profiles:
191191
- all
192192
env_file:
@@ -237,20 +237,10 @@ services:
237237
command:
238238
--host 0.0.0.0
239239
--port 30001
240-
# parameters for vllm-engine
241-
# --data-parallel-size 2 # If using multiple GPUs, increase throughput using vllm's multi-GPU parallel mode
242-
# --gpu-memory-utilization 0.5 # If running on a single GPU and encountering VRAM shortage, reduce the KV cache size by this parameter, if VRAM issues persist, try lowering it further to `0.4` or below.
243240
ulimits:
244241
memlock: -1
245242
stack: 67108864
246243
ipc: host
247-
deploy:
248-
resources:
249-
reservations:
250-
devices:
251-
- driver: nvidia
252-
device_ids: [ "0" ]
253-
capabilities: [ gpu ]
254244
restart: unless-stopped
255245

256246
paddlex:

docs/latest/advanced/document-processing.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,18 +48,18 @@ docker compose up -d api
4848
需要在 `.env` 文件中配置:
4949

5050
```bash
51-
MINERU_VL_SERVER=http://localhost:30000
52-
MINERU_API_URI=http://localhost:30001
51+
MINERU_VL_SERVER=http://localhost:30000 # 对应 docker compose 中的 mineru-vllm-server 服务
52+
MINERU_API_URI=http://localhost:30001 # 对应 docker compose 中的 mineru-api 服务
5353
```
5454

5555
然后启动相关服务
5656

5757
```bash
5858
# 需要 GPU,启动 MinerU 服务
59-
docker compose up -d mineru-vllm-server mineru-api
59+
docker compose up mineru-vllm-server mineru-api -d
6060

6161
# 启动主服务
62-
docker compose up -d api
62+
docker compose up api -d
6363
```
6464

6565
### 3. 官方云服务 (MinerU Official)
@@ -73,7 +73,7 @@ API 密钥可以从 [MinerU 官网](https://mineru.net) 申请。
7373
MINERU_API_KEY="your-api-key-here"
7474
```
7575

76-
然后使用 `docker compose up -d api` 重启后端服务。
76+
然后使用 `docker compose up api -d` 重启后端服务。
7777

7878
### 4. 结构化解析 (PaddleX)
7979

src/knowledge/indexing.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -590,7 +590,7 @@ async def process_file_to_markdown(file_path: str, params: dict | None = None) -
590590
if not params or "db_id" not in params:
591591
raise ValueError("ZIP文件处理需要在params中提供db_id参数")
592592

593-
zip_result = await asyncio.to_thread(_process_zip_file, str(file_path_obj), params["db_id"])
593+
zip_result = await _process_zip_file(str(file_path_obj), params["db_id"])
594594

595595
# 将处理结果保存到params中供调用方使用
596596
params["_zip_images_info"] = zip_result["images_info"]
@@ -624,7 +624,7 @@ async def process_file_to_markdown(file_path: str, params: dict | None = None) -
624624
return result
625625

626626

627-
def _process_zip_file(zip_path: str, db_id: str) -> dict:
627+
async def _process_zip_file(zip_path: str, db_id: str) -> dict:
628628
"""
629629
处理ZIP文件,提取markdown内容和图片(内部函数)
630630
@@ -673,11 +673,11 @@ def _process_zip_file(zip_path: str, db_id: str) -> dict:
673673
images_dir = _find_images_directory(zf, md_file)
674674

675675
if images_dir:
676-
images_info = _process_images(zf, images_dir, db_id, md_file)
676+
images_info = await _process_images(zf, images_dir, db_id, md_file)
677677
markdown_content = _replace_image_links(markdown_content, images_info)
678678

679679
# 4. 生成结果
680-
content_hash = asyncio.run(calculate_content_hash(markdown_content.encode("utf-8")))
680+
content_hash = await calculate_content_hash(markdown_content.encode("utf-8"))
681681

682682
return {
683683
"markdown_content": markdown_content,
@@ -722,9 +722,9 @@ async def _process_images(zip_file: zipfile.ZipFile, images_dir: str, db_id: str
722722
image_names = [n for n in zip_file.namelist() if n.startswith(images_dir + "/")]
723723

724724
# 上传图片到MinIO
725-
minio_client = await get_minio_client()
725+
minio_client = get_minio_client()
726726
bucket_name = "kb-images"
727-
minio_client.ensure_bucket_exists(bucket_name)
727+
await asyncio.to_thread(minio_client.ensure_bucket_exists, bucket_name)
728728

729729
file_id = hashstr(Path(md_file_path).name, length=16)
730730

@@ -742,7 +742,7 @@ async def _process_images(zip_file: zipfile.ZipFile, images_dir: str, db_id: str
742742
object_name = f"{db_id}/{file_id}/images/{Path(img_name).name}"
743743
content_type = CONTENT_TYPE_MAP.get(suffix, "image/jpeg")
744744

745-
result = minio_client.upload_file(
745+
result = await minio_client.aupload_file(
746746
bucket_name=bucket_name,
747747
object_name=object_name,
748748
data=data,

src/plugins/mineru_official_parser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,10 +154,11 @@ def process_file(self, file_path: str, params: dict[str, Any] | None = None) ->
154154
)
155155
return text
156156

157+
import asyncio
157158
from src.knowledge.indexing import _process_zip_file
158159

159160
try:
160-
processed = _process_zip_file(zip_path, params.get("db_id") or "ocr-test")
161+
processed = asyncio.run(_process_zip_file(zip_path, params.get("db_id") or "ocr-test"))
161162
text = processed["markdown_content"]
162163
except Exception:
163164
import zipfile

src/plugins/mineru_parser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,8 @@ def process_file(self, file_path: str, params: dict | None = None) -> str:
189189
tmp_zip.flush()
190190

191191
try:
192-
processed = _process_zip_file(tmp_zip.name, params.get("db_id"))
192+
import asyncio
193+
processed = asyncio.run(_process_zip_file(tmp_zip.name, params.get("db_id")))
193194
text = processed["markdown_content"]
194195
finally:
195196
os.unlink(tmp_zip.name)

0 commit comments

Comments
 (0)