Skip to content

Commit be87508

Browse files
authored
feat: add operator-packages-volume to docker-compose and update Dockerfile for site-packages path (#179)
* feat: add operator-packages-volume to docker-compose and update Dockerfile for site-packages path
* feat: add retry
1 parent 27b1cc8 commit be87508

File tree

4 files changed

+38
-11
lines changed

4 files changed

+38
-11
lines changed

deployment/docker/datamate/docker-compose.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ services:
9090
- dataset_volume:/dataset
9191
- flow_volume:/flow
9292
- operator-runtime-volume:/opt/runtime/datamate/ops/user
93+
- operator-packages-volume:/usr/local/lib/ops/site-packages
9394
networks: [ datamate ]
9495

9596
# 4) mineru
@@ -150,6 +151,8 @@ volumes:
150151
name: datamate-operator-upload-volume
151152
operator-runtime-volume:
152153
name: datamate-operator-runtime-volume
154+
operator-packages-volume:
155+
name: datamate-operator-packages-volume
153156
mineru_log_volume:
154157
name: datamate-mineru_log_volume
155158

deployment/helm/datamate/values.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,9 @@ runtime:
170170
- mountPath: /opt/runtime/datamate/ops/user
171171
name: operator-volume
172172
subPath: extract
173+
- mountPath: /usr/local/lib/ops/site-packages
174+
name: operator-volume
175+
subPath: site-packages
173176

174177
ray-cluster:
175178
enabled: true
@@ -214,6 +217,9 @@ ray-cluster:
214217
- mountPath: /opt/runtime/datamate/ops/user
215218
name: operator-volume
216219
subPath: extract
220+
- mountPath: /usr/local/lib/ops/site-packages
221+
name: operator-volume
222+
subPath: site-packages
217223
sidecarContainers:
218224
- name: runtime
219225
image: datamate-runtime
@@ -262,3 +268,6 @@ ray-cluster:
262268
- mountPath: /opt/runtime/datamate/ops/user
263269
name: operator-volume
264270
subPath: extract
271+
- mountPath: /usr/local/lib/ops/site-packages
272+
name: operator-volume
273+
subPath: site-packages

runtime/ops/formatter/mineru_formatter/process.py

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def __init__(self, *args, **kwargs):
2626
self.server_url = "http://datamate-mineru:8000"
2727
self.backend = "vlm-http-client"
2828
self.output_dir = "/dataset/outputs"
29+
self.max_retries = 3
2930

3031
def execute(self, sample: Dict[str, Any]) -> Dict[str, Any]:
3132
start = time.time()
@@ -51,16 +52,29 @@ async def async_process_file(self, sample):
5152
content = ""
5253
for page in range(0, total_page, 10):
5354
logger.info(f"fileName: {filename}, total_page: {total_page}, page: {page}.")
54-
await aio_do_parse(
55-
output_dir=self.output_dir,
56-
pdf_file_names=[filename_without_ext],
57-
pdf_bytes_list=[pdf_bytes],
58-
p_lang_list=["ch"],
59-
backend=self.backend,
60-
server_url=self.server_url,
61-
start_page_id=page,
62-
end_page_id=min(page + 9, total_page - 1),
63-
)
55+
for attempt in range(self.max_retries):
56+
try:
57+
await aio_do_parse(
58+
output_dir=self.output_dir,
59+
pdf_file_names=[filename_without_ext],
60+
pdf_bytes_list=[pdf_bytes],
61+
p_lang_list=["ch"],
62+
backend=self.backend,
63+
server_url=self.server_url,
64+
start_page_id=page,
65+
end_page_id=min(page + 9, total_page - 1),
66+
)
67+
break # 成功则跳出重试循环
68+
except Exception as e:
69+
logger.warning(
70+
f"Extract {filename} [{page}-{page + 9}] failed (attempt {attempt + 1}/{self.max_retries}). "
71+
f"Error: {e}. Retrying in 5s..."
72+
)
73+
if attempt < self.max_retries - 1:
74+
await asyncio.sleep(5)
75+
else:
76+
logger.error(f"aio_do_parse failed after {self.max_retries} attempts.")
77+
raise # 耗尽次数后抛出异常,交给上层 execute 处理
6478
if os.path.exists(parse_dir):
6579
content += get_infer_result(".md", filename_without_ext, parse_dir)
6680
shutil.rmtree(parse_dir)

scripts/images/runtime/Dockerfile

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
2323
UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" uv pip install -e . --system --index-strategy unsafe-best-match \
2424
&& UV_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" uv pip install -r /opt/runtime/datamate/ops/pyproject.toml --system \
2525
&& uv pip uninstall torch torchvision --system \
26-
&& python -m spacy download zh_core_web_sm
26+
&& python -m spacy download zh_core_web_sm \
27+
&& echo "/usr/local/lib/ops/site-packages" > /usr/local/lib/python3.11/site-packages/ops.pth
2728

2829
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
2930
&& chmod +x /opt/runtime/start.sh \

0 commit comments

Comments
 (0)