Skip to content

Commit b719568

Browse files
authored
Merge pull request #65 from dataelement/feat/arm
Feat/arm
2 parents ecf72bb + 645b1cd commit b719568

File tree

6 files changed

+165
-10
lines changed

6 files changed

+165
-10
lines changed

.drone.yml

Lines changed: 120 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,13 @@ steps: # 定义流水线执行步骤,这些步骤将顺序执行
1515
https_proxy:
1616
from_secret: PROXY
1717
commands:
18-
- git config --global core.compression 0
19-
- git clone https://github.com/dataelement/bisheng-unstructured.git .
20-
- git checkout $DRONE_COMMIT
18+
- git config --global core.compression 0
19+
- git clone https://github.com/dataelement/bisheng-unstructured.git .
20+
- git checkout $DRONE_COMMIT
2121

2222
- name: build_docker_release
2323
pull: if-not-exists
24-
image: docker:24.0.6
24+
image: plugins/docker
2525
privileged: true
2626
volumes: # 将容器内目录挂载到宿主机,仓库需要开启Trusted设置
2727
- name: apt-cache
@@ -104,3 +104,119 @@ volumes:
104104
- name: socket
105105
host:
106106
path: /var/run/docker.sock
107+
108+
109+
110+
---
111+
kind: pipeline
112+
type: docker
113+
name: unstructured-arm
114+
115+
clone:
116+
disable: true
117+
118+
platform:
119+
os: linux
120+
arch: arm64
121+
122+
steps:
123+
- name: clone
124+
image: alpine/git
125+
pull: if-not-exists
126+
environment:
127+
http_proxy:
128+
from_secret: PROXY
129+
https_proxy:
130+
from_secret: PROXY
131+
commands:
132+
- git config --global core.compression 0
133+
- git clone https://github.com/dataelement/bisheng-unstructured.git .
134+
- git checkout $DRONE_COMMIT
135+
136+
137+
138+
- name: build_docker_release
139+
pull: if-not-exists
140+
image: docker:24.0.6
141+
privileged: true
142+
volumes: # 将容器内目录挂载到宿主机,仓库需要开启Trusted设置
143+
- name: apt-cache
144+
path: /var/cache/apt/archives # apt package cache directory, backed by the apt-cache host volume
145+
- name: socket
146+
path: /var/run/docker.sock
147+
environment:
148+
http_proxy:
149+
from_secret: PROXY
150+
https_proxy:
151+
from_secret: PROXY
152+
no_proxy: 192.168.106.8
153+
version: release
154+
docker_repo: 192.168.106.8:6082/dataelement/bisheng-unstructured-arm
155+
docker_registry: http://192.168.106.8:6082
156+
docker_user:
157+
from_secret: NEXUS_USER
158+
docker_password:
159+
from_secret: NEXUS_PASSWORD
160+
commands:
161+
- docker login -u $docker_user -p $docker_password $docker_registry
162+
- docker buildx build --push -t $docker_repo:$version -f ./docker/Dockerfile-arm .
163+
when:
164+
status:
165+
- success
166+
branch:
167+
- release
168+
event:
169+
- push
170+
171+
172+
- name: build_docker
173+
pull: if-not-exists
174+
image: docker:24.0.6
175+
privileged: true
176+
volumes: # 将容器内目录挂载到宿主机,仓库需要开启Trusted设置
177+
- name: apt-cache
178+
path: /var/cache/apt/archives # apt package cache directory, backed by the apt-cache host volume
179+
- name: socket
180+
path: /var/run/docker.sock
181+
environment:
182+
http_proxy:
183+
from_secret: PROXY
184+
https_proxy:
185+
from_secret: PROXY
186+
no_proxy: 192.168.106.8,192.168.106.8
187+
version: ${DRONE_TAG}
188+
docker_repo: dataelement/bisheng-unstructured-arm
189+
docker_user:
190+
from_secret: DOCKER_USER
191+
docker_password:
192+
from_secret: DOCKER_PASSWORD
193+
cr_user:
194+
from_secret: CR_USER
195+
cr_password:
196+
from_secret: CR_PASSWORD
197+
cr_repo_host: cr.dataelem.com
198+
commands:
199+
- docker login -u $cr_user -p $cr_password $cr_repo_host # 登录官方镜像源
200+
- docker login -u $docker_user -p $docker_password # 登录私有镜像源
201+
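# Build the arm64 image and tag it for the default registry and for $cr_repo_host (--load only; the push commands below are commented out)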
202+
- docker buildx build --load -t $docker_repo:$version -t $docker_repo:latest -t $cr_repo_host/$docker_repo:$version -t $cr_repo_host/$docker_repo:latest -f ./docker/Dockerfile-arm .
203+
#- docker push $docker_repo:$version
204+
# - docker push $cr_repo_host/$docker_repo:$version
205+
# - docker push $docker_repo:latest
206+
# - docker push $cr_repo_host/$docker_repo:latest
207+
when:
208+
status:
209+
- success
210+
ref:
211+
- refs/tags/v*
212+
213+
volumes:
214+
- name: bisheng-cache
215+
host:
216+
path: /opt/drone/data/bisheng/
217+
- name: apt-cache
218+
host:
219+
path: /opt/drone/data/bisheng/apt/
220+
- name: socket
221+
host:
222+
path: /var/run/docker.sock

docker/Dockerfile-arm

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
FROM uns-armv8-ubuntu-20-04:v3
2+
LABEL org.opencontainers.image.authors="Dataelem inc."
3+
4+
ARG BISHENG_UNS_VER=0.0.2
5+
6+
RUN cat /etc/apt/sources.list
7+
8+
# Copy bins and configs
9+
RUN mkdir -p /opt/bisheng-unstructured/bin
10+
COPY ./docker/entrypoint-arm.sh /opt/bisheng-unstructured/bin/
11+
COPY config /opt/bisheng-unstructured/
12+
13+
14+
WORKDIR /opt/bisheng-unstructured
15+
16+
# Copy source code
17+
COPY ./src/ /opt/bisheng-unstructured/
18+
COPY ./requirements.txt /opt/bisheng-unstructured/
19+
20+
# install requirements
21+
RUN python3 -m pip install --upgrade pip
22+
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
23+
24+
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && rm -rf /root/.cache/pip
25+
26+
CMD ["bash", "bin/entrypoint-arm.sh"]

docker/entrypoint-arm.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#!/bin/bash
2+
3+
4+
export PATH=/usr/local/texlive/2024/bin/aarch64-linux:$PATH
5+
export MANPATH=/usr/local/texlive/2024/texmf-dist/doc/man:$MANPATH
6+
export INFOPATH=/usr/local/texlive/2024/texmf-dist/doc/info:$INFOPATH
7+
8+
uvicorn --host 0.0.0.0 --port 10001 --workers 8 bisheng_unstructured.api.main:app

src/bisheng_unstructured/documents/pdf_parser/pdf.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -582,6 +582,7 @@ def _allocate_semantic(
582582
TEXT_ID = 4
583583
TABLE_ID = 5
584584
IMAGE_ID = 2
585+
LAYOUT_ID = 6
585586
FORMULA_ID = 1000
586587

587588
timer = Timer()
@@ -925,6 +926,10 @@ def _allocate_semantic(
925926
if label == TABLE_ID:
926927
filtered_blocks.append(b)
927928

929+
# elif label == LAYOUT_ID:
930+
# b.block_text = join_lines(b.ts, False, lang)
931+
# filtered_blocks.append(b)
932+
928933
elif label == IMAGE_ID:
929934
if self.keep_text_in_image:
930935
b.block_text = join_lines(b.ts, False, lang)

src/bisheng_unstructured/topdf/docx2pdf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def run(cmd):
7474
stderr=subprocess.PIPE,
7575
stdout=subprocess.PIPE,
7676
)
77-
exit_code = p.wait(timeout=30)
77+
exit_code = p.wait(timeout=300)
7878
if exit_code != 0:
7979
stdout, stderr = p.communicate()
8080
raise Exception(

src/bisheng_unstructured/topdf/text2pdf.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,11 @@ def __init__(self, kwargs={}):
9393
--lua-filter=/opt/pandoc/unnested-table.lua
9494
--template /opt/pandoc/pandoc-3.1.9/share/templates/default.latex
9595
{0}
96-
-V mainfont="Alibaba PuHuiTi"
97-
-V sansfont="Alibaba PuHuiTi"
96+
-V mainfont="Alibaba PuHuiTi 3.0"
97+
-V sansfont="Alibaba PuHuiTi 3.0"
9898
-V monofont="Adobe Heiti Std"
99-
-V CJKmainfont="Alibaba PuHuiTi"
100-
-V CJKsansfont="Alibaba PuHuiTi"
99+
-V CJKmainfont="Alibaba PuHuiTi 3.0"
100+
-V CJKsansfont="Alibaba PuHuiTi 3.0"
101101
-V CJKmonofont="Adobe Heiti Std"
102102
"""
103103

@@ -128,7 +128,7 @@ def run(cmd: str, timeout: int = 30):
128128
stderr=subprocess.PIPE,
129129
stdout=subprocess.PIPE,
130130
)
131-
exit_code = p.wait(timeout=timeout)
131+
exit_code = p.wait(timeout=300)
132132
if exit_code != 0:
133133
stdout, stderr = p.communicate()
134134
raise Exception(

0 commit comments

Comments
 (0)