Skip to content

Commit e1051f3

Browse files
authored
Merge pull request #75 from dataelement/hotfix/pdfparse
Hotfix/pdfparse
2 parents 241dc1c + a0d244f commit e1051f3

File tree

10 files changed

+94
-28
lines changed

10 files changed

+94
-28
lines changed

.drone.yml

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -142,6 +142,8 @@ steps:
142142
volumes: # 将容器内目录挂载到宿主机,仓库需要开启Trusted设置
143143
- name: apt-cache
144144
path: /var/cache/apt/archives # 将应用打包好的Jar和执行脚本挂载出来
145+
- name: apt-cache
146+
path: /root/.cache/pip/
145147
- name: socket
146148
path: /var/run/docker.sock
147149
environment:
@@ -153,13 +155,19 @@ steps:
153155
version: release
154156
docker_repo: 192.168.106.8:6082/dataelement/bisheng-unstructured-arm
155157
docker_registry: http://192.168.106.8:6082
158+
cr_user:
159+
from_secret: CR_USER
160+
cr_password:
161+
from_secret: CR_PASSWORD
162+
cr_repo_host: cr.dataelem.com
156163
docker_user:
157164
from_secret: NEXUS_USER
158165
docker_password:
159166
from_secret: NEXUS_PASSWORD
160167
commands:
161168
- docker login -u $docker_user -p $docker_password $docker_registry
162-
- docker buildx build --push -t $docker_repo:$version -f ./docker/Dockerfile-arm .
169+
- docker login -u $cr_user -p $cr_password $cr_repo_host # 登录官方镜像源
170+
- docker buildx build --push -t $cr_repo_host/dataelement/bisheng-unstructured-arm:$version -t $docker_repo:$version -f ./docker/Dockerfile-arm .
163171
when:
164172
status:
165173
- success

docker/Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ RUN sh -c 'echo "deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main res
1818
RUN cat /etc/apt/sources.list
1919
# Install Poetry
2020
RUN apt-get update && apt-get install gcc g++ curl build-essential postgresql-server-dev-all -y
21-
RUN apt-get update && apt-get install procps -y
21+
RUN apt-get update && apt-get install procps poppler-utils -y
2222
# opencv
2323
RUN apt-get install -y libglib2.0-0 libsm6 libxrender1 libxext6 libgl1
2424
RUN curl -sSL https://install.python-poetry.org | python3 -

docker/Dockerfile-arm

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ LABEL org.opencontainers.image.authors="Dataelem inc."
44
ARG BISHENG_UNS_VER=0.0.2
55

66
RUN cat /etc/apt/sources.list
7+
RUN apt update && apt-get install poppler-utils -y
78

89
# Copy bins and configs
910
RUN mkdir -p /opt/bisheng-unstructured/bin

requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,7 @@ pdf2image==1.16.3
4545
pdfminer-six==20221105
4646
pdfplumber==0.10.2
4747
wheel==0.41.0
48-
pypdfium2==4.23.1
48+
#pypdfium2==4.23.1
4949
pypdf==4.3.0
5050
PyMuPDF==1.23.8
5151
opencv-python==4.8.0.76
@@ -77,4 +77,4 @@ fastapi
7777
orjson
7878

7979
# client
80-
tritonclient[http]==2.41.0
80+
tritonclient[http]==2.41.0

src/bisheng_unstructured/documents/pdf_parser/idp/pdf.py

Lines changed: 17 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,8 @@
1010

1111
import fitz as pymupdf
1212
import numpy as np
13-
import pypdfium2
13+
#import pypdfium2
14+
from pdf2image import convert_from_path,convert_from_bytes
1415
from PIL import Image, ImageOps
1516

1617
from bisheng_unstructured.common import Timer
@@ -354,7 +355,8 @@ def _task(textpage_info,bytes_img, img, is_scan, lang, rot_matirx):
354355

355356
with blob.as_bytes_io() as file_path:
356357
fitz_doc = pymupdf.open(file_path)
357-
pdf_doc = pypdfium2.PdfDocument(file_path, autoclose=True)
358+
#pdf_doc = pypdfium2.PdfDocument(file_path, autoclose=True)
359+
pdf_doc = convert_from_bytes(file_path.read(), dpi=72)
358360
max_page = fitz_doc.page_count - start
359361
n = self.n if self.n else max_page
360362
n = min(n, max_page)
@@ -380,13 +382,20 @@ def _task(textpage_info,bytes_img, img, is_scan, lang, rot_matirx):
380382
bytes_imgs = []
381383
page_imgs = []
382384
for idx in range(start, start + n):
383-
page = pdf_doc.get_page(idx)
384-
pil_image = page.render().to_pil()
385-
page_imgs.append(pil_image)
385+
#page = pdf_doc.get_page(idx)
386+
#pil_image = page.render().to_pil()
387+
#page_imgs.append(pil_image)
388+
#img_byte_arr = io.BytesIO()
389+
#pil_image.save(img_byte_arr, format="PNG")
390+
#bytes_img = img_byte_arr.getvalue()
391+
#bytes_imgs.append(bytes_img)
392+
page = pdf_doc[idx]
386393
img_byte_arr = io.BytesIO()
387-
pil_image.save(img_byte_arr, format="PNG")
388-
bytes_img = img_byte_arr.getvalue()
389-
bytes_imgs.append(bytes_img)
394+
page.save(img_byte_arr, format='PNG')
395+
img_byte_arr = img_byte_arr.getvalue()
396+
bytes_imgs.append(img_byte_arr)
397+
page_imgs.append(page)
398+
390399

391400
timer.toc()
392401
print("pdfium render image", timer.get())

src/bisheng_unstructured/documents/pdf_parser/pdf.py

Lines changed: 17 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -9,9 +9,10 @@
99
from dataclasses import dataclass
1010
from typing import Any, List, Optional, Union
1111

12+
from pdf2image import convert_from_path,convert_from_bytes
1213
import fitz as pymupdf
1314
import numpy as np
14-
import pypdfium2
15+
#import pypdfium2
1516
from loguru import logger
1617
from PIL import Image, ImageOps
1718
from shapely import Polygon
@@ -1217,7 +1218,8 @@ def _task(textpage_info, bytes_img, img, is_scan, lang, rot_matirx, page_index:
12171218

12181219
with blob.as_bytes_io() as file_path:
12191220
fitz_doc = pymupdf.open(file_path)
1220-
pdf_doc = pypdfium2.PdfDocument(file_path, autoclose=True)
1221+
#pdf_doc = pypdfium2.PdfDocument(file_path, autoclose=True)
1222+
pdf_doc = convert_from_bytes(file_path.read(), dpi=72)
12211223
max_page = fitz_doc.page_count - start
12221224
n = self.n if self.n else max_page
12231225
n = min(n, max_page)
@@ -1229,13 +1231,20 @@ def _task(textpage_info, bytes_img, img, is_scan, lang, rot_matirx, page_index:
12291231
bytes_imgs = []
12301232
page_imgs = []
12311233
for idx in range(start, start + n):
1232-
page = pdf_doc.get_page(idx)
1233-
pil_image = page.render().to_pil()
1234-
page_imgs.append(pil_image)
1234+
#page = pdf_doc.get_page(idx)
1235+
#pil_image = page.render().to_pil()
1236+
#page_imgs.append(pil_image)
1237+
#img_byte_arr = io.BytesIO()
1238+
#pil_image.save(img_byte_arr, format="PNG")
1239+
#bytes_img = img_byte_arr.getvalue()
1240+
#bytes_imgs.append(bytes_img)
1241+
page = pdf_doc[idx]
12351242
img_byte_arr = io.BytesIO()
1236-
pil_image.save(img_byte_arr, format="PNG")
1237-
bytes_img = img_byte_arr.getvalue()
1238-
bytes_imgs.append(bytes_img)
1243+
page.save(img_byte_arr, format='PNG')
1244+
img_byte_arr = img_byte_arr.getvalue()
1245+
bytes_imgs.append(img_byte_arr)
1246+
page_imgs.append(page)
1247+
12391248

12401249
timer.toc()
12411250
logger.info("pdfium render image size={} time={}", len(page_imgs), timer.get())

src/bisheng_unstructured/documents/pdf_parser/test_pdf.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
import cv2
55
import fitz
66
import numpy as np
7-
import pypdfium2
7+
#import pypdfium2
88
from shapely import Polygon
99
from shapely import box as Rect
1010

@@ -259,8 +259,8 @@ def test_vis():
259259
with blob.as_bytes_io() as file_path:
260260
pages = fitz.open(file_path)
261261
print("pages", pages)
262-
pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
263-
image_blobs = get_image_blobs(pages, pdf_reader, n, start)
262+
# pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
263+
# image_blobs = get_image_blobs(pages, pdf_reader, n, start)
264264

265265
assert len(image_blobs) == n
266266

src/bisheng_unstructured/topdf/docx2pdf.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import signal
44
import subprocess
55

6+
from bisheng_unstructured import utils
67
from bisheng_unstructured.partition.common import convert_office_doc
78

89

@@ -18,6 +19,16 @@ def __init__(self, kwargs={}):
1819
-V CJKmonofont="Cascadia Mono"
1920
"""
2021

22+
if utils.get_architecture() == "ARM":
23+
cmd_template = """
24+
pandoc -o {1} --pdf-engine=xelatex {0}
25+
-V mainfont="Alibaba PuHuiTi 3.0"
26+
-V sansfont="Alibaba PuHuiTi 3.0"
27+
-V monofont="Cascadia Mono"
28+
-V CJKmainfont="Alibaba PuHuiTi 3.0"
29+
-V CJKsansfont="Alibaba PuHuiTi 3.0"
30+
-V CJKmonofont="Cascadia Mono"
31+
"""
2132
def _norm_cmd(cmd):
2233
return " ".join([p.strip() for p in cmd.strip().split()])
2334

src/bisheng_unstructured/topdf/text2pdf.py

Lines changed: 23 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
import os
2+
from platform import platform
23
import shutil
34
import signal
45
import subprocess
56
import tempfile
67
from html import parser
78
from typing import Tuple
89

10+
from bisheng_unstructured import utils
911
import lxml.html
1012
import numpy as np
1113
from lxml import etree
@@ -93,14 +95,28 @@ def __init__(self, kwargs={}):
9395
--lua-filter=/opt/pandoc/unnested-table.lua
9496
--template /opt/pandoc/pandoc-3.1.9/share/templates/default.latex
9597
{0}
96-
-V mainfont="Alibaba PuHuiTi 3.0"
97-
-V sansfont="Alibaba PuHuiTi 3.0"
98+
-V mainfont="Alibaba PuHuiTi"
99+
-V sansfont="Alibaba PuHuiTi"
98100
-V monofont="Adobe Heiti Std"
99-
-V CJKmainfont="Alibaba PuHuiTi 3.0"
100-
-V CJKsansfont="Alibaba PuHuiTi 3.0"
101+
-V CJKmainfont="Alibaba PuHuiTi"
102+
-V CJKsansfont="Alibaba PuHuiTi"
101103
-V CJKmonofont="Adobe Heiti Std"
102104
"""
103105

106+
if utils.get_architecture() == "ARM":
107+
cmd_template = """
108+
pandoc -o {1} --pdf-engine=xelatex
109+
--lua-filter=/opt/pandoc/unnested-table.lua
110+
--template /opt/pandoc/pandoc-3.1.9/share/templates/default.latex
111+
{0}
112+
-V mainfont="Alibaba PuHuiTi 3.0"
113+
-V sansfont="Alibaba PuHuiTi 3.0"
114+
-V monofont="Adobe Heiti Std"
115+
-V CJKmainfont="Alibaba PuHuiTi 3.0"
116+
-V CJKsansfont="Alibaba PuHuiTi 3.0"
117+
-V CJKmonofont="Adobe Heiti Std"
118+
"""
119+
104120
cmd_template2 = """
105121
soffice --headless -env:SingleAppInstance=\"false\" -env:UserInstallation=\"file://{1}\" --convert-to pdf --outdir \"{1}\" \"{0}\"
106122
"""
@@ -128,7 +144,9 @@ def run(cmd: str, timeout: int = 30):
128144
stderr=subprocess.PIPE,
129145
stdout=subprocess.PIPE,
130146
)
131-
exit_code = p.wait(timeout=300)
147+
if utils.get_architecture() == "ARM":
148+
timeout = 3000
149+
exit_code = p.wait(timeout=timeout)
132150
if exit_code != 0:
133151
stdout, stderr = p.communicate()
134152
raise Exception(

src/bisheng_unstructured/utils.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
import json
33
from datetime import datetime
44
from functools import wraps
5+
import platform
56
from typing import Dict, List, Optional, Union
67

78
DATE_FORMATS = ("%Y-%m-%d", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%d+%H:%M:%S", "%Y-%m-%dT%H:%M:%S%z")
@@ -11,6 +12,15 @@ def save_as_jsonl(data: List[Dict], filename: str) -> None:
1112
with open(filename, "w+") as output_file:
1213
output_file.writelines(json.dumps(datum) + "\n" for datum in data)
1314

15+
def get_architecture() -> str:
    """Classify the host CPU family as ``"x86"`` or ``"ARM"``.

    The string reported by ``platform.machine()`` is matched
    case-insensitively, so upper-case reports such as ``"ARM64"`` are
    recognised as ARM instead of silently falling through to the default.

    Returns:
        ``"x86"`` when the machine string contains ``x86``, ``i686`` or
        ``i386``; ``"ARM"`` when it contains ``arm`` or ``aarch64``;
        ``"x86"`` for anything else (including an empty string).
    """
    # Normalise case once: platform.machine() is not guaranteed to be
    # lower-case on every operating system.
    machine = platform.machine().lower()

    # x86 markers are checked first, preserving the original precedence.
    if any(tag in machine for tag in ("x86", "i686", "i386")):
        return "x86"
    if "arm" in machine or "aarch64" in machine:
        return "ARM"

    # Unknown architectures keep the historical default.
    return "x86"
23+
1424

1525
def read_from_jsonl(filename: str) -> List[Dict]:
1626
with open(filename) as input_file:

0 commit comments

Comments
 (0)