Skip to content

Commit fc0e7e1

Browse files
authored
add multiprocessing (#7019)
1 parent 83e0b47 commit fc0e7e1

File tree

2 files changed

+41
-10
lines changed

2 files changed

+41
-10
lines changed

pipelines/examples/chatpaper/chat_paper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,15 +306,15 @@ def upload_file(file_name, file_url, file_upload, history=[]):
306306
tim = time.time()
307307
image_path = os.path.join(root_path, str(tim))
308308
os.makedirs(image_path, exist_ok=True)
309-
imgs = pdf2image(pdfPath=file_name, imgPath=image_path)
309+
imgs = pdf2image(pdfPath=file_name, imgPath=image_path, number_process_page=args.number_process_page)
310310
elif file_upload:
311311
file_name = file_upload.name
312312
real_filename = os.path.split(file_name)[-1]
313313
root_path = os.path.dirname(file_name)
314314
tim = time.time()
315315
image_path = os.path.join(root_path, str(tim))
316316
os.makedirs(image_path, exist_ok=True)
317-
imgs = pdf2image(pdfPath=file_name, imgPath=image_path)
317+
imgs = pdf2image(pdfPath=file_name, imgPath=image_path, number_process_page=args.number_process_page)
318318
# 上传到bos后到文件是否需要删除
319319
filename_in_bos = real_filename
320320
url = eb.utils.upload_file_to_bos(

pipelines/examples/chatpaper/utils.py

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# limitations under the License.
1414
import argparse
1515
import logging
16-
import time
16+
import os
1717
from typing import Optional
1818

1919
import fitz
@@ -24,6 +24,10 @@
2424
from pipelines.pipelines import Pipeline
2525

2626
logging.getLogger().setLevel(logging.INFO)
27+
import time
28+
from functools import partial
29+
from multiprocessing import Pool
30+
2731
from pipelines.nodes import ErnieBot
2832
from pipelines.nodes.combine_documents import (
2933
MapReduceDocuments,
@@ -45,19 +49,43 @@ def load_all_json_path(path):
4549
return json_path
4650

4751

48-
def pdf2image(pdfPath, imgPath, zoom_x=10, zoom_y=10, rotation_angle=0):
52+
def pdf2image_index(start, end, pdfPath, imgPath, zoom_x=10, zoom_y=10, rotation_angle=0):
    """
    Render the pages ``start`` .. ``end - 1`` of a PDF to PNG files.

    Each page is written to ``<imgPath>/<page index>.png``. The document is opened
    from ``pdfPath`` inside this function and closed before returning.

    Args:
        start: Index of the first page to render (inclusive).
        end: Index at which rendering stops (exclusive).
        pdfPath: Path of the PDF file to open.
        imgPath: Directory the PNG files are written to; it is not created here.
        zoom_x: Horizontal factor passed to ``fitz.Matrix``.
        zoom_y: Vertical factor passed to ``fitz.Matrix``.
        rotation_angle: Angle passed to ``Matrix.prerotate``.

    Returns:
        A list with one ``(png_path, "page:<index>")`` tuple per rendered page,
        in ascending page order.
    """
    # The transform is identical for every page, so build it once.
    trans = fitz.Matrix(zoom_x, zoom_y).prerotate(rotation_angle)
    image_path = []
    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdfPath) as pdf:
        for index in range(start, end):
            png_path = f"{imgPath}/{index}.png"
            pm = pdf[index].get_pixmap(matrix=trans, alpha=False)
            # Public replacement for the private ``_writeIMG(..., format=1)``;
            # the PNG format is inferred from the ".png" extension.
            pm.save(png_path)
            image_path.append((png_path, f"page:{index}"))
    return image_path
62+
63+
64+
def pdf2image(pdfPath, imgPath, zoom_x=10, zoom_y=10, rotation_angle=0, number_process_page=5):
    """
    Convert every page of a PDF to a PNG image using a pool of worker processes.

    The pages are split into consecutive chunks of ``number_process_page`` pages
    (the last chunk may be shorter) and each chunk is rendered by
    ``pdf2image_index`` in a ``multiprocessing.Pool``.

    Args:
        pdfPath: Path of the PDF file to convert.
        imgPath: Directory the PNG files are written to.
        zoom_x: Horizontal zoom factor forwarded to ``pdf2image_index``.
        zoom_y: Vertical zoom factor forwarded to ``pdf2image_index``.
        rotation_angle: Rotation angle forwarded to ``pdf2image_index``.
        number_process_page: Number of pages handed to a worker per chunk;
            must be at least 1.

    Returns:
        A list of ``(png_path, "page:<index>")`` tuples in page order; an empty
        list when the document has no pages.

    Raises:
        ValueError: If ``number_process_page`` is smaller than 1.
    """
    # Validate first: 0 would divide by zero and negatives would yield no chunks.
    if number_process_page < 1:
        raise ValueError(f"number_process_page must be >= 1, got {number_process_page}")

    # Only the page count is needed here; each worker reopens the file itself,
    # so close the document before any worker process is started.
    with fitz.open(pdfPath) as pdf:
        page_count = pdf.page_count
    if page_count == 0:
        # Nothing to render; also avoids creating a pool with zero processes.
        return []

    # Ceiling division: number of chunks needed to cover all pages.
    chunk_count = -(-page_count // number_process_page)
    # os.cpu_count() may return None when the CPU count cannot be determined.
    number_process = min(chunk_count, os.cpu_count() or 1)
    logging.info("pdf2image: rendering %d pages with %d processes", page_count, number_process)

    # Chunk boundaries: [0, n, 2n, ..., page_count].
    index_list = list(range(0, page_count, number_process_page))
    index_list.append(page_count)
    func = partial(
        pdf2image_index, pdfPath=pdfPath, imgPath=imgPath, zoom_x=zoom_x, zoom_y=zoom_y, rotation_angle=rotation_angle
    )
    # starmap blocks until every chunk is done and keeps results in chunk order;
    # the context manager tears the pool down even if a worker raises.
    with Pool(processes=number_process) as pool:
        result = pool.starmap(func, list(zip(index_list, index_list[1:])))

    return [item for chunk in result for item in chunk]
6290

6391

@@ -360,5 +388,8 @@ def get_parse_args():
360388
)
361389
parser.add_argument("--serving_name", default="0.0.0.0", help="Serving ip.")
362390
parser.add_argument("--serving_port", default=8099, type=int, help="Serving port.")
391+
parser.add_argument(
392+
"--number_process_page", default=5, type=int, help="the number of PDF pages processed per process"
393+
)
363394
args = parser.parse_args()
364395
return args

0 commit comments

Comments
 (0)