13
13
# limitations under the License.
14
14
import argparse
15
15
import logging
16
- import time
16
+ import os
17
17
from typing import Optional
18
18
19
19
import fitz
24
24
from pipelines .pipelines import Pipeline
25
25
26
26
logging .getLogger ().setLevel (logging .INFO )
27
+ import time
28
+ from functools import partial
29
+ from multiprocessing import Pool
30
+
27
31
from pipelines .nodes import ErnieBot
28
32
from pipelines .nodes .combine_documents import (
29
33
MapReduceDocuments ,
@@ -45,19 +49,43 @@ def load_all_json_path(path):
45
49
return json_path
46
50
47
51
48
- def pdf2image (pdfPath , imgPath , zoom_x = 10 , zoom_y = 10 , rotation_angle = 0 ):
52
def pdf2image_index(start, end, pdfPath, imgPath, zoom_x=10, zoom_y=10, rotation_angle=0):
    """
    Render the pages ``start`` .. ``end - 1`` of a PDF to PNG files.

    The document is opened inside this function, so only plain picklable
    arguments are needed to call it.

    Args:
        start: Index of the first page to render (inclusive).
        end: Index one past the last page to render (exclusive).
        pdfPath: Path of the PDF file to read.
        imgPath: Directory the images are written to, as ``<imgPath>/<index>.png``.
        zoom_x: Horizontal zoom factor passed to ``fitz.Matrix``.
        zoom_y: Vertical zoom factor passed to ``fitz.Matrix``.
        rotation_angle: Angle passed to ``Matrix.prerotate``.

    Returns:
        A list of ``(image_file_path, "page:<index>")`` tuples in page order.
    """
    # The transform does not depend on the page, so build it once.
    trans = fitz.Matrix(zoom_x, zoom_y).prerotate(rotation_angle)
    image_path = []
    pdf = fitz.open(pdfPath)
    try:
        for index in range(start, end):
            png_path = imgPath + "/" + str(index) + ".png"
            pm = pdf[index].get_pixmap(matrix=trans, alpha=False)
            # NOTE(review): `_writeIMG` is a private PyMuPDF call and format=1 is
            # presumably PNG -- confirm before switching to the public Pixmap.save().
            pm._writeIMG(png_path, format=1)
            image_path.append((png_path, "page:" + str(index)))
    finally:
        # Always release the document handle, even if rendering a page fails.
        pdf.close()
    return image_path
62
+
63
+
64
def pdf2image(pdfPath, imgPath, zoom_x=10, zoom_y=10, rotation_angle=0, number_process_page=5):
    """
    Convert PDF to Image

    The pages are split into consecutive chunks of ``number_process_page``
    pages and each chunk is rendered by ``pdf2image_index`` in a worker
    process of a ``multiprocessing.Pool``.

    Args:
        pdfPath: Path of the PDF file to convert.
        imgPath: Directory the page images are written to.
        zoom_x: Horizontal zoom factor forwarded to ``pdf2image_index``.
        zoom_y: Vertical zoom factor forwarded to ``pdf2image_index``.
        rotation_angle: Rotation angle forwarded to ``pdf2image_index``.
        number_process_page: Number of pages handled by one task; must be >= 1.

    Returns:
        A list of ``(image_file_path, "page:<index>")`` tuples in page order;
        an empty list when the document has no pages.

    Raises:
        ValueError: If ``number_process_page`` is smaller than 1.
    """
    if number_process_page < 1:
        raise ValueError("number_process_page must be a positive integer")

    # Only the page count is needed in the parent; every worker reopens the
    # file itself, so release the handle before the pool is started.
    pdf = fitz.open(pdfPath)
    try:
        page_count = pdf.page_count
    finally:
        pdf.close()
    if page_count == 0:
        # Nothing to render; a pool with zero processes would raise.
        return []

    # Ceiling division: number of page chunks, capped by the available CPUs.
    number_chunks = (page_count + number_process_page - 1) // number_process_page
    # os.cpu_count() may return None when the count cannot be determined.
    number_process = min(number_chunks, os.cpu_count() or 1)
    logging.info("pdf2image: rendering %d pages with %d processes", page_count, number_process)

    # Chunk boundaries: [0, n, 2n, ..., page_count].
    index_list = list(range(0, page_count, number_process_page))
    index_list.append(page_count)

    func = partial(
        pdf2image_index, pdfPath=pdfPath, imgPath=imgPath, zoom_x=zoom_x, zoom_y=zoom_y, rotation_angle=rotation_angle
    )
    # The context manager tears the pool down even if a worker raises;
    # starmap blocks until every chunk has been rendered.
    with Pool(processes=number_process) as pool:
        result = pool.starmap(func, list(zip(index_list, index_list[1:])))

    # starmap keeps task order, so flattening yields the pages in order.
    return [entry for chunk in result for entry in chunk]
62
90
63
91
@@ -360,5 +388,8 @@ def get_parse_args():
360
388
)
361
389
parser .add_argument ("--serving_name" , default = "0.0.0.0" , help = "Serving ip." )
362
390
parser .add_argument ("--serving_port" , default = 8099 , type = int , help = "Serving port." )
391
+ parser .add_argument (
392
+ "--number_process_page" , default = 5 , type = int , help = "the number of PDF pages processed per process"
393
+ )
363
394
args = parser .parse_args ()
364
395
return args
0 commit comments