99from dataclasses import dataclass
1010from typing import Any , List , Optional , Union
1111
12+ from pdf2image import convert_from_path ,convert_from_bytes
1213import fitz as pymupdf
1314import numpy as np
14- import pypdfium2
15+ # import pypdfium2
1516from loguru import logger
1617from PIL import Image , ImageOps
1718from shapely import Polygon
@@ -1217,7 +1218,8 @@ def _task(textpage_info, bytes_img, img, is_scan, lang, rot_matirx, page_index:
12171218
12181219 with blob .as_bytes_io () as file_path :
12191220 fitz_doc = pymupdf .open (file_path )
1220- pdf_doc = pypdfium2 .PdfDocument (file_path , autoclose = True )
1221+ #pdf_doc = pypdfium2.PdfDocument(file_path, autoclose=True)
1222+ pdf_doc = convert_from_bytes (file_path .read (), dpi = 72 )
12211223 max_page = fitz_doc .page_count - start
12221224 n = self .n if self .n else max_page
12231225 n = min (n , max_page )
@@ -1229,13 +1231,20 @@ def _task(textpage_info, bytes_img, img, is_scan, lang, rot_matirx, page_index:
12291231 bytes_imgs = []
12301232 page_imgs = []
12311233 for idx in range (start , start + n ):
1232- page = pdf_doc .get_page (idx )
1233- pil_image = page .render ().to_pil ()
1234- page_imgs .append (pil_image )
1234+ #page = pdf_doc.get_page(idx)
1235+ #pil_image = page.render().to_pil()
1236+ #page_imgs.append(pil_image)
1237+ #img_byte_arr = io.BytesIO()
1238+ #pil_image.save(img_byte_arr, format="PNG")
1239+ #bytes_img = img_byte_arr.getvalue()
1240+ #bytes_imgs.append(bytes_img)
1241+ page = pdf_doc [idx ]
12351242 img_byte_arr = io .BytesIO ()
1236- pil_image .save (img_byte_arr , format = "PNG" )
1237- bytes_img = img_byte_arr .getvalue ()
1238- bytes_imgs .append (bytes_img )
1243+ page .save (img_byte_arr , format = 'PNG' )
1244+ img_byte_arr = img_byte_arr .getvalue ()
1245+ bytes_imgs .append (img_byte_arr )
1246+ page_imgs .append (page )
1247+
12391248
12401249 timer .toc ()
12411250 logger .info ("pdfium render image size={} time={}" , len (page_imgs ), timer .get ())
0 commit comments