File tree (Expand file tree / Collapse file tree): 3 files changed, +17 -3 lines changed
(Expand file tree / Collapse file tree) 3 files changed, +17 -3 lines changed
Original file line number | Diff line number | Diff line change
@@ -108,6 +108,9 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
108108 rects = self ._page .get_image_rects (item )
109109 unrotated_page_bbox = self ._page .cropbox # note the difference to page.rect
110110 for bbox in rects :
111+ # ignore small images
112+ if bbox .get_area ()<= 4 : continue
113+
111114 # ignore images outside page
112115 if not unrotated_page_bbox .intersects (bbox ): continue
113116
Original file line number | Diff line number | Diff line change
44A wrapper of PyMuPDF Page as page engine.
55'''
66
7+ import logging
78from .RawPage import RawPage
89from ..image .ImagesExtractor import ImagesExtractor
910from ..shape .Paths import Paths
@@ -61,13 +62,22 @@ def _preprocess_text(self, **settings):
6162 raw = self .page_engine .get_text ('rawdict' , flags = 64 )
6263 text_blocks = raw .get ('blocks' , [])
6364
65+ # potential UnicodeDecodeError issue when trying to filter hidden text:
66+ # https://github.com/dothinking/pdf2docx/issues/144
67+ # https://github.com/dothinking/pdf2docx/issues/155
68+ try :
69+ spans = self .page_engine .get_texttrace ()
70+ except SystemError :
71+ logging .warning ('Ignore hidden text checking due to UnicodeDecodeError in upstream library.' )
72+ spans = []
73+
74+ if not spans : return text_blocks
75+
6476 # ignore hidden text if ocr=0, while extract only hidden text if ocr=2
6577 if ocr == 2 :
6678 f = lambda span : span ['type' ]!= 3 # find displayed text and ignore it
6779 else :
6880 f = lambda span : span ['type' ]== 3 # find hidden text and ignore it
69-
70- spans = self .page_engine .get_texttrace ()
7181 filtered_spans = list (filter (f , spans ))
7282
7383 def span_area (bbox ):
Original file line number | Diff line number | Diff line change
@@ -195,7 +195,8 @@ def top_bottom_boundaries(y0, y1):
195195
196196 # Attention: avoid further infinite stream table detection.
197197 # Generally, a 1x1 stream table nested in a table cell is of no use
198- if isinstance (self ._parent , Cell ) and table .num_cols * table .num_rows == 1 :
198+ if isinstance (self ._parent , Cell ) and \
199+ table .num_cols * table .num_rows == 1 and table [0 ][0 ].bg_color is None :
199200 continue
200201
201202 table .set_stream_table_block ()
You can’t perform that action at this time.
0 commit comments