Skip to content

Commit 313be62

Browse files
committed
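Fixed bugs: skip tiny images (area <= 4), skip hidden-text filtering when get_texttrace() raises SystemError, and keep 1x1 stream tables nested in a cell when they have a background color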
2 parents 6186db9 + b2b0f49 commit 313be62

File tree

3 files changed

+17
-3
lines changed

3 files changed

+17
-3
lines changed

pdf2docx/image/ImagesExtractor.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,9 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
108108
rects = self._page.get_image_rects(item)
109109
unrotated_page_bbox = self._page.cropbox # note the difference to page.rect
110110
for bbox in rects:
111+
# ignore small images
112+
if bbox.get_area()<=4: continue
113+
111114
# ignore images outside page
112115
if not unrotated_page_bbox.intersects(bbox): continue
113116

pdf2docx/page/RawPageFitz.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
A wrapper of PyMuPDF Page as page engine.
55
'''
66

7+
import logging
78
from .RawPage import RawPage
89
from ..image.ImagesExtractor import ImagesExtractor
910
from ..shape.Paths import Paths
@@ -61,13 +62,22 @@ def _preprocess_text(self, **settings):
6162
raw = self.page_engine.get_text('rawdict', flags=64)
6263
text_blocks = raw.get('blocks', [])
6364

65+
# potential UnicodeDecodeError issue when trying to filter hidden text:
66+
# https://github.com/dothinking/pdf2docx/issues/144
67+
# https://github.com/dothinking/pdf2docx/issues/155
68+
try:
69+
spans = self.page_engine.get_texttrace()
70+
except SystemError:
71+
logging.warning('Ignore hidden text checking due to UnicodeDecodeError in upstream library.')
72+
spans = []
73+
74+
if not spans: return text_blocks
75+
6476
# ignore hidden text if ocr=0, while extract only hidden text if ocr=2
6577
if ocr==2:
6678
f = lambda span: span['type']!=3 # find displayed text and ignore it
6779
else:
6880
f = lambda span: span['type']==3 # find hidden text and ignore it
69-
70-
spans = self.page_engine.get_texttrace()
7181
filtered_spans = list(filter(f, spans))
7282

7383
def span_area(bbox):

pdf2docx/table/TablesConstructor.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -195,7 +195,8 @@ def top_bottom_boundaries(y0, y1):
195195

196196
# Attention: avoid further infinite stream table detection.
197197
# Generally, a 1x1 stream table nested in a table cell is of no use
198-
if isinstance(self._parent, Cell) and table.num_cols*table.num_rows==1:
198+
if isinstance(self._parent, Cell) and \
199+
table.num_cols*table.num_rows==1 and table[0][0].bg_color is None:
199200
continue
200201

201202
table.set_stream_table_block()

0 commit comments

Comments
 (0)