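Fix bugs: skip tiny images, tolerate get_texttrace() failures, keep 1x1 nested stream tables that have a background color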

dothinking · dothinking · commit 313be6223798 · 2022-08-11T10:46:13.000+08:00
diff --git a/pdf2docx/image/ImagesExtractor.py b/pdf2docx/image/ImagesExtractor.py
@@ -108,6 +108,9 @@ def extract_images(self, clip_image_res_ratio:float=3.0):
             rects = self._page.get_image_rects(item)
             unrotated_page_bbox = self._page.cropbox # note the difference to page.rect
             for bbox in rects:
+                # ignore tiny images whose bounding-box area is at most 4
+                if bbox.get_area()<=4: continue
+
                 # ignore images outside page
                 if not unrotated_page_bbox.intersects(bbox): continue
 
diff --git a/pdf2docx/page/RawPageFitz.py b/pdf2docx/page/RawPageFitz.py
@@ -4,6 +4,7 @@
 A wrapper of PyMuPDF Page as page engine.
 '''
 
+import logging
 from .RawPage import RawPage
 from ..image.ImagesExtractor import ImagesExtractor
 from ..shape.Paths import Paths
@@ -61,13 +62,22 @@ def _preprocess_text(self, **settings):
         raw = self.page_engine.get_text('rawdict', flags=64)
         text_blocks = raw.get('blocks', [])
 
+        # get_texttrace() may fail when collecting spans for hidden-text filtering (reported as UnicodeDecodeError, caught here as SystemError):
+        # https://github.com/dothinking/pdf2docx/issues/144
+        # https://github.com/dothinking/pdf2docx/issues/155
+        try:
+            spans = self.page_engine.get_texttrace()
+        except SystemError:
+            logging.warning('Ignore hidden text checking due to UnicodeDecodeError in upstream library.')
+            spans = []
+        
+        if not spans: return text_blocks
+
         # ignore hidden text if ocr=0, while extract only hidden text if ocr=2
         if ocr==2:
             f = lambda span: span['type']!=3  # find displayed text and ignore it
         else:
             f = lambda span: span['type']==3  # find hidden text and ignore it
-
-        spans = self.page_engine.get_texttrace()
         filtered_spans = list(filter(f, spans))
         
         def span_area(bbox):
diff --git a/pdf2docx/table/TablesConstructor.py b/pdf2docx/table/TablesConstructor.py
@@ -195,7 +195,8 @@ def top_bottom_boundaries(y0, y1):
 
             # Attention: avoid further infinite stream table detection.
             # Generally, a 1x1 stream table nested in a table cell is of no use
-            if isinstance(self._parent, Cell) and table.num_cols*table.num_rows==1:
+            if isinstance(self._parent, Cell) and \
+                table.num_cols*table.num_rows==1 and table[0][0].bg_color is None:
                 continue
 
             table.set_stream_table_block()