Feat/save embedded images in pdf (#208)

christinestraub · web-flow · commit b9f032c7568b · 2023-09-20T22:09:05.000-07:00
Addresses unstructured issue [#1332](Unstructured-IO/unstructured#1332). This PR will work together with unstructured PR [#1371](Unstructured-IO/unstructured#1371). This PR also addresses `"true" embedded images` issue #215. ### Summary - Add functionality to extract and save images from the page - add the `extract_images` method to the `PageLayout` class - pass parameters related to extracting images from the page - add Python script to evaluate image extraction with various PDF processing libraries - Add functionality to get only "true" embedded images when extracting elements from PDF pages - add functionality to extract image objects (`LTImage`) from a `PDF layout element` parsed by `pdfminer.high_level.extract_pages` - update logic to determine `ImageTextRegion` in `load_pdf()` - Update the `layout visualization` script to be able to show only image elements if needed The following documents can be used for testing and evaluation. - [Captur-1317-5_ENG-p23.pdf](https://utic-dev-tech-fixtures.s3.us-east-2.amazonaws.com/pastebin/Captur-1317-5_ENG-p23.pdf) - [23-BERKSHIRE.pdf](https://utic-dev-tech-fixtures.s3.us-east-2.amazonaws.com/pastebin/23-BERKSHIRE.pdf) - [main.PMC6312790-p1.pdf](https://github.com/Unstructured-IO/unstructured-inference/files/12675967/main.PMC6312790_1-1.pdf) ### Testing ``` from unstructured_inference.inference.layout import DocumentLayout f_path = "sample-docs/embedded-images.pdf" # default image output directory doc = DocumentLayout.from_file( filename=f_path, extract_images_in_pdf=True, ) # specific image output directory doc = DocumentLayout.from_file( filename=f_path, extract_images_in_pdf=True, image_output_dir_path=<directory_path>, ) ``` ### Evaluation ``` // Extracting Images $ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py Captur-1317-5_ENG-p23.pdf unstructured // Layout Visualization $ PYTHONPATH=. 
python examples/layout_analysis/visualization.py Captur-1317-5_ENG-p23.pdf image_only ``` **NOTE:** To reproduce the original results for comparison, you need to replace [the lines](https://github.com/Unstructured-IO/unstructured-inference/blob/feat/save-embedded-images-in-pdf/unstructured_inference/inference/layout.py#L650-L659) with the following code snippet ``` _text, element_class = ( (element.get_text(), EmbeddedTextRegion) if hasattr(element, "get_text") else (None, ImageTextRegion) ) ```
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,9 @@
+## 0.5.31
+
+* Add functionality to extract and save images from the page
+* Add functionality to get only "true" embedded images when extracting elements from PDF pages
+* Update the layout visualization script to be able to show only image elements if needed
+
 ## 0.5.30
 
 * add an evaluation metric for table comparison based on token similarity
diff --git a/examples/image-extraction/README.md b/examples/image-extraction/README.md
@@ -0,0 +1,21 @@
+# Extracting Images
+
+This directory contains examples of how to extract the images embedded in PDFs and save them as separate image files.
+
+## How to run
+
+Run `pip install -r requirements.txt` to install the Python dependencies.
+
+### Extracting Embedded Images
+- Python script (embedded-image-extraction.py)
+```
+ $ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py <file_path> <library>
+```
+The library can be `unstructured`, `pymupdf`, or `pypdf2`. For example,
+```
+$ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py embedded-images.pdf unstructured
+// or
+$ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py embedded-images.pdf pymupdf
+// or
+$ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py embedded-images.pdf pypdf2
+```
diff --git a/examples/image-extraction/embedded-image-extraction.py b/examples/image-extraction/embedded-image-extraction.py
@@ -0,0 +1,94 @@
+import io
+import os.path
+import pathlib
+import sys
+
+import fitz  # PyMuPDF
+from PIL import Image
+from PyPDF2 import PdfReader
+
+from unstructured_inference.inference.layout import DocumentLayout
+
+CUR_DIR = pathlib.Path(__file__).parent.resolve()
+
+
+def print_result(images, page_index):
+    if images:
+        print(f"[+] Found a total of {len(images)} images in page {page_index}")
+    else:
+        print(f"[!] No images found on page {page_index}")
+
+
+def run_with_unstructured(f_path, output_dir_path):
+    doc = DocumentLayout.from_file(
+        filename=f_path,
+        extract_images_in_pdf=True,
+        image_output_dir_path=output_dir_path,
+    )
+
+    for page_index, page in enumerate(doc.pages, start=1):
+        image_elements = [el for el in page.elements if el.type == "Image"]
+        print_result(image_elements, page_index)
+
+
+def run_with_pymupdf(f_path, output_dir_path):
+    doc = fitz.open(f_path)
+    for page_index, page in enumerate(doc, start=1):
+        image_list = page.get_images(full=True)
+        print_result(image_list, page_index)
+
+        for image_index, img in enumerate(image_list, start=1):
+            # Get the XREF of the image
+            xref = img[0]
+            # Extract the image bytes
+            base_image = doc.extract_image(xref)
+            image_bytes = base_image["image"]
+            # Get the image extension
+            image_ext = base_image["ext"]
+            # Load it to PIL
+            image = Image.open(io.BytesIO(image_bytes))
+            output_f_path = os.path.join(output_dir_path, f"image_{page_index}_{image_index}.{image_ext}")
+            image.save(output_f_path)
+
+
+def run_with_pypdf2(f_path, output_dir_path):
+    reader = PdfReader(f_path)
+    for page_index, page in enumerate(reader.pages, start=1):
+        images = page.images
+        print_result(images, page_index)
+
+        for image_file_object in images:
+            output_f_path = os.path.join(output_dir_path, f"figure_{page_index}_{image_file_object.name}")
+            with open(output_f_path, "wb") as fp:
+                fp.write(image_file_object.data)
+
+
+def run(f_path, library):
+    f_basename = os.path.splitext(os.path.basename(f_path))[0]
+    output_dir_path = os.path.join(output_basedir_path, library, f_basename)
+    os.makedirs(output_dir_path, exist_ok=True)
+
+    if library == "unstructured":
+        run_with_unstructured(f_path, output_dir_path)
+    elif library == "pymupdf":
+        run_with_pymupdf(f_path, output_dir_path)
+    elif library == "pypdf2":
+        run_with_pypdf2(f_path, output_dir_path)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 3:
+        print(
+            "Please provide the path to the file name as the first argument and the image "
+            "extraction library as the second argument.",
+        )
+        sys.exit(1)
+
+    if sys.argv[2] not in ["unstructured", "pymupdf", "pypdf2"]:
+        print("Invalid pdf library")
+        sys.exit(1)
+
+    output_basedir_path = os.path.join(CUR_DIR, "output")
+    os.makedirs(output_basedir_path, exist_ok=True)
+
+    run(f_path=sys.argv[1], library=sys.argv[2])
diff --git a/examples/image-extraction/requirements.txt b/examples/image-extraction/requirements.txt
@@ -0,0 +1 @@
+unstructured-inference
diff --git a/examples/layout_analysis/README.md b/examples/layout_analysis/README.md
@@ -9,11 +9,13 @@ Run `pip install -r requirements.txt` to install the Python dependencies.
 ### Visualization
 - Python script (visualization.py)
 ```
-PYTHONPATH=. python examples/layout_analysis/visualization.py <file_path>
+$ PYTHONPATH=. python examples/layout_analysis/visualization.py <file_path> <scope>
 ```
-For example,
+The scope can be `image_only` to show only image elements or `all` to show all elements. For example,
 ```
-PYTHONPATH=. python examples/layout_analysis/visualization.py sample-docs/loremipsum.pdf
+$ PYTHONPATH=. python examples/layout_analysis/visualization.py sample-docs/loremipsum.pdf all
+// or 
+$ PYTHONPATH=. python examples/layout_analysis/visualization.py sample-docs/loremipsum.pdf image_only
 ```
 - Jupyter Notebook (visualization.ipynb)
   - Run `jupyter-notebook` to start.
diff --git a/examples/layout_analysis/visualization.py b/examples/layout_analysis/visualization.py
@@ -2,13 +2,14 @@
 import pathlib
 import sys
 
+from unstructured_inference.inference.elements import ImageTextRegion
 from unstructured_inference.inference.layout import process_file_with_model
 from unstructured_inference.utils import write_image
 
 CUR_DIR = pathlib.Path(__file__).parent.resolve()
 
 
-def run(f_path):
+def run(f_path, scope):
     annotation_data_map = {
         "final": None,
         "extracted": {"layout": {"color": "green", "width": 2}},
@@ -27,21 +28,42 @@ def run(f_path):
     )
 
     for idx, page in enumerate(doc.pages):
+        if scope == "image_only":
+            embedded_image_elements = [
+                el for el in page.layout if isinstance(el, ImageTextRegion)
+            ]
+            inferred_image_elements = [
+                el for el in page.inferred_layout if el.type == "Figure"
+            ]
+            final_image_elements = [el for el in page.elements if el.type == "Image"]
+
+            page.layout = embedded_image_elements
+            page.inferred_layout = inferred_image_elements
+            page.elements = final_image_elements
+
         for action_type, action_value in annotation_data_map.items():
             img = page.annotate(annotation_data=action_value)
             output_f_path = os.path.join(output_dir_path, f"{f_basename}_{idx+1}_{action_type}.jpg")
             write_image(img, output_f_path)
 
+        print(f"page_num: {idx+1} - n_total_elements: {len(page.elements)} - n_extracted_elements: "
+              f"{len(page.layout)} - n_inferred_elements: {len(page.inferred_layout)} - "
+              f"n_ocr_elements: {len(page.ocr_layout)}")
+
 
 if __name__ == '__main__':
-    if len(sys.argv) < 2:
+    if len(sys.argv) < 3:
         print(
-            "Please provide the path to the file name as the first argument and the strategy as the "
+            "Please provide the path to the file name as the first argument and the scope as the "
             "second argument.",
         )
         sys.exit(1)
 
+    if sys.argv[2] not in ["all", "image_only"]:
+        print("Invalid scope")
+        sys.exit(1)
+
     output_basedir_path = os.path.join(CUR_DIR, "output")
     os.makedirs(output_basedir_path, exist_ok=True)
 
-    run(f_path=sys.argv[1])
+    run(f_path=sys.argv[1], scope=sys.argv[2])
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -356,18 +356,21 @@ def points(self):
 class MockPageLayout(layout.PageLayout):
     def __init__(
         self,
+        number=1,
+        image=None,
         layout=None,
         model=None,
         ocr_strategy="auto",
         ocr_languages="eng",
         extract_tables=False,
     ):
-        self.image = None
+        self.image = image
         self.layout = layout
         self.model = model
         self.ocr_strategy = ocr_strategy
         self.ocr_languages = ocr_languages
         self.extract_tables = extract_tables
+        self.number = number
 
     def ocr(self, text_block: MockEmbeddedTextRegion):
         return text_block.ocr_text
@@ -878,6 +881,22 @@ def test_from_image(
         assert mock_detection.called == detection_model_called
 
 
+def test_extract_images(mock_pil_image):
+    page = MockPageLayout(image=mock_pil_image)
+    page.elements = [
+        layoutelement.LayoutElement(1, 1, 10, 10, text=None, type="Image"),
+        layoutelement.LayoutElement(11, 11, 20, 20, text=None, type="Image"),
+    ]
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        page.extract_images(output_dir_path=str(tmpdir))
+
+        for i, el in enumerate(page.elements):
+            expected_image_path = os.path.join(str(tmpdir), f"figure-{page.number}-{i + 1}.jpg")
+            assert os.path.isfile(el.image_path)
+            assert el.image_path == expected_image_path
+
+
 class MockUnstructuredElementExtractionModel(UnstructuredElementExtractionModel):
     def initialize(self, *args, **kwargs):
         return super().initialize(*args, **kwargs)
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.30"  # pragma: no cover
+__version__ = "0.5.31"  # pragma: no cover
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -27,6 +27,7 @@
     merge_inferred_layout_with_ocr_layout,
 )
 from unstructured_inference.inference.ordering import order_layout
+from unstructured_inference.inference.pdf import get_images_from_pdf_element
 from unstructured_inference.logger import logger
 from unstructured_inference.models.base import get_model
 from unstructured_inference.models.detectron2onnx import (
@@ -37,6 +38,7 @@
     UnstructuredObjectDetectionModel,
 )
 from unstructured_inference.patches.pdfminer import parse_keyword
+from unstructured_inference.utils import write_image
 from unstructured_inference.visualize import draw_bbox
 
 # NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
@@ -356,6 +358,33 @@ def get_elements_from_layout(self, layout: List[TextRegion]) -> List[LayoutEleme
         ]
         return elements
 
+    def extract_images(self, output_dir_path: Optional[str] = None):
+        """
+        Extract and save images from the page. This method iterates through the layout elements
+        of the page, identifies image regions, and extracts and saves them as separate image files.
+        """
+
+        if not output_dir_path:
+            output_dir_path = os.path.join(os.getcwd(), "figures")
+        os.makedirs(output_dir_path, exist_ok=True)
+
+        figure_number = 0
+        for el in self.elements:
+            if isinstance(el, LocationlessLayoutElement) or el.type not in ["Image"]:
+                continue
+
+            figure_number += 1
+            try:
+                output_f_path = os.path.join(
+                    output_dir_path,
+                    f"figure-{self.number}-{figure_number}.jpg",
+                )
+                cropped_image = self.image.crop((el.x1, el.y1, el.x2, el.y2))
+                write_image(cropped_image, output_f_path)
+                el.image_path = output_f_path
+            except (ValueError, IOError):
+                logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)
+
     def _get_image_array(self) -> Union[np.ndarray, None]:
         """Converts the raw image into a numpy array."""
         if self.image_array is None:
@@ -439,11 +468,12 @@ def from_image(
         ocr_mode: str = OCRMode.FULL_PAGE.value,
         extract_tables: bool = False,
         fixed_layout: Optional[List[TextRegion]] = None,
-        **kwargs,
+        supplement_with_ocr_elements: bool = True,
+        extract_images_in_pdf: bool = False,
+        image_output_dir_path: Optional[str] = None,
+        analysis: bool = False,
     ):
         """Creates a PageLayout from an already-loaded PIL Image."""
-        analysis = kwargs.get("analysis", False)
-        supplement_with_ocr_elements = kwargs.get("supplement_with_ocr_elements", True)
 
         page = cls(
             number=number,
@@ -474,6 +504,9 @@ def from_image(
         page.image_path = os.path.abspath(image_path) if image_path else None
         page.document_filename = os.path.abspath(document_filename) if document_filename else None
 
+        if extract_images_in_pdf:
+            page.extract_images(image_output_dir_path)
+
         # Clear the image to save memory
         page.image = None
 
@@ -602,21 +635,29 @@ def load_pdf(
 ) -> Tuple[List[List[TextRegion]], Union[List[Image.Image], List[str]]]:
     """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
     pdf pages using pdf2image"""
+
     layouts = []
     for page in extract_pages(filename):
-        layout = []
+        layout: List[TextRegion] = []
         height = page.height
         for element in page:
             x1, y2, x2, y1 = element.bbox
             y1 = height - y1
             y2 = height - y2
             # Coefficient to rescale bounding box to be compatible with images
             coef = dpi / 72
-            _text, element_class = (
-                (element.get_text(), EmbeddedTextRegion)
-                if hasattr(element, "get_text")
-                else (None, ImageTextRegion)
-            )
+
+            if hasattr(element, "get_text"):
+                _text = element.get_text()
+                element_class = EmbeddedTextRegion  # type: ignore
+            else:
+                embedded_images = get_images_from_pdf_element(element)
+                if len(embedded_images) > 0:
+                    _text = None
+                    element_class = ImageTextRegion  # type: ignore
+                else:
+                    continue
+
             text_region = element_class(x1 * coef, y1 * coef, x2 * coef, y2 * coef, text=_text)
 
             if text_region.area() > 0:
diff --git a/unstructured_inference/inference/layoutelement.py b/unstructured_inference/inference/layoutelement.py
@@ -24,6 +24,7 @@
 class LayoutElement(TextRegion):
     type: Optional[str] = None
     prob: Optional[float] = None
+    image_path: Optional[str] = None
 
     def extract_text(
         self,
@@ -98,7 +99,8 @@ def merge_inferred_layout_with_extracted_layout(
     w, h = page_image_size
     full_page_region = Rectangle(0, 0, w, h)
     for extracted_region in extracted_layout:
-        if isinstance(extracted_region, ImageTextRegion):
+        extracted_is_image = isinstance(extracted_region, ImageTextRegion)
+        if extracted_is_image:
             # Skip extracted images for this purpose, we don't have the text from them and they
             # don't provide good text bounding boxes.
 
@@ -122,7 +124,6 @@ def merge_inferred_layout_with_extracted_layout(
                     extracted_region,
                     subregion_threshold=subregion_threshold,
                 )
-                extracted_is_image = isinstance(extracted_region, ImageTextRegion)
                 inferred_is_text = inferred_region.type not in (
                     "Figure",
                     "Image",
diff --git a/unstructured_inference/inference/pdf.py b/unstructured_inference/inference/pdf.py

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.5.30" # pragma: no cover`
	`1`	`+__version__ = "0.5.31" # pragma: no cover`