fix: use temporary instead of fixed directories for storing images of pdfs being processed (#184)

newelh · web-flow · commit 9b6aa8ec1e43 · 2023-08-22T20:55:59.000-07:00
**Summary**

- Stores the page images rendered from a PDF in a temporary directory
instead of a fixed directory created next to the source file
- Removes the `create_image_output_dir` function, which is no longer needed
- Adds hot-loading of the page image when annotating, because images are
no longer stored long-term in a directory
- Adds a `document_filename` keyword argument to `PageLayout` so the page
image can be re-rendered from the source PDF when hot-loading

**Tests**

Removes tests associated with `create_image_output_dir`
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.5.16
+
+* Fix to no longer create a directory for storing processed images
+* Hot-load images for annotation
+
 ## 0.5.15
 
 * Handle an uncaught TesseractError
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -10,7 +10,6 @@
 
 import unstructured_inference.models.base as models
 from unstructured_inference.inference import elements, layout, layoutelement
-from unstructured_inference.inference.layout import create_image_output_dir
 from unstructured_inference.models import chipper, detectron2, tesseract
 from unstructured_inference.models.unstructuredmodel import (
     UnstructuredElementExtractionModel,
@@ -404,10 +403,6 @@ def mock_get_elements(self, *args, **kwargs):
         }
 
         with patch.object(
-            layout,
-            "create_image_output_dir",
-            return_value=tmpdir,
-        ), patch.object(
             layout,
             "load_pdf",
             lambda *args, **kwargs: ([[]], [image_path]),
@@ -416,7 +411,6 @@ def mock_get_elements(self, *args, **kwargs):
             page = doc.pages[0]
             assert page.elements[0] == mock_final_layout
             assert page.image_metadata == image_metadata
-            assert page.image_path == image_path
             assert page.image is None
 
 
@@ -868,26 +862,6 @@ def test_exposed_pdf_image_dpi(pdf_image_dpi, expected, monkeypatch):
         assert mock_from_image.call_args[0][0].height == expected
 
 
-def test_create_image_output_dir():
-    with tempfile.TemporaryDirectory() as tmpdir:
-        tmp_f_path = os.path.join(tmpdir, "loremipsum.pdf")
-        output_dir = create_image_output_dir(tmp_f_path)
-        expected_output_dir = os.path.join(os.path.abspath(tmpdir), "loremipsum_images")
-        assert os.path.isdir(output_dir)
-        assert os.path.isabs(output_dir)
-        assert output_dir == expected_output_dir
-
-
-def test_create_image_output_dir_no_ext():
-    with tempfile.TemporaryDirectory() as tmpdir:
-        tmp_f_path = os.path.join(tmpdir, "loremipsum_no_ext")
-        output_dir = create_image_output_dir(tmp_f_path)
-        expected_output_dir = os.path.join(os.path.abspath(tmpdir), "loremipsum_no_ext_images")
-        assert os.path.isdir(output_dir)
-        assert os.path.isabs(output_dir)
-        assert output_dir == expected_output_dir
-
-
 def test_warning_if_chipper_and_low_dpi(caplog):
     with patch.object(layout.DocumentLayout, "from_file") as mock_from_file, patch.object(
         chipper.UnstructuredChipperModel,
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.15"  # pragma: no cover
+__version__ = "0.5.16"  # pragma: no cover
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -87,44 +87,44 @@ def from_file(
         """Creates a DocumentLayout from a pdf file."""
         logger.info(f"Reading PDF for file: {filename} ...")
 
-        # Store pdf images for later use
-        output_dir = create_image_output_dir(filename)
-        layouts, _image_paths = load_pdf(
-            filename,
-            pdf_image_dpi,
-            output_folder=output_dir,
-            path_only=True,
-        )
-        image_paths = cast(List[str], _image_paths)
-        if len(layouts) > len(image_paths):
-            raise RuntimeError(
-                "Some images were not loaded. "
-                "Check that poppler is installed and in your $PATH.",
+        with tempfile.TemporaryDirectory() as temp_dir:
+            layouts, _image_paths = load_pdf(
+                filename,
+                pdf_image_dpi,
+                output_folder=temp_dir,
+                path_only=True,
             )
-        pages: List[PageLayout] = []
-        if fixed_layouts is None:
-            fixed_layouts = [None for _ in layouts]
-        for i, (image_path, layout, fixed_layout) in enumerate(
-            zip(image_paths, layouts, fixed_layouts),
-        ):
-            # NOTE(robinson) - In the future, maybe we detect the page number and default
-            # to the index if it is not detected
-            with Image.open(image_path) as image:
-                page = PageLayout.from_image(
-                    image,
-                    image_path=image_path,
-                    number=i + 1,
-                    detection_model=detection_model,
-                    element_extraction_model=element_extraction_model,
-                    layout=layout,
-                    ocr_strategy=ocr_strategy,
-                    ocr_languages=ocr_languages,
-                    ocr_mode=ocr_mode,
-                    fixed_layout=fixed_layout,
-                    extract_tables=extract_tables,
+            image_paths = cast(List[str], _image_paths)
+            if len(layouts) > len(image_paths):
+                raise RuntimeError(
+                    "Some images were not loaded. "
+                    "Check that poppler is installed and in your $PATH.",
                 )
-                pages.append(page)
-        return cls.from_pages(pages)
+
+            pages: List[PageLayout] = []
+            if fixed_layouts is None:
+                fixed_layouts = [None for _ in layouts]
+            for i, (image_path, layout, fixed_layout) in enumerate(
+                zip(image_paths, layouts, fixed_layouts),
+            ):
+                # NOTE(robinson) - In the future, maybe we detect the page number and default
+                # to the index if it is not detected
+                with Image.open(image_path) as image:
+                    page = PageLayout.from_image(
+                        image,
+                        number=i + 1,
+                        document_filename=filename,
+                        detection_model=detection_model,
+                        element_extraction_model=element_extraction_model,
+                        layout=layout,
+                        ocr_strategy=ocr_strategy,
+                        ocr_languages=ocr_languages,
+                        ocr_mode=ocr_mode,
+                        fixed_layout=fixed_layout,
+                        extract_tables=extract_tables,
+                    )
+                    pages.append(page)
+            return cls.from_pages(pages)
 
     @classmethod
     def from_image_file(
@@ -180,7 +180,8 @@ def __init__(
         image: Image.Image,
         layout: Optional[List[TextRegion]],
         image_metadata: Optional[dict] = None,
-        image_path: Optional[Union[str, PurePath]] = None,
+        image_path: Optional[Union[str, PurePath]] = None,  # TODO: Deprecate
+        document_filename: Optional[Union[str, PurePath]] = None,
         detection_model: Optional[UnstructuredObjectDetectionModel] = None,
         element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
         ocr_strategy: str = "auto",
@@ -196,6 +197,7 @@ def __init__(
         self.image_metadata = image_metadata
         self.image_path = image_path
         self.image_array: Union[np.ndarray, None] = None
+        self.document_filename = document_filename
         self.layout = layout
         self.number = number
         self.detection_model = detection_model
@@ -305,7 +307,11 @@ def _get_image_array(self) -> Union[np.ndarray, None]:
                 self.image_array = np.array(image)
         return self.image_array
 
-    def annotate(self, colors: Optional[Union[List[str], str]] = None) -> Image.Image:
+    def annotate(
+        self,
+        colors: Optional[Union[List[str], str]] = None,
+        image_dpi: int = 200,
+    ) -> Image.Image:
         """Annotates the elements on the page image."""
         if colors is None:
             colors = ["red" for _ in self.elements]
@@ -315,18 +321,46 @@ def annotate(self, colors: Optional[Union[List[str], str]] = None) -> Image.Imag
         if len(colors) < len(self.elements):
             n_copies = (len(self.elements) // len(colors)) + 1
             colors = colors * n_copies
-        img = self.image.copy() if self.image else Image.open(self.image_path)
+
+        # Hotload image if it hasn't been loaded yet
+        if self.image:
+            img = self.image.copy()
+        elif self.image_path:
+            img = Image.open(self.image_path)
+        else:
+            img = self._get_image(self.document_filename, self.number, image_dpi)
 
         for el, color in zip(self.elements, colors):
             if isinstance(el, Rectangle):
                 img = draw_bbox(img, el, color=color)
+
         return img
 
+    def _get_image(self, filename, page_number, pdf_image_dpi: int = 200) -> Image.Image:
+        """Hotloads a page image from a pdf file."""
+
+        with tempfile.TemporaryDirectory() as temp_dir:
+            _image_paths = pdf2image.convert_from_path(
+                filename,
+                dpi=pdf_image_dpi,
+                output_folder=temp_dir,
+                paths_only=True,
+            )
+            image_paths = cast(List[str], _image_paths)
+            if page_number > len(image_paths):
+                raise ValueError(
+                    f"Page number {page_number} is greater than the number of pages in the PDF.",
+                )
+
+            with Image.open(image_paths[page_number - 1]) as image:
+                return image.copy()
+
     @classmethod
     def from_image(
         cls,
         image: Image.Image,
-        image_path: Optional[Union[str, PurePath]],
+        image_path: Optional[Union[str, PurePath]] = None,
+        document_filename: Optional[Union[str, PurePath]] = None,
         number: int = 1,
         detection_model: Optional[UnstructuredObjectDetectionModel] = None,
         element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
@@ -363,6 +397,9 @@ def from_image(
             "height": page.image.height if page.image else None,
         }
         page.image_path = os.path.abspath(image_path) if image_path else None
+        page.document_filename = os.path.abspath(document_filename) if document_filename else None
+
+        # Clear the image to save memory
         page.image = None
 
         return page
@@ -480,7 +517,7 @@ def get_element_from_block(
 def load_pdf(
     filename: str,
     dpi: int = 200,
-    output_folder: Union[str, PurePath] = None,  # type: ignore
+    output_folder: Optional[Union[str, PurePath]] = None,
     path_only: bool = False,
 ) -> Tuple[List[List[TextRegion]], Union[List[Image.Image], List[str]]]:
     """Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
@@ -509,30 +546,23 @@ def load_pdf(
     if path_only and not output_folder:
         raise ValueError("output_folder must be specified if path_only is true")
 
-    images = pdf2image.convert_from_path(
-        filename,
-        dpi=dpi,
-        output_folder=output_folder,
-        paths_only=path_only,
-    )
+    if output_folder is not None:
+        images = pdf2image.convert_from_path(
+            filename,
+            dpi=dpi,
+            output_folder=output_folder,
+            paths_only=path_only,
+        )
+    else:
+        images = pdf2image.convert_from_path(
+            filename,
+            dpi=dpi,
+            paths_only=path_only,
+        )
 
     return layouts, images
 
 
-def create_image_output_dir(
-    filename: Union[str, PurePath],
-) -> Union[str, PurePath]:
-    """Creates a directory to store the converted images from the pdf pages and returns the
-    directory path"""
-    parent_dir = os.path.abspath(os.path.dirname(filename))
-    f_name_without_extension = os.path.splitext(os.path.basename(filename))[0]
-
-    # Add a suffix to avoid conflicts in case original file doesn't have an extension
-    output_dir = os.path.join(parent_dir, f"{f_name_without_extension}_images")
-    os.makedirs(output_dir, exist_ok=True)
-    return output_dir
-
-
 def parse_ocr_data(ocr_data: dict) -> List[TextRegion]:
     """
     Parse the OCR result data to extract a list of TextRegion objects.

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1 +1 @@` |
| `1` | | `-__version__ = "0.5.15" # pragma: no cover` |
| | `1` | `+__version__ = "0.5.16" # pragma: no cover` |