Klaijan/add tiff image file support case in from_image_file function (#173)

Klaijan · qued · web-flow · commit d162c56bfc42 · 2023-08-18T13:02:55.000-04:00
feat: supports multipage image (TIFF) in DocumentLayout

style: change viz to f-string formatting

style: change type comparison from == to is

test: add test tiff file and tiff test in test_layout
---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,9 +1,14 @@
+## 0.5.14
+
+* Add TIFF test file and TIFF filetype to `test_from_image_file` in `test_layout`
+
 ## 0.5.13
 
 * Fix extracted image elements being included in layout merge
 
 ## 0.5.12
 
+* Add multipage TIFF extraction support
 * Fix a pdfminer error when using `process_data_with_model`
 
 ## 0.5.11
diff --git a/sample-docs/loremipsum.tiff b/sample-docs/loremipsum.tiff
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -209,7 +209,7 @@ def test_process_data_with_model(monkeypatch, mock_final_layout, model_name):
     )
 
     def new_isinstance(obj, cls):
-        if type(obj) == MockLayoutModel:
+        if type(obj) is MockLayoutModel:
             return True
         else:
             return isinstance(obj, cls)
@@ -345,7 +345,7 @@ def test_get_elements_from_block_raises():
         layout.get_element_from_block(block, None, None)
 
 
-@pytest.mark.parametrize("filetype", ["png", "jpg"])
+@pytest.mark.parametrize("filetype", ["png", "jpg", "tiff"])
 def test_from_image_file(monkeypatch, mock_final_layout, filetype):
     def mock_get_elements(self, *args, **kwargs):
         self.elements = [mock_final_layout]
diff --git a/test_unstructured_inference/models/test_donut.py b/test_unstructured_inference/models/test_donut.py
@@ -41,7 +41,7 @@ def test_load_donut_model(model_path, processor_path, config_path):
         config=config_path,
         task_prompt="<s>",
     )
-    assert type(donut_model.model.encoder) == DonutSwinModel
+    assert type(donut_model.model.encoder) is DonutSwinModel
 
 
 @pytest.fixture()
diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py
@@ -31,7 +31,7 @@ def test_load_table_model_raises_when_not_available(model_path):
 def test_load_donut_model(model_path):
     table_model = tables.UnstructuredTableTransformerModel()
     table_model.initialize(model=model_path)
-    assert type(table_model.model.model.decoder) == TableTransformerDecoder
+    assert type(table_model.model.model.decoder) is TableTransformerDecoder
 
 
 @pytest.fixture()
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.13"  # pragma: no cover
+__version__ = "0.5.14"  # pragma: no cover
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -10,7 +10,7 @@
 import pytesseract
 from pdfminer import psparser
 from pdfminer.high_level import extract_pages
-from PIL import Image
+from PIL import Image, ImageSequence
 from pytesseract import Output
 
 from unstructured_inference.inference.elements import (
@@ -143,26 +143,32 @@ def from_image_file(
         try:
             image = Image.open(filename)
             format = image.format
-            image = image.convert("RGB")
-            image.format = format
+            images = []
+            for i, im in enumerate(ImageSequence.Iterator(image)):
+                im = im.convert("RGB")
+                im.format = format
+                images.append(im)
         except Exception as e:
             if os.path.isdir(filename) or os.path.isfile(filename):
                 raise e
             else:
                 raise FileNotFoundError(f'File "{filename}" not found!') from e
-        page = PageLayout.from_image(
-            image,
-            image_path=filename,
-            detection_model=detection_model,
-            element_extraction_model=element_extraction_model,
-            layout=None,
-            ocr_strategy=ocr_strategy,
-            ocr_languages=ocr_languages,
-            ocr_mode=ocr_mode,
-            fixed_layout=fixed_layout,
-            extract_tables=extract_tables,
-        )
-        return cls.from_pages([page])
+        pages = []
+        for i, image in enumerate(images):
+            page = PageLayout.from_image(
+                image,
+                image_path=filename,
+                number=i,
+                detection_model=detection_model,
+                element_extraction_model=element_extraction_model,
+                layout=None,
+                ocr_strategy=ocr_strategy,
+                ocr_languages=ocr_languages,
+                fixed_layout=fixed_layout,
+                extract_tables=extract_tables,
+            )
+            pages.append(page)
+        return cls.from_pages(pages)
 
 
 class PageLayout:
diff --git a/unstructured_inference/visualize.py b/unstructured_inference/visualize.py
@@ -44,7 +44,7 @@ def draw_yolox_bounding_boxes(img, boxes, scores, cls_ids, conf=0.5, class_names
         y1 = int(box[3])
 
         color = (_COLORS[cls_id] * 255).astype(np.uint8).tolist()
-        text = "{}:{:.1f}%".format(class_names[cls_id], score * 100)
+        text = f"{class_names[cls_id]}:{score * 100:.1f}%"
         txt_color = (0, 0, 0) if np.mean(_COLORS[cls_id]) > 0.5 else (255, 255, 255)
         font = cv2.FONT_HERSHEY_SIMPLEX
 

Original file line number	Diff line number	Diff line change
`@@ -41,7 +41,7 @@ def test_load_donut_model(model_path, processor_path, config_path):`
`41`	`41`	`config=config_path,`
`42`	`42`	`task_prompt="<s>",`
`43`	`43`	`)`
`44`		`- assert type(donut_model.model.encoder) == DonutSwinModel`
	`44`	`+ assert type(donut_model.model.encoder) is DonutSwinModel`
`45`	`45`
`46`	`46`
`47`	`47`	`@pytest.fixture()`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.5.13" # pragma: no cover`
	`1`	`+__version__ = "0.5.14" # pragma: no cover`