add padding before structure detection (#205)

badGarnet · qued · web-flow · commit d6ccdc17b5be · 2023-09-12T21:34:46.000Z
Experiments show that the structure detection model works better when padding is added around the image: https://www.notion.so/Investigate-structure-detection-model-9cf53d2aeb6c4a63b44c5324217f7adf

For example, this sample table image (part of this PR) represents a typical input into the structure detection model: a crop of a table that is relatively tightly bounded around the table content.

![ilpa-example-1](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/7a462e94-b47e-4687-b9fe-a2282c34f444)

Without padding (the current method), the extracted HTML for this image looks like:

![Screenshot 2023-09-08 at 3 20 49 PM](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/17b023c0-23e8-4465-8a3f-5debe8c4b740)

which misses a few rows at the top and bottom of the image. However, once padding is added (this PR; defaults to 25 pixels on each side), we are able to increase the recall of the structure detection model:

![Screenshot 2023-09-08 at 3 22 10 PM](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/768b4b0e-6315-4505-b96f-65bce25d3b7a)

## testing

Please experiment with changing the `pad_for_structure_detection` value and see the different results by running

```python
from unstructured_inference.models.tables import UnstructuredTableTransformerModel
from PIL import Image

model = UnstructuredTableTransformerModel()
model.initialize("microsoft/table-transformer-structure-recognition")
prediction = model.predict(Image.open('table.png'))
```

and viewing the generated HTML in a browser.

## open questions

1. do we want to make this parameter tunable with an env variable? are we overdue for a config file?
2. do we want to explore automatically choosing a good padding by experimenting with different images?

---------

Co-authored-by: qued <64741807+qued@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.5.27
+
+* table structure detection now pads the input image by 25 pixels in all 4 directions to improve its recall
+
 ## 0.5.26
 
 * support paddle with both cpu and gpu and assumed it is pre-installed
diff --git a/sample-docs/ilpa-example-1.jpg b/sample-docs/ilpa-example-1.jpg
diff --git a/test_unstructured_inference/test_utils.py b/test_unstructured_inference/test_utils.py
@@ -2,6 +2,7 @@
 import tempfile
 from unittest.mock import patch
 
+import numpy as np
 import pytest
 from PIL import Image
 
@@ -12,6 +13,7 @@
     LazyDict,
     LazyEvaluateInfo,
     annotate_layout_elements,
+    pad_image_with_background_color,
     write_image,
 )
 
@@ -128,3 +130,20 @@ def test_annotate_layout_elements_with_plot_result():
         )
 
     mock_show_plot.assert_called_with("mock_image", desired_width=14)
+
+
+def test_pad_image_with_background_color(mock_pil_image):
+    """Padding grows each dimension by the total pad and keeps the original pixels intact."""
+    pad = 10
+    # PIL reports `size` as (width, height); unpack in that order so the crop box below is
+    # also correct for non-square images.
+    width, height = mock_pil_image.size
+    # the helper takes the total padding per dimension, so `pad * 2` gives `pad` on each side
+    padded = pad_image_with_background_color(mock_pil_image, pad * 2, "black")
+    assert padded.size == (width + 2 * pad, height + 2 * pad)
+    # the region inside the border must be the untouched original image
+    np.testing.assert_array_almost_equal(
+        np.array(padded.crop((pad, pad, width + pad, height + pad))),
+        np.array(mock_pil_image),
+    )
+    # a pixel inside the border carries the background color
+    # (assumes the fixture is an RGB image, so "black" reads back as a 3-tuple -- TODO confirm)
+    assert padded.getpixel((1, 1)) == (0, 0, 0)
+
+
+def test_pad_image_with_invalid_input(mock_pil_image):
+    """A negative `pad` must be rejected with a ValueError."""
+    expected_message = "Can not pad an image with negative space!"
+    with pytest.raises(ValueError, match=expected_message):
+        pad_image_with_background_color(mock_pil_image, pad=-1)
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.26"  # pragma: no cover
+__version__ = "0.5.27"  # pragma: no cover
diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py
@@ -17,6 +17,7 @@
 from unstructured_inference.logger import logger
 from unstructured_inference.models.table_postprocess import Rect
 from unstructured_inference.models.unstructuredmodel import UnstructuredModel
+from unstructured_inference.utils import pad_image_with_background_color
 
 from . import table_postprocess as postprocess
 
@@ -99,11 +100,16 @@ def get_tokens(self, x: Image):
                 )
             return tokens
 
-    def run_prediction(self, x: Image):
+    def run_prediction(self, x: Image, pad_for_structure_detection: int = 50):
         """Predict table structure"""
         with torch.no_grad():
-            encoding = self.feature_extractor(x, return_tensors="pt").to(self.device)
+            logger.info(f"padding image by {pad_for_structure_detection} for structure detection")
+            encoding = self.feature_extractor(
+                pad_image_with_background_color(x, pad_for_structure_detection),
+                return_tensors="pt",
+            ).to(self.device)
             outputs_structure = self.model(**encoding)
+            outputs_structure["pad_for_structure_detection"] = pad_for_structure_detection
 
         tokens = self.get_tokens(x=x)
 
@@ -195,7 +201,13 @@ def outputs_to_objects(outputs, img_size, class_idx2name):
     pred_labels = list(m.indices.detach().cpu().numpy())[0]
     pred_scores = list(m.values.detach().cpu().numpy())[0]
     pred_bboxes = outputs["pred_boxes"].detach().cpu()[0]
-    pred_bboxes = [elem.tolist() for elem in rescale_bboxes(pred_bboxes, img_size)]
+
+    pad = outputs.get("pad_for_structure_detection", 0)
+    scale_size = (img_size[0] + pad, img_size[1] + pad)
+    pred_bboxes = [elem.tolist() for elem in rescale_bboxes(pred_bboxes, scale_size)]
+    # unshift the padding; padding effectively shifted the bounding boxes of structures in the
+    # original image with half of the total pad
+    shift_size = pad / 2
 
     objects = []
     for label, score, bbox in zip(pred_labels, pred_scores, pred_bboxes):
@@ -205,7 +217,7 @@ def outputs_to_objects(outputs, img_size, class_idx2name):
                 {
                     "label": class_label,
                     "score": float(score),
-                    "bbox": [float(elem) for elem in bbox],
+                    "bbox": [float(elem) - shift_size for elem in bbox],
                 },
             )
 
diff --git a/unstructured_inference/utils.py b/unstructured_inference/utils.py
@@ -4,7 +4,7 @@
 
 import cv2
 import numpy as np
-from PIL.Image import Image
+from PIL import Image
 
 from unstructured_inference.constants import AnnotationResult
 from unstructured_inference.visualize import show_plot
@@ -52,13 +52,13 @@ def __len__(self) -> int:
         return len(self._raw_dict)
 
 
-def write_image(image: Union[Image, np.ndarray], output_image_path: str):
+def write_image(image: Union[Image.Image, np.ndarray], output_image_path: str):
     """
     Write an image to a specified file path, supporting both PIL Image and numpy ndarray formats.
 
     Parameters:
-    - image (Union[Image, np.ndarray]): The image to be written, which can be in PIL Image format
-     or a numpy ndarray format.
+    - image (Union[Image.Image, np.ndarray]): The image to be written, which can be in PIL Image
+      format or a numpy ndarray format.
     - output_image_path (str): The path to which the image will be written.
 
     Raises:
@@ -68,7 +68,7 @@ def write_image(image: Union[Image, np.ndarray], output_image_path: str):
     - None: The function writes the image to the specified path but does not return any value.
     """
 
-    if isinstance(image, Image):
+    if isinstance(image, Image.Image):
         image.save(output_image_path)
     elif isinstance(image, np.ndarray):
         cv2.imwrite(output_image_path, image)
@@ -123,3 +123,22 @@ def annotate_layout_elements(
                 print(f"wrote {output_f_path}")
             elif result == AnnotationResult.PLOT:
                 show_plot(img, desired_width=plot_desired_width)
+
+
+def pad_image_with_background_color(
+    image: Image.Image,
+    pad: int = 10,
+    background_color: str = "white",
+) -> Image.Image:
+    """Return a new image that is `pad` pixels wider and taller than `image`.
+
+    A canvas in the same mode as `image` is filled with `background_color` and the input is
+    pasted onto it at offset (pad // 2, pad // 2). `pad` is therefore the total padding per
+    dimension: the left/top border is pad // 2 pixels and the right/bottom border gets the
+    remainder (one pixel more when `pad` is odd).
+
+    The input image is not modified; the padded copy is returned.
+
+    Raises:
+    - ValueError: if `pad` is negative.
+    """
+    original_width, original_height = image.size
+    if pad < 0:
+        raise ValueError(
+            "Can not pad an image with negative space! Please use a positive value for `pad`.",
+        )
+    offset = pad // 2
+    canvas_size = (original_width + pad, original_height + pad)
+    canvas = Image.new(image.mode, canvas_size, background_color)
+    canvas.paste(image, (offset, offset))
+    return canvas

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.5.26" # pragma: no cover`
	`1`	`+__version__ = "0.5.27" # pragma: no cover`