Skip to content

Commit d6ccdc1

Browse files
badGarnet and qued authored
add padding before structure detection (#205)
Experiments show that the structure detection model works better when padding is added around the image: https://www.notion.so/Investigate-structure-detection-model-9cf53d2aeb6c4a63b44c5324217f7adf

For example, this sample table image (part of this PR) represents a typical input into the structure detection model: a crop of a table that is relatively tightly bounded around the table content.

![ilpa-example-1](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/7a462e94-b47e-4687-b9fe-a2282c34f444)

Without padding (the current method), the extracted HTML for this image looks like:

![Screenshot 2023-09-08 at 3 20 49 PM](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/17b023c0-23e8-4465-8a3f-5debe8c4b740)

which misses a few rows at the top and bottom of the image. However, once padding is added (this PR, defaulting to 25 pixels on each side), we are able to increase the recall of the structure detection model:

![Screenshot 2023-09-08 at 3 22 10 PM](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/768b4b0e-6315-4505-b96f-65bce25d3b7a)

## testing

Please experiment with changing the `pad_for_structure_detection` value and see different results by running

```python
from unstructured_inference.models.tables import UnstructuredTableTransformerModel
from PIL import Image

model = UnstructuredTableTransformerModel()
model.initialize("microsoft/table-transformer-structure-recognition")
prediction = model.predict(Image.open('table.png'))
```

and view the generated HTML in a browser.

## open questions

1. Do we want to make this parameter tunable with an env variable? Are we overdue for a config file?
2. Do we want to explore automatically choosing a good padding by experimenting with different images?

---------

Co-authored-by: qued <[email protected]>
1 parent 5bc67b5 commit d6ccdc1

File tree

6 files changed

+64
-10
lines changed

6 files changed

+64
-10
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.5.27
2+
3+
* table structure detection now pads the input image by 25 pixels in all 4 directions to improve its recall
4+
15
## 0.5.26
26

37
* support paddle with both cpu and gpu and assumed it is pre-installed

sample-docs/ilpa-example-1.jpg

46.9 KB
Loading

test_unstructured_inference/test_utils.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import tempfile
33
from unittest.mock import patch
44

5+
import numpy as np
56
import pytest
67
from PIL import Image
78

@@ -12,6 +13,7 @@
1213
LazyDict,
1314
LazyEvaluateInfo,
1415
annotate_layout_elements,
16+
pad_image_with_background_color,
1517
write_image,
1618
)
1719

@@ -128,3 +130,20 @@ def test_annotate_layout_elements_with_plot_result():
128130
)
129131

130132
mock_show_plot.assert_called_with("mock_image", desired_width=14)
133+
134+
135+
def test_pad_image_with_background_color(mock_pil_image):
    """Padding grows the image by the requested amount and leaves the content untouched.

    A total pad of ``2 * pad`` is requested, so ``pad`` pixels of background are
    expected on every side of the original content.
    """
    pad = 10
    # PIL reports size as (width, height); unpacking in that order keeps the crop
    # box below correct for non-square images as well.
    width, height = mock_pil_image.size
    padded = pad_image_with_background_color(mock_pil_image, pad * 2, "black")
    assert padded.size == (width + 2 * pad, height + 2 * pad)
    # crop box is (left, upper, right, lower): the region where the original was pasted
    np.testing.assert_array_almost_equal(
        np.array(padded.crop((pad, pad, width + pad, height + pad))),
        np.array(mock_pil_image),
    )
    # a pixel inside the border must carry the background color
    # NOTE(review): the tuple comparison assumes the fixture is a 3-channel image - confirm
    assert padded.getpixel((1, 1)) == (0, 0, 0)
145+
146+
147+
def test_pad_image_with_invalid_input(mock_pil_image):
    """A negative pad value must be rejected with a ValueError."""
    expected_message = "Can not pad an image with negative space!"
    with pytest.raises(ValueError, match=expected_message):
        pad_image_with_background_color(mock_pil_image, -1)
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.26" # pragma: no cover
1+
__version__ = "0.5.27" # pragma: no cover

unstructured_inference/models/tables.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from unstructured_inference.logger import logger
1818
from unstructured_inference.models.table_postprocess import Rect
1919
from unstructured_inference.models.unstructuredmodel import UnstructuredModel
20+
from unstructured_inference.utils import pad_image_with_background_color
2021

2122
from . import table_postprocess as postprocess
2223

@@ -99,11 +100,16 @@ def get_tokens(self, x: Image):
99100
)
100101
return tokens
101102

102-
def run_prediction(self, x: Image):
103+
def run_prediction(self, x: Image, pad_for_structure_detection: int = 50):
103104
"""Predict table structure"""
104105
with torch.no_grad():
105-
encoding = self.feature_extractor(x, return_tensors="pt").to(self.device)
106+
logger.info(f"padding image by {pad_for_structure_detection} for structure detection")
107+
encoding = self.feature_extractor(
108+
pad_image_with_background_color(x, pad_for_structure_detection),
109+
return_tensors="pt",
110+
).to(self.device)
106111
outputs_structure = self.model(**encoding)
112+
outputs_structure["pad_for_structure_detection"] = pad_for_structure_detection
107113

108114
tokens = self.get_tokens(x=x)
109115

@@ -195,7 +201,13 @@ def outputs_to_objects(outputs, img_size, class_idx2name):
195201
pred_labels = list(m.indices.detach().cpu().numpy())[0]
196202
pred_scores = list(m.values.detach().cpu().numpy())[0]
197203
pred_bboxes = outputs["pred_boxes"].detach().cpu()[0]
198-
pred_bboxes = [elem.tolist() for elem in rescale_bboxes(pred_bboxes, img_size)]
204+
205+
pad = outputs.get("pad_for_structure_detection", 0)
206+
scale_size = (img_size[0] + pad, img_size[1] + pad)
207+
pred_bboxes = [elem.tolist() for elem in rescale_bboxes(pred_bboxes, scale_size)]
208+
# unshift the padding; padding effectively shifted the bounding boxes of structures in the
209+
# original image with half of the total pad
210+
shift_size = pad / 2
199211

200212
objects = []
201213
for label, score, bbox in zip(pred_labels, pred_scores, pred_bboxes):
@@ -205,7 +217,7 @@ def outputs_to_objects(outputs, img_size, class_idx2name):
205217
{
206218
"label": class_label,
207219
"score": float(score),
208-
"bbox": [float(elem) for elem in bbox],
220+
"bbox": [float(elem) - shift_size for elem in bbox],
209221
},
210222
)
211223

unstructured_inference/utils.py

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import cv2
66
import numpy as np
7-
from PIL.Image import Image
7+
from PIL import Image
88

99
from unstructured_inference.constants import AnnotationResult
1010
from unstructured_inference.visualize import show_plot
@@ -52,13 +52,13 @@ def __len__(self) -> int:
5252
return len(self._raw_dict)
5353

5454

55-
def write_image(image: Union[Image, np.ndarray], output_image_path: str):
55+
def write_image(image: Union[Image.Image, np.ndarray], output_image_path: str):
5656
"""
5757
Write an image to a specified file path, supporting both PIL Image and numpy ndarray formats.
5858
5959
Parameters:
60-
- image (Union[Image, np.ndarray]): The image to be written, which can be in PIL Image format
61-
or a numpy ndarray format.
60+
- image (Union[Image.Image, np.ndarray]): The image to be written, which can be in PIL Image
61+
format or a numpy ndarray format.
6262
- output_image_path (str): The path to which the image will be written.
6363
6464
Raises:
@@ -68,7 +68,7 @@ def write_image(image: Union[Image, np.ndarray], output_image_path: str):
6868
- None: The function writes the image to the specified path but does not return any value.
6969
"""
7070

71-
if isinstance(image, Image):
71+
if isinstance(image, Image.Image):
7272
image.save(output_image_path)
7373
elif isinstance(image, np.ndarray):
7474
cv2.imwrite(output_image_path, image)
@@ -123,3 +123,22 @@ def annotate_layout_elements(
123123
print(f"wrote {output_f_path}")
124124
elif result == AnnotationResult.PLOT:
125125
show_plot(img, desired_width=plot_desired_width)
126+
127+
128+
def pad_image_with_background_color(
    image: Image.Image,
    pad: int = 10,
    background_color: str = "white",
) -> Image.Image:
    """Return a copy of ``image`` surrounded by a border of ``background_color``.

    ``pad`` is the TOTAL number of pixels added to each dimension: the result is
    ``(width + pad, height + pad)``. The original content is pasted at offset
    ``pad // 2`` from the left and top edges, so for an even ``pad`` every side
    receives ``pad // 2`` pixels; for an odd ``pad`` the right and bottom sides
    receive the one extra pixel.

    The original image is kept intact and a new image is returned.

    Parameters:
    - image: the image to pad.
    - pad: total padding, in pixels, added to both width and height. Must be >= 0;
      a value of 0 yields a same-sized copy.
    - background_color: color used to fill the new image before the original is
      pasted onto it.

    Raises:
    - ValueError: if ``pad`` is negative.
    """
    # validate first so bad input fails before the image is touched
    if pad < 0:
        raise ValueError(
            "Can not pad an image with negative space! "
            "Please use a non-negative value for `pad`.",
        )
    width, height = image.size
    offset = pad // 2
    padded = Image.new(image.mode, (width + pad, height + pad), background_color)
    padded.paste(image, (offset, offset))
    return padded

0 commit comments

Comments
 (0)