Skip to content

Commit bdee102

Browse files
fix: bring back embedded images in pdf (#198)
### Summary

- Fix inferred layout visualization
- Add functionality to check if the extracted image is a full-page image

### Testing

```python
from unstructured_inference.inference.layout import DocumentLayout

doc = DocumentLayout.from_file("sample-docs/embedded-images.pdf")
```

### Evaluation

The Python script (or Jupyter Notebook) for "layout analysis" can be used to evaluate the feature implemented in this branch.

```
PYTHONPATH=. python examples/layout_analysis/visualization.py sample-docs/embedded-images.pdf
```
1 parent 5c295c5 commit bdee102

File tree

9 files changed

+201
-15
lines changed

9 files changed

+201
-15
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.5.23
2+
3+
* Add functionality to bring back embedded images in PDF
4+
15
## 0.5.22
26

37
* Add object-detection classification probabilities to LayoutElement for all currently implemented object detection models

sample-docs/embedded-images.pdf

163 KB
Binary file not shown.

test_unstructured_inference/conftest.py

Lines changed: 58 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
import pytest
33
from PIL import Image
44

5-
from unstructured_inference.inference.elements import EmbeddedTextRegion
5+
from unstructured_inference.inference.elements import EmbeddedTextRegion, Rectangle, TextRegion
6+
from unstructured_inference.inference.layoutelement import LayoutElement
67

78

89
@pytest.fixture()
@@ -15,9 +16,23 @@ def mock_numpy_image():
1516
return np.zeros((50, 50, 3), np.uint8)
1617

1718

18-
# TODO(alan): Make a better test layout
1919
@pytest.fixture()
20-
def sample_layout():
20+
def mock_rectangle():
21+
return Rectangle(100, 100, 300, 300)
22+
23+
24+
@pytest.fixture()
25+
def mock_text_region():
26+
return TextRegion(100, 100, 300, 300, text="Sample text")
27+
28+
29+
@pytest.fixture()
30+
def mock_layout_element():
31+
return LayoutElement(100, 100, 300, 300, text="Sample text", type="Text")
32+
33+
34+
@pytest.fixture()
35+
def mock_embedded_text_regions():
2136
return [
2237
EmbeddedTextRegion(
2338
x1=453.00277777777774,
@@ -90,3 +105,43 @@ def sample_layout():
90105
text="Image",
91106
),
92107
]
108+
109+
110+
@pytest.fixture()
111+
def mock_ocr_regions():
112+
return [
113+
EmbeddedTextRegion(10, 10, 90, 90, "0"),
114+
EmbeddedTextRegion(200, 200, 300, 300, "1"),
115+
EmbeddedTextRegion(500, 320, 600, 350, "3"),
116+
]
117+
118+
119+
# TODO(alan): Make a better test layout
120+
@pytest.fixture()
121+
def mock_layout(mock_embedded_text_regions):
122+
return [
123+
LayoutElement(
124+
r.x1,
125+
r.y1,
126+
r.x2,
127+
r.y2,
128+
text=r.text,
129+
type="UncategorizedText",
130+
)
131+
for r in mock_embedded_text_regions
132+
]
133+
134+
135+
@pytest.fixture()
136+
def mock_inferred_layout(mock_embedded_text_regions):
137+
return [
138+
LayoutElement(
139+
r.x1,
140+
r.y1,
141+
r.x2,
142+
r.y2,
143+
text=None,
144+
type="Text",
145+
)
146+
for r in mock_embedded_text_regions
147+
]

test_unstructured_inference/inference/test_layout_element.py

Lines changed: 116 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,16 @@
1+
import pytest
2+
from layoutparser.elements import TextBlock
3+
from layoutparser.elements.layout_elements import Rectangle as LPRectangle
4+
5+
from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR
16
from unstructured_inference.inference.elements import TextRegion
27
from unstructured_inference.inference.layoutelement import (
38
LayoutElement,
49
aggregate_ocr_text_by_block,
510
get_elements_from_ocr_regions,
11+
merge_inferred_layout_with_ocr_layout,
612
merge_text_regions,
13+
supplement_layout_with_ocr_elements,
714
)
815

916

@@ -21,7 +28,7 @@ def test_aggregate_ocr_text_by_block():
2128
assert text == expected
2229

2330

24-
def test_merge_text_regions(sample_layout):
31+
def test_merge_text_regions(mock_embedded_text_regions):
2532
expected = TextRegion(
2633
x1=437.83888888888885,
2734
y1=317.319341111111,
@@ -30,11 +37,11 @@ def test_merge_text_regions(sample_layout):
3037
text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
3138
)
3239

33-
merged_text_region = merge_text_regions(sample_layout)
40+
merged_text_region = merge_text_regions(mock_embedded_text_regions)
3441
assert merged_text_region == expected
3542

3643

37-
def test_get_elements_from_ocr_regions(sample_layout):
44+
def test_get_elements_from_ocr_regions(mock_embedded_text_regions):
3845
expected = [
3946
LayoutElement(
4047
x1=437.83888888888885,
@@ -46,5 +53,110 @@ def test_get_elements_from_ocr_regions(sample_layout):
4653
),
4754
]
4855

49-
elements = get_elements_from_ocr_regions(sample_layout)
56+
elements = get_elements_from_ocr_regions(mock_embedded_text_regions)
5057
assert elements == expected
58+
59+
60+
def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions):
61+
ocr_elements = [
62+
LayoutElement(
63+
r.x1,
64+
r.y1,
65+
r.x2,
66+
r.y2,
67+
text=r.text,
68+
type="UncategorizedText",
69+
)
70+
for r in mock_ocr_regions
71+
]
72+
73+
final_layout = supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions)
74+
75+
# Check if the final layout contains the original layout elements
76+
for element in mock_layout:
77+
assert element in final_layout
78+
79+
# Check if the final layout contains the OCR-derived elements
80+
assert any(ocr_element in final_layout for ocr_element in ocr_elements)
81+
82+
# Check if the OCR-derived elements that are subregions of layout elements are removed
83+
for element in mock_layout:
84+
for ocr_element in ocr_elements:
85+
if ocr_element.is_almost_subregion_of(element, SUBREGION_THRESHOLD_FOR_OCR):
86+
assert ocr_element not in final_layout
87+
88+
89+
def test_merge_inferred_layout_with_ocr_layout(mock_inferred_layout, mock_ocr_regions):
90+
ocr_elements = [
91+
LayoutElement(
92+
r.x1,
93+
r.y1,
94+
r.x2,
95+
r.y2,
96+
text=r.text,
97+
type="UncategorizedText",
98+
)
99+
for r in mock_ocr_regions
100+
]
101+
102+
final_layout = merge_inferred_layout_with_ocr_layout(mock_inferred_layout, mock_ocr_regions)
103+
104+
# Check if the inferred layout's text attribute is updated with aggregated OCR text
105+
assert final_layout[0].text == mock_ocr_regions[2].text
106+
107+
# Check if the final layout contains both original elements and OCR-derived elements
108+
assert all(element in final_layout for element in mock_inferred_layout)
109+
assert any(element in final_layout for element in ocr_elements)
110+
111+
112+
@pytest.mark.parametrize("is_table", [False, True])
113+
def test_layout_element_extract_text(
114+
mock_layout_element,
115+
mock_text_region,
116+
mock_pil_image,
117+
is_table,
118+
):
119+
if is_table:
120+
mock_layout_element.type = "Table"
121+
122+
extracted_text = mock_layout_element.extract_text(
123+
objects=[mock_text_region],
124+
image=mock_pil_image,
125+
extract_tables=True,
126+
)
127+
128+
assert isinstance(extracted_text, str)
129+
assert "Sample text" in extracted_text
130+
131+
if mock_layout_element.type == "Table":
132+
assert hasattr(mock_layout_element, "text_as_html")
133+
134+
135+
def test_layout_element_do_dict(mock_layout_element):
136+
expected = {
137+
"coordinates": ((100, 100), (100, 300), (300, 300), (300, 100)),
138+
"text": "Sample text",
139+
"type": "Text",
140+
"prob": None,
141+
}
142+
143+
assert mock_layout_element.to_dict() == expected
144+
145+
146+
def test_layout_element_from_region(mock_rectangle):
147+
expected = LayoutElement(100, 100, 300, 300, None, None)
148+
149+
assert LayoutElement.from_region(mock_rectangle) == expected
150+
151+
152+
def test_layout_element_from_lp_textblock():
153+
mock_text_block = TextBlock(
154+
block=LPRectangle(100, 100, 300, 300),
155+
text="Sample Text",
156+
type="Text",
157+
score=0.99,
158+
)
159+
160+
expected = LayoutElement(100, 100, 300, 300, "Sample Text", "Text", 0.99)
161+
162+
assert LayoutElement.from_lp_textblock(mock_text_block) == expected

test_unstructured_inference/test_elements.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,8 @@ def test_minimal_containing_rect():
8787
assert rect2.is_in(big_rect)
8888

8989

90-
def test_partition_groups_from_regions(sample_layout):
91-
words = sample_layout
90+
def test_partition_groups_from_regions(mock_embedded_text_regions):
91+
words = mock_embedded_text_regions
9292
groups = elements.partition_groups_from_regions(words)
9393
assert len(groups) == 1
9494
sorted_groups = sorted(groups, key=lambda group: group[0].y1)

unstructured_inference/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.22" # pragma: no cover
1+
__version__ = "0.5.23" # pragma: no cover

unstructured_inference/constants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,4 @@ class AnnotationResult(Enum):
1212

1313

1414
SUBREGION_THRESHOLD_FOR_OCR = 0.5
15+
FULL_PAGE_REGION_THRESHOLD = 0.99

unstructured_inference/inference/layout.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -285,9 +285,10 @@ def get_elements_with_detection_model(
285285
and "R_50" not in self.detection_model.model_path
286286
):
287287
threshold_kwargs = {"same_region_threshold": 0.5, "subregion_threshold": 0.5}
288-
inferred_layout = merge_inferred_layout_with_extracted_layout(
288+
merged_layout = merge_inferred_layout_with_extracted_layout(
289289
inferred_layout=inferred_layout,
290290
extracted_layout=self.layout,
291+
page_image_size=self.image.size,
291292
ocr_layout=ocr_layout,
292293
supplement_with_ocr_elements=self.supplement_with_ocr_elements,
293294
**threshold_kwargs,
@@ -301,14 +302,16 @@ def get_elements_with_detection_model(
301302
and "R_50" not in self.detection_model.model_path
302303
):
303304
threshold_kwargs = {"subregion_threshold": 0.3}
304-
inferred_layout = merge_inferred_layout_with_ocr_layout(
305+
merged_layout = merge_inferred_layout_with_ocr_layout(
305306
inferred_layout=inferred_layout,
306307
ocr_layout=ocr_layout,
307308
supplement_with_ocr_elements=self.supplement_with_ocr_elements,
308309
**threshold_kwargs,
309310
)
311+
else:
312+
merged_layout = inferred_layout
310313

311-
elements = self.get_elements_from_layout(cast(List[TextRegion], inferred_layout))
314+
elements = self.get_elements_from_layout(cast(List[TextRegion], merged_layout))
312315

313316
if self.analysis:
314317
self.inferred_layout = inferred_layout

unstructured_inference/inference/layoutelement.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from layoutparser.elements.layout import TextBlock
77
from PIL import Image
88

9-
from unstructured_inference.constants import SUBREGION_THRESHOLD_FOR_OCR
9+
from unstructured_inference.constants import FULL_PAGE_REGION_THRESHOLD, SUBREGION_THRESHOLD_FOR_OCR
1010
from unstructured_inference.inference.elements import (
1111
ImageTextRegion,
1212
Rectangle,
@@ -84,6 +84,7 @@ def interpret_table_block(text_block: TextRegion, image: Image.Image) -> str:
8484
def merge_inferred_layout_with_extracted_layout(
8585
inferred_layout: Collection[LayoutElement],
8686
extracted_layout: Collection[TextRegion],
87+
page_image_size: tuple,
8788
ocr_layout: Optional[List[TextRegion]] = None,
8889
supplement_with_ocr_elements: bool = True,
8990
same_region_threshold: float = 0.75,
@@ -92,11 +93,21 @@ def merge_inferred_layout_with_extracted_layout(
9293
"""Merge two layouts to produce a single layout."""
9394
extracted_elements_to_add: List[TextRegion] = []
9495
inferred_regions_to_remove = []
96+
w, h = page_image_size
97+
full_page_region = Rectangle(0, 0, w, h)
9598
for extracted_region in extracted_layout:
9699
if isinstance(extracted_region, ImageTextRegion):
97100
# Skip extracted images for this purpose, we don't have the text from them and they
98101
# don't provide good text bounding boxes.
99-
continue
102+
103+
is_full_page_image = region_bounding_boxes_are_almost_the_same(
104+
extracted_region,
105+
full_page_region,
106+
FULL_PAGE_REGION_THRESHOLD,
107+
)
108+
109+
if is_full_page_image:
110+
continue
100111
region_matched = False
101112
for inferred_region in inferred_layout:
102113
if inferred_region.intersects(extracted_region):

0 commit comments

Comments
 (0)