Skip to content

Commit 4e1ae5c

Browse files
authored
enhancement: combine layouts (#123)
Add logic to combine inferred elements with extracted elements.
1 parent dd08038 commit 4e1ae5c

File tree

9 files changed (+463 / -104 lines changed)

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
1-
## 0.5.2-dev0
1+
## 0.5.2-dev1
22

3+
* Combine inferred elements with extracted elements
34
* Add ruff to keep code consistent with unstructured
45

56
## 0.5.1

setup.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -17,14 +17,15 @@
1717
See the License for the specific language governing permissions and
1818
limitations under the License.
1919
"""
20-
from typing import List
20+
from typing import List, Optional, Union
2121

2222
from setuptools import find_packages, setup
2323

2424
from unstructured_inference.__version__ import __version__
2525

2626

27-
def load_requirements(file_list=None):
27+
def load_requirements(file_list: Optional[Union[str, List[str]]] = None):
28+
"""Loads the requirements from a .in file or list of .in files."""
2829
if file_list is None:
2930
file_list = ["requirements/base.in"]
3031
if isinstance(file_list, str):
@@ -33,7 +34,9 @@ def load_requirements(file_list=None):
3334
for file in file_list:
3435
with open(file, encoding="utf-8") as f:
3536
requirements.extend(f.readlines())
36-
requirements = [req for req in requirements if not req.startswith("#")]
37+
requirements = [
38+
req for req in requirements if not req.startswith("#") and not req.startswith("-")
39+
]
3740
return requirements
3841

3942

test_unstructured_inference/inference/test_layout.py

Lines changed: 81 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
from PIL import Image
99

1010
import unstructured_inference.models.base as models
11-
from unstructured_inference.inference import elements, layout
11+
from unstructured_inference.inference import elements, layout, layoutelement
1212
from unstructured_inference.models import detectron2, tesseract
1313

1414

@@ -18,14 +18,30 @@ def mock_image():
1818

1919

2020
@pytest.fixture()
21-
def mock_page_layout():
21+
def mock_initial_layout():
2222
text_block = layout.EmbeddedTextRegion(2, 4, 6, 8, text="A very repetitive narrative. " * 10)
2323

2424
title_block = layout.EmbeddedTextRegion(1, 2, 3, 4, text="A Catchy Title")
2525

2626
return [text_block, title_block]
2727

2828

29+
@pytest.fixture()
30+
def mock_final_layout():
31+
text_block = layoutelement.LayoutElement(
32+
2,
33+
4,
34+
6,
35+
8,
36+
text="A very repetitive narrative. " * 10,
37+
type="NarrativeText",
38+
)
39+
40+
title_block = layoutelement.LayoutElement(1, 2, 3, 4, text="A Catchy Title", type="Title")
41+
42+
return [text_block, title_block]
43+
44+
2945
def test_pdf_page_converts_images_to_array(mock_image):
3046
page = layout.PageLayout(number=0, image=mock_image, layout=[])
3147
assert page.image_array is None
@@ -62,13 +78,13 @@ def initialize(self, *args, **kwargs):
6278
pass
6379

6480

65-
def test_get_page_elements(monkeypatch, mock_page_layout):
81+
def test_get_page_elements(monkeypatch, mock_final_layout):
6682
image = np.random.randint(12, 24, (40, 40))
6783
page = layout.PageLayout(
6884
number=0,
6985
image=image,
70-
layout=mock_page_layout,
71-
model=MockLayoutModel(mock_page_layout),
86+
layout=mock_final_layout,
87+
model=MockLayoutModel(mock_final_layout),
7288
)
7389

7490
elements = page.get_elements_with_model(inplace=False)
@@ -94,7 +110,17 @@ def join(self):
94110
def test_get_page_elements_with_ocr(monkeypatch):
95111
text_block = layout.TextRegion(2, 4, 6, 8, text=None)
96112
image_block = layout.ImageTextRegion(8, 14, 16, 18)
97-
doc_layout = [text_block, image_block]
113+
doc_initial_layout = [text_block, image_block]
114+
text_layoutelement = layoutelement.LayoutElement(
115+
2,
116+
4,
117+
6,
118+
8,
119+
text=None,
120+
type="UncategorizedText",
121+
)
122+
image_layoutelement = layoutelement.LayoutElement(8, 14, 16, 18, text=None, type="Image")
123+
doc_final_layout = [text_layoutelement, image_layoutelement]
98124

99125
monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
100126
monkeypatch.setattr(elements, "ocr", lambda *args, **kwargs: "An Even Catchier Title")
@@ -103,24 +129,24 @@ def test_get_page_elements_with_ocr(monkeypatch):
103129
page = layout.PageLayout(
104130
number=0,
105131
image=image,
106-
layout=doc_layout,
107-
model=MockLayoutModel(doc_layout),
132+
layout=doc_initial_layout,
133+
model=MockLayoutModel(doc_final_layout),
108134
)
109135
page.get_elements_with_model()
110136

111137
assert str(page) == "\n\nAn Even Catchier Title"
112138

113139

114-
def test_read_pdf(monkeypatch, mock_page_layout):
140+
def test_read_pdf(monkeypatch, mock_initial_layout, mock_final_layout):
115141
image = np.random.randint(12, 24, (40, 40))
116142
images = [image, image]
117143

118-
layouts = [mock_page_layout, mock_page_layout]
144+
layouts = [mock_initial_layout, mock_initial_layout]
119145

120146
monkeypatch.setattr(
121147
models,
122148
"UnstructuredDetectronModel",
123-
partial(MockLayoutModel, layout=mock_page_layout),
149+
partial(MockLayoutModel, layout=mock_final_layout),
124150
)
125151
monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
126152

@@ -139,8 +165,8 @@ def test_read_pdf(monkeypatch, mock_page_layout):
139165

140166

141167
@pytest.mark.parametrize("model_name", [None, "checkbox", "fake"])
142-
def test_process_data_with_model(monkeypatch, mock_page_layout, model_name):
143-
monkeypatch.setattr(layout, "get_model", lambda x: MockLayoutModel(mock_page_layout))
168+
def test_process_data_with_model(monkeypatch, mock_final_layout, model_name):
169+
monkeypatch.setattr(layout, "get_model", lambda x: MockLayoutModel(mock_final_layout))
144170
monkeypatch.setattr(
145171
layout.DocumentLayout,
146172
"from_file",
@@ -158,11 +184,10 @@ def test_process_data_with_model_raises_on_invalid_model_name():
158184

159185

160186
@pytest.mark.parametrize("model_name", [None, "checkbox"])
161-
def test_process_file_with_model(monkeypatch, mock_page_layout, model_name):
187+
def test_process_file_with_model(monkeypatch, mock_final_layout, model_name):
162188
def mock_initialize(self, *args, **kwargs):
163-
self.model = MockLayoutModel(mock_page_layout)
189+
self.model = MockLayoutModel(mock_final_layout)
164190

165-
monkeypatch.setattr(models, "get_model", lambda x: MockLayoutModel(mock_page_layout))
166191
monkeypatch.setattr(
167192
layout.DocumentLayout,
168193
"from_file",
@@ -276,17 +301,17 @@ def test_get_elements_from_block_raises():
276301

277302

278303
@pytest.mark.parametrize("filetype", ["png", "jpg"])
279-
def test_from_image_file(monkeypatch, mock_page_layout, filetype):
304+
def test_from_image_file(monkeypatch, mock_final_layout, filetype):
280305
def mock_get_elements(self, *args, **kwargs):
281-
self.elements = [mock_page_layout]
306+
self.elements = [mock_final_layout]
282307

283308
monkeypatch.setattr(layout.PageLayout, "get_elements_with_model", mock_get_elements)
284309
elements = (
285310
layout.DocumentLayout.from_image_file(f"sample-docs/loremipsum.{filetype}")
286311
.pages[0]
287312
.elements
288313
)
289-
assert elements[0] == mock_page_layout
314+
assert elements[0] == mock_final_layout
290315

291316

292317
def test_from_image_file_raises_with_empty_fn():
@@ -307,9 +332,9 @@ def test_from_file_raises_on_length_mismatch(monkeypatch):
307332

308333

309334
@pytest.mark.parametrize("idx", range(2))
310-
def test_get_elements_from_layout(mock_page_layout, idx):
311-
page = MockPageLayout(layout=mock_page_layout)
312-
block = mock_page_layout[idx].pad(3)
335+
def test_get_elements_from_layout(mock_initial_layout, idx):
336+
page = MockPageLayout(layout=mock_initial_layout)
337+
block = mock_initial_layout[idx].pad(3)
313338
fixed_layout = [block]
314339
elements = page.get_elements_from_layout(fixed_layout)
315340
assert elements[0].text == block.text
@@ -483,6 +508,7 @@ def test_load_pdf_image_placement():
483508
assert image_region.y2 < images[5].height / 2
484509

485510

511+
@pytest.mark.skip("Temporarily removed multicolumn to fix ordering")
486512
def test_load_pdf_with_multicolumn_layout_and_ocr(filename="sample-docs/design-thinking.pdf"):
487513
layouts, images = layout.load_pdf(filename)
488514
doc = layout.process_file_with_model(filename=filename, model_name=None)
@@ -522,6 +548,34 @@ def test_annotate():
522548
assert ((annotated_array[:, :, 2] == 1).mean()) > 0.992
523549

524550

551+
def test_textregion_returns_empty_ocr_never(mock_image):
552+
tr = elements.TextRegion(0, 0, 24, 24)
553+
assert tr.extract_text(objects=None, image=mock_image, ocr_strategy="never") == ""
554+
555+
556+
@pytest.mark.parametrize(("text", "expected"), [("asdf", "asdf"), (None, "")])
557+
def test_embedded_text_region(text, expected):
558+
etr = elements.EmbeddedTextRegion(0, 0, 24, 24, text=text)
559+
assert etr.extract_text(objects=None) == expected
560+
561+
562+
@pytest.mark.parametrize(
563+
("text", "ocr_strategy", "expected"),
564+
[
565+
(None, "never", ""),
566+
(None, "always", "asdf"),
567+
("i have text", "never", "i have text"),
568+
("i have text", "always", "i have text"),
569+
],
570+
)
571+
def test_image_text_region(text, ocr_strategy, expected, mock_image):
572+
itr = elements.ImageTextRegion(0, 0, 24, 24, text=text)
573+
with patch.object(elements, "ocr", return_value="asdf"):
574+
assert (
575+
itr.extract_text(objects=None, image=mock_image, ocr_strategy=ocr_strategy) == expected
576+
)
577+
578+
525579
@pytest.fixture()
526580
def ordering_layout():
527581
elements = [
@@ -537,7 +591,11 @@ def ordering_layout():
537591

538592

539593
def test_layout_order(ordering_layout):
540-
with patch.object(layout, "get_model", lambda: lambda x: ordering_layout):
594+
with patch.object(layout, "get_model", lambda: lambda x: ordering_layout), patch.object(
595+
layout,
596+
"load_pdf",
597+
lambda *args, **kwargs: ([[]], [mock_image]),
598+
):
541599
doc = layout.DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf")
542600
page = doc.pages[0]
543601
for n, element in enumerate(page.elements):

test_unstructured_inference/models/test_yolox.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -68,4 +68,5 @@ def test_layout_yolox_local_parsing_empty_pdf_soft():
6868
document_layout = process_file_with_model(filename, model_name="yolox_tiny")
6969
assert len(document_layout.pages) == 1
7070
# NOTE(benjamin) The example sent to the test contains 0 detections
71-
assert len(document_layout.pages[0].elements) == 0
71+
text_elements_page_1 = [el for el in document_layout.pages[0].elements if el.type != "Image"]
72+
assert len(text_elements_page_1) == 0

0 commit comments

Comments
 (0)