Skip to content

Commit eaa8d65

Browse files
benjats07quedbadGarnet
authored
Fix/nested bounding boxes (#201)
This PR implements two major changes: * Replaces detectron2 with Yolox_quantized as the default model * Introduces an algorithm for reducing nested elements detected by any model. As a benefit of these changes, it is now possible to detect more diverse element types. * Adds a property to the `Rectangle` class to register the origin of the data. * Adds functionality to the `annotate` function to skip elements of certain origins * Adds functionality to the `annotate` function to print additional details of bounding boxes * Updates tests --------- Co-authored-by: Alan Bertl <[email protected]> Co-authored-by: qued <[email protected]> Co-authored-by: Yao You <[email protected]>
1 parent 5e73202 commit eaa8d65

File tree

18 files changed

+506
-169
lines changed

18 files changed

+506
-169
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
## 0.6.1
2+
3+
* YoloX_quantized is now the default model. This model detects more diverse element types and detects tables better than the previous model.
4+
* Since detection models tend to nest elements inside others (specifically in Tables), an algorithm has been added for reducing this
5+
behavior. Now all the elements produced by detection models are disjoint and they don't produce overlapping regions, which helps
6+
reduce duplicated content.
7+
* Add `source` property to our elements, so you can know where the information was generated (OCR or detection model)
8+
19
## 0.6.0
210

311
* add a config class to handle parameter configurations for inference tasks; parameters in the config class can be set via environment variables

test_unstructured_inference/conftest.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def mock_text_region():
2828

2929
@pytest.fixture()
3030
def mock_layout_element():
31-
return LayoutElement(100, 100, 300, 300, text="Sample text", type="Text")
31+
return LayoutElement(100, 100, 300, 300, text="Sample text", source=None, type="Text")
3232

3333

3434
@pytest.fixture()
@@ -110,9 +110,9 @@ def mock_embedded_text_regions():
110110
@pytest.fixture()
111111
def mock_ocr_regions():
112112
return [
113-
EmbeddedTextRegion(10, 10, 90, 90, "0"),
114-
EmbeddedTextRegion(200, 200, 300, 300, "1"),
115-
EmbeddedTextRegion(500, 320, 600, 350, "3"),
113+
EmbeddedTextRegion(10, 10, 90, 90, text="0", source=None),
114+
EmbeddedTextRegion(200, 200, 300, 300, text="1", source=None),
115+
EmbeddedTextRegion(500, 320, 600, 350, text="3", source=None),
116116
]
117117

118118

@@ -141,6 +141,7 @@ def mock_inferred_layout(mock_embedded_text_regions):
141141
r.x2,
142142
r.y2,
143143
text=None,
144+
source=None,
144145
type="Text",
145146
)
146147
for r in mock_embedded_text_regions

test_unstructured_inference/inference/test_layout.py

Lines changed: 65 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,16 @@ def mock_image():
2828

2929
@pytest.fixture()
3030
def mock_initial_layout():
31-
text_block = layout.EmbeddedTextRegion(2, 4, 6, 8, text="A very repetitive narrative. " * 10)
31+
text_block = layout.EmbeddedTextRegion(
32+
2,
33+
4,
34+
6,
35+
8,
36+
text="A very repetitive narrative. " * 10,
37+
source="Mock",
38+
)
3239

33-
title_block = layout.EmbeddedTextRegion(1, 2, 3, 4, text="A Catchy Title")
40+
title_block = layout.EmbeddedTextRegion(1, 2, 3, 4, text="A Catchy Title", source="Mock")
3441

3542
return [text_block, title_block]
3643

@@ -42,11 +49,20 @@ def mock_final_layout():
4249
4,
4350
6,
4451
8,
52+
source="Mock",
4553
text="A very repetitive narrative. " * 10,
4654
type="NarrativeText",
4755
)
4856

49-
title_block = layoutelement.LayoutElement(1, 2, 3, 4, text="A Catchy Title", type="Title")
57+
title_block = layoutelement.LayoutElement(
58+
1,
59+
2,
60+
3,
61+
4,
62+
source="Mock",
63+
text="A Catchy Title",
64+
type="Title",
65+
)
5066

5167
return [text_block, title_block]
5268

@@ -709,8 +725,11 @@ def test_load_pdf_with_multicolumn_layout_and_ocr(filename="sample-docs/design-t
709725
assert element.text.startswith(test_snippets[i])
710726

711727

712-
@pytest.mark.parametrize("colors", ["red", None])
713-
def test_annotate(colors):
728+
@pytest.mark.parametrize(
729+
("colors", "add_details", "threshold"),
730+
[("red", False, 0.992), (None, False, 0.992), ("red", True, 0.8)],
731+
)
732+
def test_annotate(colors, add_details, threshold):
714733
def check_annotated_image():
715734
annotated_array = np.array(annotated_image)
716735
for coords in [coords1, coords2]:
@@ -722,9 +741,9 @@ def check_annotated_image():
722741
assert all(annotated_array[y1:y2, x1, i] == expected)
723742
assert all(annotated_array[y1:y2, x2, i] == expected)
724743
# Make sure almost all the pixels are not changed
725-
assert ((annotated_array[:, :, 0] == 1).mean()) > 0.992
726-
assert ((annotated_array[:, :, 1] == 1).mean()) > 0.992
727-
assert ((annotated_array[:, :, 2] == 1).mean()) > 0.992
744+
assert ((annotated_array[:, :, 0] == 1).mean()) > threshold
745+
assert ((annotated_array[:, :, 1] == 1).mean()) > threshold
746+
assert ((annotated_array[:, :, 2] == 1).mean()) > threshold
728747

729748
test_image_arr = np.ones((100, 100, 3), dtype="uint8")
730749
image = Image.fromarray(test_image_arr)
@@ -735,15 +754,18 @@ def check_annotated_image():
735754
rect2 = elements.Rectangle(*coords2)
736755
page.elements = [rect1, rect2]
737756

757+
annotated_image = page.annotate(colors=colors, add_details=add_details, sources=["all"])
758+
check_annotated_image()
759+
738760
# Scenario 1: where self.image exists
739-
annotated_image = page.annotate(colors=colors)
761+
annotated_image = page.annotate(colors=colors, add_details=add_details)
740762
check_annotated_image()
741763

742764
# Scenario 2: where self.image is None, but self.image_path exists
743765
with patch.object(Image, "open", return_value=image):
744766
page.image = None
745767
page.image_path = "mock_path_to_image"
746-
annotated_image = page.annotate(colors=colors)
768+
annotated_image = page.annotate(colors=colors, add_details=add_details)
747769
check_annotated_image()
748770

749771

@@ -775,32 +797,30 @@ def test_image_text_region(text, ocr_strategy, expected, mock_image):
775797
)
776798

777799

778-
@pytest.fixture()
779-
def ordering_layout():
780-
elements = [
781-
layout.LayoutElement(x1=447.0, y1=315.0, x2=1275.7, y2=413.0, text="0"),
782-
layout.LayoutElement(x1=380.6, y1=473.4, x2=1334.8, y2=533.9, text="1"),
783-
layout.LayoutElement(x1=578.6, y1=556.8, x2=1109.0, y2=874.4, text="2"),
784-
layout.LayoutElement(x1=444.5, y1=942.3, x2=1261.1, y2=1584.1, text="3"),
785-
layout.LayoutElement(x1=444.8, y1=1609.4, x2=1257.2, y2=1665.2, text="4"),
786-
layout.LayoutElement(x1=414.0, y1=1718.8, x2=635.0, y2=1755.2, text="5"),
787-
layout.LayoutElement(x1=372.6, y1=1786.9, x2=1333.6, y2=1848.7, text="6"),
788-
]
789-
return elements
800+
class MockDetectionModel(layout.UnstructuredObjectDetectionModel):
    """Stub detection model that always predicts the same seven elements."""

    def initialize(self, *args, **kwargs):
        """Do nothing: the stub has no weights or resources to load."""

    def predict(self, x):
        """Return the fixed elements, labelled "0".."6" in list order; *x* is ignored."""
        boxes = (
            (447.0, 315.0, 1275.7, 413.0),
            (380.6, 473.4, 1334.8, 533.9),
            (578.6, 556.8, 1109.0, 874.4),
            (444.5, 942.3, 1261.1, 1584.1),
            (444.8, 1609.4, 1257.2, 1665.2),
            (414.0, 1718.8, 635.0, 1755.2),
            (372.6, 1786.9, 1333.6, 1848.7),
        )
        return [
            layout.LayoutElement(x1=left, y1=top, x2=right, y2=bottom, text=str(position))
            for position, (left, top, right, bottom) in enumerate(boxes)
        ]
790814

791815

792-
def test_layout_order(mock_image, ordering_layout):
816+
def test_layout_order(mock_image):
793817
with tempfile.TemporaryDirectory() as tmpdir:
794818
mock_image_path = os.path.join(tmpdir, "mock.jpg")
795819
mock_image.save(mock_image_path)
796-
with patch.object(layout, "get_model", lambda: lambda x: ordering_layout), patch.object(
820+
with patch.object(layout, "get_model", lambda: MockDetectionModel()), patch.object(
797821
layout,
798822
"load_pdf",
799823
lambda *args, **kwargs: ([[]], [mock_image_path]),
800-
), patch.object(
801-
layout,
802-
"UnstructuredObjectDetectionModel",
803-
object,
804824
):
805825
doc = layout.DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf")
806826
page = doc.pages[0]
@@ -960,3 +980,20 @@ def test_warning_if_chipper_and_low_dpi(caplog):
960980
mock_from_file.assert_called_once()
961981
assert caplog.records[0].levelname == "WARNING"
962982
assert "DPI >= 300" in caplog.records[0].msg
983+
984+
985+
@pytest.mark.parametrize(
    ("filename", "img_num", "should_complete"),
    [("sample-docs/empty-document.pdf", 0, True), ("sample-docs/empty-document.pdf", 10, False)],
)
def test_get_image(filename, img_num, should_complete):
    """Check `_get_image` for a valid and an invalid image number.

    When ``should_complete`` is true the returned image must be completely
    white (the sample document is blank). Otherwise the call is required to
    raise ``ValueError``; the previous try/except form let this case pass
    even when no exception was raised.
    """
    doc = layout.DocumentLayout.from_file(filename)
    page = doc.pages[0]

    if not should_complete:
        # The failure case must actually fail, not merely be allowed to.
        with pytest.raises(ValueError):
            page._get_image(filename, img_num)
        return

    # transform img to numpy array
    img = np.array(page._get_image(filename, img_num))
    # is a blank image with all pixels white
    assert img.mean() == 255.0

test_unstructured_inference/inference/test_layout_element.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,12 @@
1717
def test_aggregate_ocr_text_by_block():
1818
expected = "A Unified Toolkit"
1919
ocr_layout = [
20-
TextRegion(0, 0, 20, 20, "A"),
21-
TextRegion(50, 50, 150, 150, "Unified"),
22-
TextRegion(150, 150, 300, 250, "Toolkit"),
23-
TextRegion(200, 250, 300, 350, "Deep"),
20+
TextRegion(0, 0, 20, 20, source="OCR", text="A"),
21+
TextRegion(50, 50, 150, 150, source="OCR", text="Unified"),
22+
TextRegion(150, 150, 300, 250, source="OCR", text="Toolkit"),
23+
TextRegion(200, 250, 300, 350, source="OCR", text="Deep"),
2424
]
25-
region = TextRegion(0, 0, 250, 350, "")
25+
region = TextRegion(0, 0, 250, 350, text="")
2626

2727
text = aggregate_ocr_text_by_block(ocr_layout, region, 0.5)
2828
assert text == expected
@@ -65,6 +65,7 @@ def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions):
6565
r.x2,
6666
r.y2,
6767
text=r.text,
68+
source=None,
6869
type="UncategorizedText",
6970
)
7071
for r in mock_ocr_regions
@@ -94,6 +95,7 @@ def test_merge_inferred_layout_with_ocr_layout(mock_inferred_layout, mock_ocr_re
9495
r.x2,
9596
r.y2,
9697
text=r.text,
98+
source=None,
9799
type="UncategorizedText",
98100
)
99101
for r in mock_ocr_regions
@@ -138,6 +140,7 @@ def test_layout_element_do_dict(mock_layout_element):
138140
"text": "Sample text",
139141
"type": "Text",
140142
"prob": None,
143+
"source": None,
141144
}
142145

143146
assert mock_layout_element.to_dict() == expected
@@ -157,6 +160,14 @@ def test_layout_element_from_lp_textblock():
157160
score=0.99,
158161
)
159162

160-
expected = LayoutElement(100, 100, 300, 300, "Sample Text", "Text", 0.99)
161-
163+
expected = LayoutElement(
164+
100,
165+
100,
166+
300,
167+
300,
168+
text="Sample Text",
169+
source="detectron2_lp",
170+
type="Text",
171+
prob=0.99,
172+
)
162173
assert LayoutElement.from_lp_textblock(mock_text_block) == expected

test_unstructured_inference/models/test_model.py

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ def test_raises_uninitialized():
5858
def test_model_initializes_once():
5959
from unstructured_inference.inference import layout
6060

61-
with mock.patch.object(models, "UnstructuredDetectronONNXModel", MockModel), mock.patch.object(
61+
with mock.patch.object(models, "UnstructuredYoloXModel", MockModel), mock.patch.object(
6262
models,
6363
"models",
6464
{},
@@ -72,3 +72,77 @@ def test_model_initializes_once():
7272
assert (
7373
doc.pages[0].elements[0].prob is None
7474
) # NOTE(pravin) New Assertion to Make Sure Uncategorized Text has None Probability
75+
76+
77+
def test_deduplicate_detected_elements():
    """Check that elements detected by the model do not overlap each other.

    Runs the ``yolox_quantized`` model on a sample table image without OCR,
    keeps every element whose type is not "UncategorizedText", and asserts
    that no pair of distinct elements intersects.
    """
    import numpy as np

    from unstructured_inference.inference.elements import intersections
    from unstructured_inference.inference.layout import DocumentLayout
    from unstructured_inference.models.base import get_model

    model = get_model("yolox_quantized")
    file = "sample-docs/example_table.jpg"
    doc = DocumentLayout.from_image_file(
        file,
        model,
        ocr_strategy="never",
        supplement_with_ocr_elements=False,
    )
    known_elements = [e for e in doc.pages[0].elements if e.type != "UncategorizedText"]
    # Compute intersection matrix
    intersections_mtx = intersections(*known_elements)
    # Clear the diagonal, because an element always intersects itself
    np.fill_diagonal(intersections_mtx, False)
    # No intersection may remain. This must be an assert: pytest ignores a
    # test's return value, so returning the result could never fail the test.
    assert not intersections_mtx.any()
100+
101+
102+
def test_enhance_regions():
    """Check that `enhance_regions` merges heavily overlapping rectangles.

    Eleven unit squares, each shifted by 0.01 from the previous one, are
    expected to collapse into a single rectangle spanning all of them.
    """
    from unstructured_inference.inference.elements import Rectangle
    from unstructured_inference.models.base import get_model

    elements = [
        Rectangle(0, 0, 1, 1),
        Rectangle(0.01, 0.01, 1.01, 1.01),
        Rectangle(0.02, 0.02, 1.02, 1.02),
        Rectangle(0.03, 0.03, 1.03, 1.03),
        Rectangle(0.04, 0.04, 1.04, 1.04),
        Rectangle(0.05, 0.05, 1.05, 1.05),
        Rectangle(0.06, 0.06, 1.06, 1.06),
        Rectangle(0.07, 0.07, 1.07, 1.07),
        Rectangle(0.08, 0.08, 1.08, 1.08),
        Rectangle(0.09, 0.09, 1.09, 1.09),
        Rectangle(0.10, 0.10, 1.10, 1.10),
    ]
    model = get_model("yolox_tiny")
    elements = model.enhance_regions(elements, 0.5)
    assert len(elements) == 1
    # Compare all four coordinates; the original read x2 twice and never checked y2.
    merged = elements[0]
    assert (merged.x1, merged.y1, merged.x2, merged.y2) == (0, 0, 1.10, 1.10)
123+
124+
125+
def test_clean_type():
    """Check that `clean_type` keeps only the outermost "Table" element.

    Three nested tables plus five untyped elements are passed in; exactly
    one element, the big table at (0, 0, 1, 1), is expected back.
    """
    from unstructured_inference.inference.layout import LayoutElement
    from unstructured_inference.models.base import get_model

    elements = [
        LayoutElement(
            0.6,
            0.6,
            0.65,
            0.65,
            type="Table",
        ),  # One little table nested inside all the others
        LayoutElement(0.5, 0.5, 0.7, 0.7, type="Table"),  # One nested table
        LayoutElement(0, 0, 1, 1, type="Table"),  # Big table
        LayoutElement(0.01, 0.01, 1.01, 1.01),
        LayoutElement(0.02, 0.02, 1.02, 1.02),
        LayoutElement(0.03, 0.03, 1.03, 1.03),
        LayoutElement(0.04, 0.04, 1.04, 1.04),
        LayoutElement(0.05, 0.05, 1.05, 1.05),
    ]
    model = get_model("yolox_tiny")
    elements = model.clean_type(elements, type_to_clean="Table")
    assert len(elements) == 1
    # Compare all four coordinates; the original read x2 twice and never checked y2.
    kept = elements[0]
    assert (kept.x1, kept.y1, kept.x2, kept.y2) == (0, 0, 1, 1)

test_unstructured_inference/models/test_yolox.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,9 @@ def test_layout_yolox_local_parsing_image():
1414
# NOTE(benjamin) The example image should result in one page result
1515
assert len(document_layout.pages) == 1
1616
# NOTE(benjamin) The example sent to the test contains 13 detections
17-
assert len(document_layout.pages[0].elements) == 13
17+
types_known = ["Text", "Section-header", "Page-header"]
18+
known_regions = [e for e in document_layout.pages[0].elements if e.type in types_known]
19+
assert len(known_regions) == 13
1820
assert hasattr(
1921
document_layout.pages[0].elements[0],
2022
"prob",
@@ -32,8 +34,9 @@ def test_layout_yolox_local_parsing_pdf():
3234
content = str(document_layout)
3335
assert "libero fringilla" in content
3436
assert len(document_layout.pages) == 1
35-
# NOTE(benjamin) The example sent to the test contains 5 detections
36-
assert len(document_layout.pages[0].elements) == 5
37+
# NOTE(benjamin) The example sent to the test contains 5 text detections
38+
text_elements = [e for e in document_layout.pages[0].elements if e.type == "Text"]
39+
assert len(text_elements) == 5
3740
assert hasattr(
3841
document_layout.pages[0].elements[0],
3942
"prob",
@@ -59,10 +62,10 @@ def test_layout_yolox_local_parsing_empty_pdf():
5962

6063

6164
def test_layout_yolox_local_parsing_image_soft():
62-
filename = os.path.join("sample-docs", "test-image.jpg")
65+
filename = os.path.join("sample-docs", "example_table.jpg")
6366
# NOTE(benjamin) keep_output = True create a file for each image in
6467
# localstorage for visualization of the result
65-
document_layout = process_file_with_model(filename, model_name="yolox_tiny", is_image=True)
68+
document_layout = process_file_with_model(filename, model_name="yolox_quantized", is_image=True)
6669
# NOTE(benjamin) The example image should result in one page result
6770
assert len(document_layout.pages) == 1
6871
# NOTE(benjamin) Soft version of the test, run make test-long in order to run with full model

0 commit comments

Comments
 (0)