Skip to content

Commit 575957b

Browse files
pawel-kmiecikchristinestraubmicmarty-deepsense
authored
feat: enhance analysis options with od model dump and better vis (#3234)
This PR adds the ability to draw bounding boxes for each layout (extracted, inferred, OCR, and final) and to dump the OD model output as a JSON file for easier analysis. --------- Co-authored-by: Christine Straub <[email protected]> Co-authored-by: Michal Martyniak <[email protected]>
1 parent f2fee0c commit 575957b

File tree

12 files changed

+1127
-11
lines changed

12 files changed

+1127
-11
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,3 +204,6 @@ examples/**/output/
204204

205205
outputdiff.txt
206206
metricsdiff.txt
207+
208+
# analysis
209+
annotated/

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22

33
### Enhancements
44

5+
* **Added visualization and OD model result dump for PDF** With the PDF `hi_res` strategy, the `analysis` parameter can be used
6+
to visualize the result of the OD model and dump the result to a file.
7+
Additionally, the visualization of bounding boxes of each layout source is rendered and saved
8+
for each page.
9+
510
### Features
611

712
### Fixes

Dockerfile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,10 @@ COPY unstructured unstructured
99
COPY test_unstructured test_unstructured
1010
COPY example-docs example-docs
1111

12-
RUN chown -R notebook-user:notebook-user /app && ln -s /usr/bin/python3.11 /usr/bin/python3
12+
RUN chown -R notebook-user:notebook-user /app && \
13+
apk add font-ubuntu && \
14+
fc-cache -fv && \
15+
ln -s /usr/bin/python3.11 /usr/bin/python3
1316

1417
USER notebook-user
1518

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
import numpy as np
2+
import pytest
3+
from PIL import Image
4+
from unstructured_inference.inference.elements import Rectangle
5+
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
6+
from unstructured_inference.inference.layoutelement import LayoutElement
7+
8+
from unstructured.partition.pdf_image.analysis.bbox_visualisation import (
9+
TextAlignment,
10+
get_bbox_text_size,
11+
get_bbox_thickness,
12+
get_label_rect_and_coords,
13+
get_rgb_color,
14+
get_text_color,
15+
)
16+
from unstructured.partition.pdf_image.analysis.layout_dump import ObjectDetectionLayoutDumper
17+
18+
19+
@pytest.mark.parametrize("color", ["red", "green", "blue", "yellow", "black", "white"])
def test_get_rgb_color(color: str):
    """A named color resolves to a 3-tuple of ints, each within 0..255."""
    rgb = get_rgb_color(color)

    assert isinstance(rgb, tuple)
    assert len(rgb) == 3
    for channel in rgb:
        assert isinstance(channel, int)
        assert 0 <= channel <= 255
27+
28+
29+
@pytest.mark.parametrize(
    ("bbox", "expected_text_size"),
    [
        ((0, 0, 90, 90), 17),
        ((0, 0, 500, 200), 21),
        ((0, 0, 10000, 10000), 32),
    ],
)
def test_get_bbox_text_size(bbox: tuple[int, int, int, int], expected_text_size):
    """Check the text size returned for `bbox` on a 1700x2200 page."""
    standard_page = (1700, 2200)  # standard size of a page

    assert get_bbox_text_size(bbox, standard_page) == expected_text_size
42+
43+
44+
@pytest.mark.parametrize(
    ("bbox", "expected_box_thickness"),
    [
        ((0, 0, 90, 90), 1),
        ((0, 0, 450, 250), 2),
        ((0, 0, 600, 1000), 3),
    ],
)
def test_get_bbox_thickness(bbox: tuple[int, int, int, int], expected_box_thickness):
    """Check the outline thickness returned for `bbox` on a 1700x2200 page."""
    standard_page = (1700, 2200)  # standard size of a page

    assert get_bbox_thickness(bbox, standard_page) == expected_box_thickness
57+
58+
59+
@pytest.mark.parametrize(
    ("color", "expected_text_color"),
    [
        ("navy", "white"),
        ("crimson", "white"),
        ("maroon", "white"),
        ("dimgray", "white"),
        ("darkgreen", "white"),
        ("darkcyan", "white"),
        ("fuchsia", "white"),
        ("violet", "black"),
        ("gold", "black"),
        ("aqua", "black"),
        ("greenyellow", "black"),
    ],
)
def test_best_text_color(color, expected_text_color):
    """The second value returned by `get_text_color` must be the expected color's RGB."""
    source_rgb = get_rgb_color(color)
    wanted_rgb = get_rgb_color(expected_text_color)

    # Only the second item of the returned pair is checked here.
    _, chosen_rgb = get_text_color(source_rgb)

    assert chosen_rgb == wanted_rgb
81+
82+
83+
@pytest.mark.parametrize(
    ("alignment", "expected_text_bbox"),
    [
        (TextAlignment.CENTER, ((145, 145), (155, 155))),
        (TextAlignment.TOP_LEFT, ((100, 90), (120, 100))),
        (TextAlignment.TOP_RIGHT, ((180, 100), (200, 110))),
        (TextAlignment.BOTTOM_LEFT, ((100, 190), (120, 200))),
        (TextAlignment.BOTTOM_RIGHT, ((180, 190), (200, 200))),
    ],
)
def test_get_text_bbox(alignment, expected_text_bbox):
    """Check the label rectangle returned for each alignment of a 10x10 text box."""
    # The second returned value (the text coordinates) is not verified by this test.
    label_rect, _ = get_label_rect_and_coords(
        alignment=alignment,
        bbox_points=(100, 100, 200, 200),
        text_width=10,
        text_height=10,
    )

    # adding high atol to account for the text-based extending of the bbox
    assert np.allclose(label_rect, expected_text_bbox, atol=10)
99+
100+
101+
def test_od_document_layout_dump():
    """Dump a two-page object-detection layout and verify the resulting structure.

    The "pages" section of the dump must match the expected dict exactly, and the
    dump must also carry a non-empty "object_detection_classes" entry.
    """
    # (page number, [(element type, (x1, y1, x2, y2), probability), ...])
    page_specs = [
        (1, [("Title", (0, 0, 10, 10), 0.7), ("Paragraph", (0, 100, 10, 110), 0.8)]),
        (2, [("Table", (0, 0, 10, 10), 0.9), ("Image", (0, 100, 10, 110), 1.0)]),
    ]

    pages = []
    expected_pages = []
    for number, element_specs in page_specs:
        page = PageLayout(
            number=number,
            image=Image.new("1", (1, 1)),
            image_metadata={"width": 100, "height": 100},
        )
        page.elements = [
            LayoutElement(type=kind, bbox=Rectangle(x1=x1, y1=y1, x2=x2, y2=y2), prob=prob)
            for kind, (x1, y1, x2, y2), prob in element_specs
        ]
        pages.append(page)

        expected_pages.append(
            {
                "number": number,
                "size": {
                    "width": 100,
                    "height": 100,
                },
                "elements": [
                    {"bbox": [x1, y1, x2, y2], "type": kind, "prob": prob}
                    for kind, (x1, y1, x2, y2), prob in element_specs
                ],
            }
        )

    dumped = ObjectDetectionLayoutDumper(DocumentLayout(pages=pages)).dump()

    assert {"pages": dumped.get("pages")} == {"pages": expected_pages}

    # check OD model classes are attached but do not depend on a specific model instance
    assert "object_detection_classes" in dumped
    assert len(dumped["object_detection_classes"]) > 0

test_unstructured/partition/pdf_image/test_pdf.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import math
44
import os
55
import tempfile
6+
from pathlib import Path
67
from tempfile import SpooledTemporaryFile
78
from unittest import mock
89

@@ -1322,3 +1323,32 @@ def test_unique_and_deterministic_element_ids(strategy, expected_ids):
13221323
)
13231324
ids = [element.id for element in elements]
13241325
assert ids == expected_ids, "Element IDs do not match expected IDs"
1326+
1327+
1328+
def test_analysis_artifacts_saved():
    """Run hi_res partitioning with analysis enabled and check the files it writes.

    Expects one `object_detection.json` layout dump and eight bbox images
    (two pages, four layouts each) under `<output dir>/analysis/<document name>/`.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        pdf.partition_pdf(
            filename=example_doc_path("layout-parser-paper-fast.pdf"),
            strategy=PartitionStrategy.HI_RES,
            analysis=True,
            analyzed_image_output_dir_path=temp_dir,
        )

        document_dir = Path(temp_dir) / "analysis" / "layout-parser-paper-fast"

        layout_dump_dir = document_dir / "layout_dump"
        assert layout_dump_dir.exists()
        assert len(list(layout_dump_dir.iterdir())) == 1
        assert (layout_dump_dir / "object_detection.json").exists()

        bboxes_dir = document_dir / "bboxes"
        assert bboxes_dir.exists()
        rendered_images = list(bboxes_dir.iterdir())
        assert len(rendered_images) == 2 * 4  # 2 pages * 4 different layouts per page

        expected_images = [
            bboxes_dir / f"page{page}_layout_{layout}.png"
            for layout in ("od_model", "ocr", "pdfminer", "final")
            for page in (1, 2)
        ]
        for image_path in expected_images:
            assert image_path in rendered_images

unstructured/partition/pdf.py

Lines changed: 83 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,19 @@
4646
spooled_to_bytes_io_if_needed,
4747
)
4848
from unstructured.partition.lang import check_language_args, prepare_languages_for_tesseract
49+
from unstructured.partition.pdf_image.analysis.bbox_visualisation import (
50+
AnalysisDrawer,
51+
FinalLayoutDrawer,
52+
OCRLayoutDrawer,
53+
ODModelLayoutDrawer,
54+
PdfminerLayoutDrawer,
55+
)
56+
from unstructured.partition.pdf_image.analysis.layout_dump import (
57+
JsonLayoutDumper,
58+
ObjectDetectionLayoutDumper,
59+
)
4960
from unstructured.partition.pdf_image.form_extraction import run_form_extraction
5061
from unstructured.partition.pdf_image.pdf_image_utils import (
51-
annotate_layout_elements,
5262
check_element_types_to_extract,
5363
convert_pdf_to_images,
5464
get_the_last_modification_date_pdf_or_img,
@@ -533,6 +543,13 @@ def _partition_pdf_or_image_local(
533543
f"(currently {pdf_image_dpi}).",
534544
)
535545

546+
pdfminer_drawer: Optional[PdfminerLayoutDrawer] = None
547+
od_model_drawer: Optional[ODModelLayoutDrawer] = None
548+
ocr_drawer: Optional[OCRLayoutDrawer] = None
549+
od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
550+
skip_bboxes = env_config.ANALYSIS_BBOX_SKIP
551+
skip_dump_od = env_config.ANALYSIS_DUMP_OD_SKIP
552+
536553
if file is None:
537554
inferred_document_layout = process_file_with_model(
538555
filename,
@@ -561,15 +578,19 @@ def _partition_pdf_or_image_local(
561578
else:
562579
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
563580
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
564-
annotate_layout_elements(
565-
inferred_document_layout=inferred_document_layout,
566-
extracted_layout=extracted_layout,
567-
filename=filename,
568-
output_dir_path=analyzed_image_output_dir_path,
569-
pdf_image_dpi=pdf_image_dpi,
570-
is_image=is_image,
571-
)
572-
581+
if not skip_bboxes:
582+
pdfminer_drawer = PdfminerLayoutDrawer(
583+
layout=extracted_layout,
584+
)
585+
od_model_drawer = ODModelLayoutDrawer(
586+
layout=inferred_document_layout,
587+
)
588+
ocr_drawer = OCRLayoutDrawer()
589+
if not skip_dump_od:
590+
od_model_layout_dumper = ObjectDetectionLayoutDumper(
591+
layout=inferred_document_layout,
592+
model_name=hi_res_model_name,
593+
)
573594
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
574595
merged_document_layout = merge_inferred_with_extracted_layout(
575596
inferred_document_layout=inferred_document_layout,
@@ -586,6 +607,7 @@ def _partition_pdf_or_image_local(
586607
ocr_languages=ocr_languages,
587608
ocr_mode=ocr_mode,
588609
pdf_image_dpi=pdf_image_dpi,
610+
ocr_drawer=ocr_drawer,
589611
)
590612
else:
591613
inferred_document_layout = process_data_with_model(
@@ -609,6 +631,23 @@ def _partition_pdf_or_image_local(
609631
else []
610632
)
611633

634+
if analysis:
635+
if not analyzed_image_output_dir_path:
636+
if env_config.GLOBAL_WORKING_DIR_ENABLED:
637+
analyzed_image_output_dir_path = str(
638+
Path(env_config.GLOBAL_WORKING_PROCESS_DIR) / "annotated"
639+
)
640+
else:
641+
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
642+
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
643+
pdfminer_drawer = PdfminerLayoutDrawer(
644+
layout=extracted_layout,
645+
)
646+
od_model_drawer = ODModelLayoutDrawer(
647+
layout=inferred_document_layout,
648+
)
649+
ocr_drawer = OCRLayoutDrawer()
650+
612651
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
613652
merged_document_layout = merge_inferred_with_extracted_layout(
614653
inferred_document_layout=inferred_document_layout,
@@ -627,6 +666,7 @@ def _partition_pdf_or_image_local(
627666
ocr_languages=ocr_languages,
628667
ocr_mode=ocr_mode,
629668
pdf_image_dpi=pdf_image_dpi,
669+
ocr_drawer=ocr_drawer,
630670
)
631671

632672
# NOTE(alan): starting with v2, chipper sorts the elements itself.
@@ -715,6 +755,39 @@ def _partition_pdf_or_image_local(
715755
)
716756
out_elements.extend(forms)
717757

758+
if analysis and not skip_bboxes:
759+
final_drawer = FinalLayoutDrawer(
760+
layout=out_elements,
761+
)
762+
analysis_drawer = AnalysisDrawer(
763+
filename=filename,
764+
save_dir=analyzed_image_output_dir_path,
765+
draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID,
766+
draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
767+
resize=env_config.ANALYSIS_BBOX_RESIZE,
768+
format=env_config.ANALYSIS_BBOX_FORMAT,
769+
)
770+
771+
if od_model_drawer:
772+
analysis_drawer.add_drawer(od_model_drawer)
773+
774+
if pdfminer_drawer:
775+
analysis_drawer.add_drawer(pdfminer_drawer)
776+
777+
if ocr_drawer:
778+
analysis_drawer.add_drawer(ocr_drawer)
779+
analysis_drawer.add_drawer(final_drawer)
780+
analysis_drawer.process()
781+
782+
if analysis and not skip_dump_od:
783+
json_layout_dumper = JsonLayoutDumper(
784+
filename=filename,
785+
save_dir=analyzed_image_output_dir_path,
786+
)
787+
if od_model_layout_dumper:
788+
json_layout_dumper.add_layout_dumper(od_model_layout_dumper)
789+
json_layout_dumper.process()
790+
718791
return out_elements
719792

720793

unstructured/partition/pdf_image/analysis/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)