Skip to content

Commit 404f780

Browse files
feat: make analysis drawing more flexible (#3574)
This PR changes the way the analysis tools can be used: - by default if `analysis` is set to `True` in `partition_pdf` and the strategy is resolved to `hi_res`: - for each file 4 layout dumps are produced and saved as JSON files (`object_detection`, `pdfminer` for the extracted layout, `ocr`, `final`) - in a similar way to the current `object_detection` dump - the drawing functions/classes now accept these dumps instead of instances of the internal classes (like `TextRegion`, `DocumentLayout`) - this makes it possible to use the lightweight JSON files to render the bboxes of a given file after the partition is done - `_partition_pdf_or_image_local` has been refactored and most of the analysis code is now encapsulated in the `save_analysis_artifiacts` function - to do this, the helper function `render_bboxes_for_file` is added <img width="338" alt="Screenshot 2024-08-28 at 14 37 56" src="https://github.com/user-attachments/assets/10b6fbbd-7824-448d-8c11-52fc1b1b0dd0">
1 parent 04322d1 commit 404f780

File tree

8 files changed

+510
-207
lines changed

8 files changed

+510
-207
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
## 0.15.10-dev0
1+
## 0.15.10-dev1
22

33
### Enhancements
4+
* **Modified analysis drawing tools to dump to files and draw from dumps** If the `analysis` parameter of the `partition_pdf` function is set to `True`, the Object Detection, Pdfminer Extraction, OCR and final layouts will be dumped as JSON files. The drawers now accept dict (dump) objects instead of instances of internal classes.
45

56
### Features
67

test_unstructured/partition/pdf_image/test_pdf.py

Lines changed: 44 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1336,33 +1336,67 @@ def test_unique_and_deterministic_element_ids(strategy, expected_ids):
13361336
assert ids == expected_ids, "Element IDs do not match expected IDs"
13371337

13381338

1339-
def test_analysis_artifacts_saved():
1339+
@pytest.mark.parametrize("is_path", [True, False])
1340+
@pytest.mark.parametrize(
1341+
("example_doc", "doc_pages"),
1342+
[
1343+
("pdf/layout-parser-paper-fast.pdf", 2),
1344+
("img/DA-1p.png", 1),
1345+
],
1346+
)
1347+
def test_analysis_artifacts_saved(is_path: bool, example_doc: str, doc_pages: int):
13401348
with tempfile.TemporaryDirectory() as temp_dir:
1341-
filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
1349+
file = None
1350+
filename = example_doc_path(example_doc)
1351+
is_image = not Path(filename).suffix.endswith("pdf")
1352+
if not is_path:
1353+
file = open(filename, "rb") # noqa: SIM115
1354+
filename = None
13421355
pdf.partition_pdf(
13431356
filename=filename,
1357+
file=file,
1358+
is_image=is_image,
13441359
strategy=PartitionStrategy.HI_RES,
13451360
analysis=True,
13461361
analyzed_image_output_dir_path=temp_dir,
13471362
)
13481363

13491364
analysis_dir = Path(temp_dir)
1350-
layout_dump_dir = analysis_dir / "analysis" / "layout-parser-paper-fast" / "layout_dump"
1365+
file_analysis_root = None
1366+
if is_path:
1367+
file_analysis_root = analysis_dir / "analysis" / Path(example_doc).stem
1368+
else:
1369+
# if file is not a path, the filename is None and the analysis directory
1370+
# for the document is generated
1371+
generated_file_stem_path = list((analysis_dir / "analysis").iterdir())[0]
1372+
if is_image:
1373+
assert "image" in generated_file_stem_path.name
1374+
else:
1375+
assert "pdf" in generated_file_stem_path.name
1376+
file_analysis_root = generated_file_stem_path
1377+
layout_dump_dir = file_analysis_root / "layout_dump"
13511378
assert layout_dump_dir.exists()
13521379
layout_dump_files = list(layout_dump_dir.iterdir())
1353-
assert len(layout_dump_files) == 1
1354-
assert (layout_dump_dir / "object_detection.json").exists()
13551380

1356-
bboxes_dir = analysis_dir / "analysis" / "layout-parser-paper-fast" / "bboxes"
1381+
expected_layout_dumps = ["object_detection", "ocr", "pdfminer", "final"]
1382+
assert len(layout_dump_files) == len(expected_layout_dumps)
1383+
1384+
for expected_layout_dump in expected_layout_dumps:
1385+
assert (layout_dump_dir / f"{expected_layout_dump}.json").exists()
1386+
1387+
bboxes_dir = file_analysis_root / "bboxes"
13571388
assert bboxes_dir.exists()
13581389
bboxes_files = list(bboxes_dir.iterdir())
1359-
assert len(bboxes_files) == 2 * 4 # 2 pages * 4 different layouts per page
13601390

1361-
expected_layouts = ["od_model", "ocr", "pdfminer", "final"]
1362-
expected_pages = [1, 2]
1363-
for el in expected_layouts:
1391+
expected_renders = ["od_model", "ocr", "pdfminer", "final"]
1392+
assert len(bboxes_files) == doc_pages * len(expected_renders)
1393+
1394+
expected_pages = range(1, doc_pages + 1)
1395+
for el in expected_renders:
13641396
for page in expected_pages:
13651397
assert bboxes_dir / f"page{page}_layout_{el}.png" in bboxes_files
1398+
if file:
1399+
file.close()
13661400

13671401

13681402
@pytest.mark.parametrize(

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.15.10-dev0" # pragma: no cover
1+
__version__ = "0.15.10-dev1" # pragma: no cover

unstructured/partition/pdf.py

Lines changed: 46 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -53,16 +53,12 @@
5353
prepare_languages_for_tesseract,
5454
tesseract_to_paddle_language,
5555
)
56-
from unstructured.partition.pdf_image.analysis.bbox_visualisation import (
57-
AnalysisDrawer,
58-
FinalLayoutDrawer,
59-
OCRLayoutDrawer,
60-
ODModelLayoutDrawer,
61-
PdfminerLayoutDrawer,
62-
)
56+
from unstructured.partition.pdf_image.analysis import save_analysis_artifiacts
6357
from unstructured.partition.pdf_image.analysis.layout_dump import (
64-
JsonLayoutDumper,
58+
ExtractedLayoutDumper,
59+
FinalLayoutDumper,
6560
ObjectDetectionLayoutDumper,
61+
OCRLayoutDumper,
6662
)
6763
from unstructured.partition.pdf_image.form_extraction import run_form_extraction
6864
from unstructured.partition.pdf_image.pdf_image_utils import (
@@ -589,12 +585,12 @@ def _partition_pdf_or_image_local(
589585
f"(currently {pdf_image_dpi}).",
590586
)
591587

592-
pdfminer_drawer: Optional[PdfminerLayoutDrawer] = None
593-
od_model_drawer: Optional[ODModelLayoutDrawer] = None
594-
ocr_drawer: Optional[OCRLayoutDrawer] = None
595588
od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
596-
skip_bboxes = env_config.ANALYSIS_BBOX_SKIP
597-
skip_dump_od = env_config.ANALYSIS_DUMP_OD_SKIP
589+
extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
590+
ocr_layout_dumper: Optional[OCRLayoutDumper] = None
591+
final_layout_dumper: Optional[FinalLayoutDumper] = None
592+
593+
skip_analysis_dump = env_config.ANALYSIS_DUMP_OD_SKIP
598594

599595
if file is None:
600596
inferred_document_layout = process_file_with_model(
@@ -624,19 +620,15 @@ def _partition_pdf_or_image_local(
624620
else:
625621
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
626622
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
627-
if not skip_bboxes:
628-
pdfminer_drawer = PdfminerLayoutDrawer(
629-
layout=extracted_layout,
630-
)
631-
od_model_drawer = ODModelLayoutDrawer(
632-
layout=inferred_document_layout,
633-
)
634-
ocr_drawer = OCRLayoutDrawer()
635-
if not skip_dump_od:
623+
if not skip_analysis_dump:
636624
od_model_layout_dumper = ObjectDetectionLayoutDumper(
637625
layout=inferred_document_layout,
638626
model_name=hi_res_model_name,
639627
)
628+
extracted_layout_dumper = ExtractedLayoutDumper(
629+
layout=extracted_layout,
630+
)
631+
ocr_layout_dumper = OCRLayoutDumper()
640632
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
641633
merged_document_layout = merge_inferred_with_extracted_layout(
642634
inferred_document_layout=inferred_document_layout,
@@ -653,7 +645,7 @@ def _partition_pdf_or_image_local(
653645
ocr_languages=ocr_languages,
654646
ocr_mode=ocr_mode,
655647
pdf_image_dpi=pdf_image_dpi,
656-
ocr_drawer=ocr_drawer,
648+
ocr_layout_dumper=ocr_layout_dumper,
657649
)
658650
else:
659651
inferred_document_layout = process_data_with_model(
@@ -685,14 +677,15 @@ def _partition_pdf_or_image_local(
685677
)
686678
else:
687679
analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
688-
os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
689-
pdfminer_drawer = PdfminerLayoutDrawer(
690-
layout=extracted_layout,
691-
)
692-
od_model_drawer = ODModelLayoutDrawer(
693-
layout=inferred_document_layout,
694-
)
695-
ocr_drawer = OCRLayoutDrawer()
680+
if not skip_analysis_dump:
681+
od_model_layout_dumper = ObjectDetectionLayoutDumper(
682+
layout=inferred_document_layout,
683+
model_name=hi_res_model_name,
684+
)
685+
extracted_layout_dumper = ExtractedLayoutDumper(
686+
layout=extracted_layout,
687+
)
688+
ocr_layout_dumper = OCRLayoutDumper()
696689

697690
# NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
698691
merged_document_layout = merge_inferred_with_extracted_layout(
@@ -712,7 +705,7 @@ def _partition_pdf_or_image_local(
712705
ocr_languages=ocr_languages,
713706
ocr_mode=ocr_mode,
714707
pdf_image_dpi=pdf_image_dpi,
715-
ocr_drawer=ocr_drawer,
708+
ocr_layout_dumper=ocr_layout_dumper,
716709
)
717710

718711
# NOTE(alan): starting with v2, chipper sorts the elements itself.
@@ -801,38 +794,29 @@ def _partition_pdf_or_image_local(
801794
)
802795
out_elements.extend(forms)
803796

804-
if analysis and not skip_bboxes:
805-
final_drawer = FinalLayoutDrawer(
806-
layout=out_elements,
807-
)
808-
analysis_drawer = AnalysisDrawer(
809-
filename=filename,
810-
save_dir=analyzed_image_output_dir_path,
811-
draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID,
812-
draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
813-
resize=env_config.ANALYSIS_BBOX_RESIZE,
814-
format=env_config.ANALYSIS_BBOX_FORMAT,
815-
)
816-
817-
if od_model_drawer:
818-
analysis_drawer.add_drawer(od_model_drawer)
819-
820-
if pdfminer_drawer:
821-
analysis_drawer.add_drawer(pdfminer_drawer)
822-
823-
if ocr_drawer:
824-
analysis_drawer.add_drawer(ocr_drawer)
825-
analysis_drawer.add_drawer(final_drawer)
826-
analysis_drawer.process()
827-
828-
if analysis and not skip_dump_od:
829-
json_layout_dumper = JsonLayoutDumper(
797+
if analysis:
798+
if not skip_analysis_dump:
799+
final_layout_dumper = FinalLayoutDumper(
800+
layout=out_elements,
801+
)
802+
layout_dumpers = []
803+
if od_model_layout_dumper:
804+
layout_dumpers.append(od_model_layout_dumper)
805+
if extracted_layout_dumper:
806+
layout_dumpers.append(extracted_layout_dumper)
807+
if ocr_layout_dumper:
808+
layout_dumpers.append(ocr_layout_dumper)
809+
if final_layout_dumper:
810+
layout_dumpers.append(final_layout_dumper)
811+
save_analysis_artifiacts(
812+
*layout_dumpers,
830813
filename=filename,
831-
save_dir=analyzed_image_output_dir_path,
814+
file=file,
815+
is_image=is_image,
816+
analyzed_image_output_dir_path=analyzed_image_output_dir_path,
817+
skip_bboxes=env_config.ANALYSIS_BBOX_SKIP,
818+
skip_dump_od=env_config.ANALYSIS_DUMP_OD_SKIP,
832819
)
833-
if od_model_layout_dumper:
834-
json_layout_dumper.add_layout_dumper(od_model_layout_dumper)
835-
json_layout_dumper.process()
836820

837821
return out_elements
838822

0 commit comments

Comments
 (0)