Unstructured-IO
diff --git a/‎CHANGELOG.md‎
Lines changed: 2 additions & 1 deletion b/‎CHANGELOG.md‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎test_unstructured/partition/pdf_image/test_pdf.py‎
Lines changed: 44 additions & 10 deletions b/‎test_unstructured/partition/pdf_image/test_pdf.py‎
Lines changed: 44 additions & 10 deletions
diff --git a/‎unstructured/__version__.py‎
Lines changed: 1 addition & 1 deletion b/‎unstructured/__version__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎unstructured/partition/pdf.py‎
Lines changed: 46 additions & 62 deletions b/‎unstructured/partition/pdf.py‎
Lines changed: 46 additions & 62 deletions
@@ -1,6 +1,7 @@
-## 0.15.10-dev0
+## 0.15.10-dev1
 
 ### Enhancements
+* **Modified analysis drawing tools to dump to files and draw from dumps** If the `analysis` parameter of the `partition_pdf` function is set to `True`, the Object Detection, Pdfminer Extraction, OCR and final layouts will be dumped as JSON files. The drawers now accept dict (dump) objects instead of instances of internal classes.
 
 ### Features
 
 
@@ -1336,33 +1336,67 @@ def test_unique_and_deterministic_element_ids(strategy, expected_ids):
     assert ids == expected_ids, "Element IDs do not match expected IDs"
 
 
-def test_analysis_artifacts_saved():
+@pytest.mark.parametrize("is_path", [True, False])
+@pytest.mark.parametrize(
+    ("example_doc", "doc_pages"),
+    [
+        ("pdf/layout-parser-paper-fast.pdf", 2),
+        ("img/DA-1p.png", 1),
+    ],
+)
+def test_analysis_artifacts_saved(is_path: bool, example_doc: str, doc_pages: int):
     with tempfile.TemporaryDirectory() as temp_dir:
-        filename = example_doc_path("pdf/layout-parser-paper-fast.pdf")
+        file = None
+        filename = example_doc_path(example_doc)
+        is_image = not Path(filename).suffix.endswith("pdf")
+        if not is_path:
+            file = open(filename, "rb")  # noqa: SIM115
+            filename = None
         pdf.partition_pdf(
             filename=filename,
+            file=file,
+            is_image=is_image,
             strategy=PartitionStrategy.HI_RES,
             analysis=True,
             analyzed_image_output_dir_path=temp_dir,
         )
 
         analysis_dir = Path(temp_dir)
-        layout_dump_dir = analysis_dir / "analysis" / "layout-parser-paper-fast" / "layout_dump"
+        file_analysis_root = None
+        if is_path:
+            file_analysis_root = analysis_dir / "analysis" / Path(example_doc).stem
+        else:
+            # when a file object is passed instead of a path, filename is None and the
+            # per-document analysis directory is created under a generated name
+            generated_file_stem_path = list((analysis_dir / "analysis").iterdir())[0]
+            if is_image:
+                assert "image" in generated_file_stem_path.name
+            else:
+                assert "pdf" in generated_file_stem_path.name
+            file_analysis_root = generated_file_stem_path
+        layout_dump_dir = file_analysis_root / "layout_dump"
         assert layout_dump_dir.exists()
         layout_dump_files = list(layout_dump_dir.iterdir())
-        assert len(layout_dump_files) == 1
-        assert (layout_dump_dir / "object_detection.json").exists()
 
-        bboxes_dir = analysis_dir / "analysis" / "layout-parser-paper-fast" / "bboxes"
+        expected_layout_dumps = ["object_detection", "ocr", "pdfminer", "final"]
+        assert len(layout_dump_files) == len(expected_layout_dumps)
+
+        for expected_layout_dump in expected_layout_dumps:
+            assert (layout_dump_dir / f"{expected_layout_dump}.json").exists()
+
+        bboxes_dir = file_analysis_root / "bboxes"
         assert bboxes_dir.exists()
         bboxes_files = list(bboxes_dir.iterdir())
-        assert len(bboxes_files) == 2 * 4  # 2 pages * 4 different layouts per page
 
-        expected_layouts = ["od_model", "ocr", "pdfminer", "final"]
-        expected_pages = [1, 2]
-        for el in expected_layouts:
+        expected_renders = ["od_model", "ocr", "pdfminer", "final"]
+        assert len(bboxes_files) == doc_pages * len(expected_renders)
+
+        expected_pages = range(1, doc_pages + 1)
+        for el in expected_renders:
             for page in expected_pages:
                 assert bboxes_dir / f"page{page}_layout_{el}.png" in bboxes_files
+        if file:
+            file.close()
 
 
 @pytest.mark.parametrize(
 
@@ -1 +1 @@
-__version__ = "0.15.10-dev0"  # pragma: no cover
+__version__ = "0.15.10-dev1"  # pragma: no cover
@@ -53,16 +53,12 @@
     prepare_languages_for_tesseract,
     tesseract_to_paddle_language,
 )
-from unstructured.partition.pdf_image.analysis.bbox_visualisation import (
-    AnalysisDrawer,
-    FinalLayoutDrawer,
-    OCRLayoutDrawer,
-    ODModelLayoutDrawer,
-    PdfminerLayoutDrawer,
-)
+from unstructured.partition.pdf_image.analysis import save_analysis_artifiacts
 from unstructured.partition.pdf_image.analysis.layout_dump import (
-    JsonLayoutDumper,
+    ExtractedLayoutDumper,
+    FinalLayoutDumper,
     ObjectDetectionLayoutDumper,
+    OCRLayoutDumper,
 )
 from unstructured.partition.pdf_image.form_extraction import run_form_extraction
 from unstructured.partition.pdf_image.pdf_image_utils import (
@@ -589,12 +585,12 @@ def _partition_pdf_or_image_local(
             f"(currently {pdf_image_dpi}).",
         )
 
-    pdfminer_drawer: Optional[PdfminerLayoutDrawer] = None
-    od_model_drawer: Optional[ODModelLayoutDrawer] = None
-    ocr_drawer: Optional[OCRLayoutDrawer] = None
     od_model_layout_dumper: Optional[ObjectDetectionLayoutDumper] = None
-    skip_bboxes = env_config.ANALYSIS_BBOX_SKIP
-    skip_dump_od = env_config.ANALYSIS_DUMP_OD_SKIP
+    extracted_layout_dumper: Optional[ExtractedLayoutDumper] = None
+    ocr_layout_dumper: Optional[OCRLayoutDumper] = None
+    final_layout_dumper: Optional[FinalLayoutDumper] = None
+
+    skip_analysis_dump = env_config.ANALYSIS_DUMP_OD_SKIP
 
     if file is None:
         inferred_document_layout = process_file_with_model(
@@ -624,19 +620,15 @@ def _partition_pdf_or_image_local(
                     else:
                         analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
                 os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
-                if not skip_bboxes:
-                    pdfminer_drawer = PdfminerLayoutDrawer(
-                        layout=extracted_layout,
-                    )
-                    od_model_drawer = ODModelLayoutDrawer(
-                        layout=inferred_document_layout,
-                    )
-                    ocr_drawer = OCRLayoutDrawer()
-                if not skip_dump_od:
+                if not skip_analysis_dump:
                     od_model_layout_dumper = ObjectDetectionLayoutDumper(
                         layout=inferred_document_layout,
                         model_name=hi_res_model_name,
                     )
+                    extracted_layout_dumper = ExtractedLayoutDumper(
+                        layout=extracted_layout,
+                    )
+                    ocr_layout_dumper = OCRLayoutDumper()
             # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
             merged_document_layout = merge_inferred_with_extracted_layout(
                 inferred_document_layout=inferred_document_layout,
@@ -653,7 +645,7 @@ def _partition_pdf_or_image_local(
                 ocr_languages=ocr_languages,
                 ocr_mode=ocr_mode,
                 pdf_image_dpi=pdf_image_dpi,
-                ocr_drawer=ocr_drawer,
+                ocr_layout_dumper=ocr_layout_dumper,
             )
     else:
         inferred_document_layout = process_data_with_model(
@@ -685,14 +677,15 @@ def _partition_pdf_or_image_local(
                         )
                     else:
                         analyzed_image_output_dir_path = str(Path.cwd() / "annotated")
-                os.makedirs(analyzed_image_output_dir_path, exist_ok=True)
-                pdfminer_drawer = PdfminerLayoutDrawer(
-                    layout=extracted_layout,
-                )
-                od_model_drawer = ODModelLayoutDrawer(
-                    layout=inferred_document_layout,
-                )
-                ocr_drawer = OCRLayoutDrawer()
+                if not skip_analysis_dump:
+                    od_model_layout_dumper = ObjectDetectionLayoutDumper(
+                        layout=inferred_document_layout,
+                        model_name=hi_res_model_name,
+                    )
+                    extracted_layout_dumper = ExtractedLayoutDumper(
+                        layout=extracted_layout,
+                    )
+                    ocr_layout_dumper = OCRLayoutDumper()
 
             # NOTE(christine): merged_document_layout = extracted_layout + inferred_layout
             merged_document_layout = merge_inferred_with_extracted_layout(
@@ -712,7 +705,7 @@ def _partition_pdf_or_image_local(
                 ocr_languages=ocr_languages,
                 ocr_mode=ocr_mode,
                 pdf_image_dpi=pdf_image_dpi,
-                ocr_drawer=ocr_drawer,
+                ocr_layout_dumper=ocr_layout_dumper,
             )
 
     # NOTE(alan): starting with v2, chipper sorts the elements itself.
@@ -801,38 +794,29 @@ def _partition_pdf_or_image_local(
         )
         out_elements.extend(forms)
 
-    if analysis and not skip_bboxes:
-        final_drawer = FinalLayoutDrawer(
-            layout=out_elements,
-        )
-        analysis_drawer = AnalysisDrawer(
-            filename=filename,
-            save_dir=analyzed_image_output_dir_path,
-            draw_grid=env_config.ANALYSIS_BBOX_DRAW_GRID,
-            draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
-            resize=env_config.ANALYSIS_BBOX_RESIZE,
-            format=env_config.ANALYSIS_BBOX_FORMAT,
-        )
-
-        if od_model_drawer:
-            analysis_drawer.add_drawer(od_model_drawer)
-
-        if pdfminer_drawer:
-            analysis_drawer.add_drawer(pdfminer_drawer)
-
-        if ocr_drawer:
-            analysis_drawer.add_drawer(ocr_drawer)
-        analysis_drawer.add_drawer(final_drawer)
-        analysis_drawer.process()
-
-    if analysis and not skip_dump_od:
-        json_layout_dumper = JsonLayoutDumper(
+    if analysis:
+        if not skip_analysis_dump:
+            final_layout_dumper = FinalLayoutDumper(
+                layout=out_elements,
+            )
+        layout_dumpers = []
+        if od_model_layout_dumper:
+            layout_dumpers.append(od_model_layout_dumper)
+        if extracted_layout_dumper:
+            layout_dumpers.append(extracted_layout_dumper)
+        if ocr_layout_dumper:
+            layout_dumpers.append(ocr_layout_dumper)
+        if final_layout_dumper:
+            layout_dumpers.append(final_layout_dumper)
+        save_analysis_artifiacts(
+            *layout_dumpers,
             filename=filename,
-            save_dir=analyzed_image_output_dir_path,
+            file=file,
+            is_image=is_image,
+            analyzed_image_output_dir_path=analyzed_image_output_dir_path,
+            skip_bboxes=env_config.ANALYSIS_BBOX_SKIP,
+            skip_dump_od=env_config.ANALYSIS_DUMP_OD_SKIP,
         )
-        if od_model_layout_dumper:
-            json_layout_dumper.add_layout_dumper(od_model_layout_dumper)
-        json_layout_dumper.process()
 
     return out_elements
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.15.10-dev0" # pragma: no cover`
	`1`	`+__version__ = "0.15.10-dev1" # pragma: no cover`