Unstructured-IO
diff --git a/‎CHANGELOG.md‎
Lines changed: 8 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎test_unstructured_inference/conftest.py‎
Lines changed: 5 additions & 4 deletions b/‎test_unstructured_inference/conftest.py‎
Lines changed: 5 additions & 4 deletions
diff --git a/‎test_unstructured_inference/inference/test_layout.py‎
Lines changed: 65 additions & 28 deletions b/‎test_unstructured_inference/inference/test_layout.py‎
Lines changed: 65 additions & 28 deletions
diff --git a/‎test_unstructured_inference/inference/test_layout_element.py‎
Lines changed: 18 additions & 7 deletions b/‎test_unstructured_inference/inference/test_layout_element.py‎
Lines changed: 18 additions & 7 deletions
diff --git a/‎test_unstructured_inference/models/test_model.py‎
Lines changed: 75 additions & 1 deletion b/‎test_unstructured_inference/models/test_model.py‎
Lines changed: 75 additions & 1 deletion
diff --git a/‎test_unstructured_inference/models/test_yolox.py‎
Lines changed: 8 additions & 5 deletions b/‎test_unstructured_inference/models/test_yolox.py‎
Lines changed: 8 additions & 5 deletions
@@ -1,3 +1,11 @@
+## 0.6.1
+
+* YoloX_quantized is now the default model. This model detects a more diverse set of element types and detects tables better than the previous model.
+* Since detection models tend to nest elements inside others (specifically in Tables), an algorithm has been added to reduce this
+  behavior. Now all the elements produced by detection models are disjoint and don't produce overlapping regions, which helps
+  reduce duplicated content.
+* Add `source` property to our elements, so you can know where the information was generated (OCR or detection model)
+
 ## 0.6.0
 
 * add a config class to handle parameter configurations for inference tasks; parameters in the config class can be set via environement variables
 
@@ -28,7 +28,7 @@ def mock_text_region():
 
 @pytest.fixture()
 def mock_layout_element():
-    return LayoutElement(100, 100, 300, 300, text="Sample text", type="Text")
+    return LayoutElement(100, 100, 300, 300, text="Sample text", source=None, type="Text")
 
 
 @pytest.fixture()
@@ -110,9 +110,9 @@ def mock_embedded_text_regions():
 @pytest.fixture()
 def mock_ocr_regions():
     return [
-        EmbeddedTextRegion(10, 10, 90, 90, "0"),
-        EmbeddedTextRegion(200, 200, 300, 300, "1"),
-        EmbeddedTextRegion(500, 320, 600, 350, "3"),
+        EmbeddedTextRegion(10, 10, 90, 90, text="0", source=None),
+        EmbeddedTextRegion(200, 200, 300, 300, text="1", source=None),
+        EmbeddedTextRegion(500, 320, 600, 350, text="3", source=None),
     ]
 
 
@@ -141,6 +141,7 @@ def mock_inferred_layout(mock_embedded_text_regions):
             r.x2,
             r.y2,
             text=None,
+            source=None,
             type="Text",
         )
         for r in mock_embedded_text_regions
 
@@ -28,9 +28,16 @@ def mock_image():
 
 @pytest.fixture()
 def mock_initial_layout():
-    text_block = layout.EmbeddedTextRegion(2, 4, 6, 8, text="A very repetitive narrative. " * 10)
+    text_block = layout.EmbeddedTextRegion(
+        2,
+        4,
+        6,
+        8,
+        text="A very repetitive narrative. " * 10,
+        source="Mock",
+    )
 
-    title_block = layout.EmbeddedTextRegion(1, 2, 3, 4, text="A Catchy Title")
+    title_block = layout.EmbeddedTextRegion(1, 2, 3, 4, text="A Catchy Title", source="Mock")
 
     return [text_block, title_block]
 
@@ -42,11 +49,20 @@ def mock_final_layout():
         4,
         6,
         8,
+        source="Mock",
         text="A very repetitive narrative. " * 10,
         type="NarrativeText",
     )
 
-    title_block = layoutelement.LayoutElement(1, 2, 3, 4, text="A Catchy Title", type="Title")
+    title_block = layoutelement.LayoutElement(
+        1,
+        2,
+        3,
+        4,
+        source="Mock",
+        text="A Catchy Title",
+        type="Title",
+    )
 
     return [text_block, title_block]
 
@@ -709,8 +725,11 @@ def test_load_pdf_with_multicolumn_layout_and_ocr(filename="sample-docs/design-t
         assert element.text.startswith(test_snippets[i])
 
 
-@pytest.mark.parametrize("colors", ["red", None])
-def test_annotate(colors):
+@pytest.mark.parametrize(
+    ("colors", "add_details", "threshold"),
+    [("red", False, 0.992), (None, False, 0.992), ("red", True, 0.8)],
+)
+def test_annotate(colors, add_details, threshold):
     def check_annotated_image():
         annotated_array = np.array(annotated_image)
         for coords in [coords1, coords2]:
@@ -722,9 +741,9 @@ def check_annotated_image():
                 assert all(annotated_array[y1:y2, x1, i] == expected)
                 assert all(annotated_array[y1:y2, x2, i] == expected)
             # Make sure almost all the pixels are not changed
-            assert ((annotated_array[:, :, 0] == 1).mean()) > 0.992
-            assert ((annotated_array[:, :, 1] == 1).mean()) > 0.992
-            assert ((annotated_array[:, :, 2] == 1).mean()) > 0.992
+            assert ((annotated_array[:, :, 0] == 1).mean()) > threshold
+            assert ((annotated_array[:, :, 1] == 1).mean()) > threshold
+            assert ((annotated_array[:, :, 2] == 1).mean()) > threshold
 
     test_image_arr = np.ones((100, 100, 3), dtype="uint8")
     image = Image.fromarray(test_image_arr)
@@ -735,15 +754,18 @@ def check_annotated_image():
     rect2 = elements.Rectangle(*coords2)
     page.elements = [rect1, rect2]
 
+    annotated_image = page.annotate(colors=colors, add_details=add_details, sources=["all"])
+    check_annotated_image()
+
     # Scenario 1: where self.image exists
-    annotated_image = page.annotate(colors=colors)
+    annotated_image = page.annotate(colors=colors, add_details=add_details)
     check_annotated_image()
 
     # Scenario 2: where self.image is None, but self.image_path exists
     with patch.object(Image, "open", return_value=image):
         page.image = None
         page.image_path = "mock_path_to_image"
-        annotated_image = page.annotate(colors=colors)
+        annotated_image = page.annotate(colors=colors, add_details=add_details)
         check_annotated_image()
 
 
@@ -775,32 +797,30 @@ def test_image_text_region(text, ocr_strategy, expected, mock_image):
         )
 
 
-@pytest.fixture()
-def ordering_layout():
-    elements = [
-        layout.LayoutElement(x1=447.0, y1=315.0, x2=1275.7, y2=413.0, text="0"),
-        layout.LayoutElement(x1=380.6, y1=473.4, x2=1334.8, y2=533.9, text="1"),
-        layout.LayoutElement(x1=578.6, y1=556.8, x2=1109.0, y2=874.4, text="2"),
-        layout.LayoutElement(x1=444.5, y1=942.3, x2=1261.1, y2=1584.1, text="3"),
-        layout.LayoutElement(x1=444.8, y1=1609.4, x2=1257.2, y2=1665.2, text="4"),
-        layout.LayoutElement(x1=414.0, y1=1718.8, x2=635.0, y2=1755.2, text="5"),
-        layout.LayoutElement(x1=372.6, y1=1786.9, x2=1333.6, y2=1848.7, text="6"),
-    ]
-    return elements
+class MockDetectionModel(layout.UnstructuredObjectDetectionModel):
+    """Detection-model stub that always predicts the same seven layout elements."""
+
+    def initialize(self, *args, **kwargs):
+        """Accept any arguments and do nothing; the stub has nothing to load."""
+
+    def predict(self, x):
+        """Return the fixed elements, ignoring the input ``x``.
+
+        Each element's text is its position in the list ("0" through "6").
+        """
+        boxes = (
+            (447.0, 315.0, 1275.7, 413.0),
+            (380.6, 473.4, 1334.8, 533.9),
+            (578.6, 556.8, 1109.0, 874.4),
+            (444.5, 942.3, 1261.1, 1584.1),
+            (444.8, 1609.4, 1257.2, 1665.2),
+            (414.0, 1718.8, 635.0, 1755.2),
+            (372.6, 1786.9, 1333.6, 1848.7),
+        )
+        return [
+            layout.LayoutElement(x1=left, y1=top, x2=right, y2=bottom, text=str(idx))
+            for idx, (left, top, right, bottom) in enumerate(boxes)
+        ]
 
 
-def test_layout_order(mock_image, ordering_layout):
+def test_layout_order(mock_image):
     with tempfile.TemporaryDirectory() as tmpdir:
         mock_image_path = os.path.join(tmpdir, "mock.jpg")
         mock_image.save(mock_image_path)
-        with patch.object(layout, "get_model", lambda: lambda x: ordering_layout), patch.object(
+        with patch.object(layout, "get_model", lambda: MockDetectionModel()), patch.object(
             layout,
             "load_pdf",
             lambda *args, **kwargs: ([[]], [mock_image_path]),
-        ), patch.object(
-            layout,
-            "UnstructuredObjectDetectionModel",
-            object,
         ):
             doc = layout.DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf")
             page = doc.pages[0]
@@ -960,3 +980,20 @@ def test_warning_if_chipper_and_low_dpi(caplog):
         mock_from_file.assert_called_once()
         assert caplog.records[0].levelname == "WARNING"
         assert "DPI >= 300" in caplog.records[0].msg
+
+
+@pytest.mark.parametrize(
+    ("filename", "img_num", "should_complete"),
+    [("sample-docs/empty-document.pdf", 0, True), ("sample-docs/empty-document.pdf", 10, False)],
+)
+def test_get_image(filename, img_num, should_complete):
+    """Check `_get_image` for one image number expected to load and one expected to fail.
+
+    When ``should_complete`` is True the returned image must be entirely white;
+    otherwise the call must raise ``ValueError``.
+    """
+    doc = layout.DocumentLayout.from_file(filename)
+    page = doc.pages[0]
+    if not should_complete:
+        # State the expectation explicitly: with a try/except, a call that
+        # unexpectedly succeeded would let this case pass silently.
+        with pytest.raises(ValueError):
+            page._get_image(filename, img_num)
+        return
+    # Convert the image to a numpy array to inspect its pixel values
+    img = np.array(page._get_image(filename, img_num))
+    # A blank page means every pixel is white
+    assert img.mean() == 255.0
@@ -17,12 +17,12 @@
 def test_aggregate_ocr_text_by_block():
     expected = "A Unified Toolkit"
     ocr_layout = [
-        TextRegion(0, 0, 20, 20, "A"),
-        TextRegion(50, 50, 150, 150, "Unified"),
-        TextRegion(150, 150, 300, 250, "Toolkit"),
-        TextRegion(200, 250, 300, 350, "Deep"),
+        TextRegion(0, 0, 20, 20, source="OCR", text="A"),
+        TextRegion(50, 50, 150, 150, source="OCR", text="Unified"),
+        TextRegion(150, 150, 300, 250, source="OCR", text="Toolkit"),
+        TextRegion(200, 250, 300, 350, source="OCR", text="Deep"),
     ]
-    region = TextRegion(0, 0, 250, 350, "")
+    region = TextRegion(0, 0, 250, 350, text="")
 
     text = aggregate_ocr_text_by_block(ocr_layout, region, 0.5)
     assert text == expected
@@ -65,6 +65,7 @@ def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions):
             r.x2,
             r.y2,
             text=r.text,
+            source=None,
             type="UncategorizedText",
         )
         for r in mock_ocr_regions
@@ -94,6 +95,7 @@ def test_merge_inferred_layout_with_ocr_layout(mock_inferred_layout, mock_ocr_re
             r.x2,
             r.y2,
             text=r.text,
+            source=None,
             type="UncategorizedText",
         )
         for r in mock_ocr_regions
@@ -138,6 +140,7 @@ def test_layout_element_do_dict(mock_layout_element):
         "text": "Sample text",
         "type": "Text",
         "prob": None,
+        "source": None,
     }
 
     assert mock_layout_element.to_dict() == expected
@@ -157,6 +160,14 @@ def test_layout_element_from_lp_textblock():
         score=0.99,
     )
 
-    expected = LayoutElement(100, 100, 300, 300, "Sample Text", "Text", 0.99)
-
+    expected = LayoutElement(
+        100,
+        100,
+        300,
+        300,
+        text="Sample Text",
+        source="detectron2_lp",
+        type="Text",
+        prob=0.99,
+    )
     assert LayoutElement.from_lp_textblock(mock_text_block) == expected
@@ -58,7 +58,7 @@ def test_raises_uninitialized():
 def test_model_initializes_once():
     from unstructured_inference.inference import layout
 
-    with mock.patch.object(models, "UnstructuredDetectronONNXModel", MockModel), mock.patch.object(
+    with mock.patch.object(models, "UnstructuredYoloXModel", MockModel), mock.patch.object(
         models,
         "models",
         {},
@@ -72,3 +72,77 @@ def test_model_initializes_once():
         assert (
             doc.pages[0].elements[0].prob is None
         )  # NOTE(pravin) New Assertion to Make Sure Uncategorized Text has None Probability
+
+
+def test_deduplicate_detected_elements():
+    """Check that categorized elements detected by `yolox_quantized` do not overlap.
+
+    Runs layout detection on a sample table image with ``ocr_strategy="never"``
+    and no OCR supplementation, then verifies that no two elements whose type is
+    not "UncategorizedText" intersect each other.
+    """
+    import numpy as np
+
+    from unstructured_inference.inference.elements import intersections
+    from unstructured_inference.inference.layout import DocumentLayout
+    from unstructured_inference.models.base import get_model
+
+    model = get_model("yolox_quantized")
+    file = "sample-docs/example_table.jpg"
+    doc = DocumentLayout.from_image_file(
+        file,
+        model,
+        ocr_strategy="never",
+        supplement_with_ocr_elements=False,
+    )
+    known_elements = [e for e in doc.pages[0].elements if e.type != "UncategorizedText"]
+    # Compute the pairwise intersection matrix
+    intersections_mtx = intersections(*known_elements)
+    # Clear the diagonal, because an element always intersects itself
+    np.fill_diagonal(intersections_mtx, False)
+    # Every remaining entry must be False: no two distinct elements may intersect.
+    # This has to be an assert; a returned value is never checked by pytest.
+    assert not intersections_mtx.any()
+
+
+def test_enhance_regions():
+    """Check that `enhance_regions` collapses near-duplicate rectangles into one.
+
+    Eleven unit-sized rectangles, each shifted by 0.01 from the previous one, are
+    expected to come back as a single rectangle spanning (0, 0) to (1.10, 1.10).
+    """
+    from unstructured_inference.inference.elements import Rectangle
+    from unstructured_inference.models.base import get_model
+
+    elements = [
+        Rectangle(0, 0, 1, 1),
+        Rectangle(0.01, 0.01, 1.01, 1.01),
+        Rectangle(0.02, 0.02, 1.02, 1.02),
+        Rectangle(0.03, 0.03, 1.03, 1.03),
+        Rectangle(0.04, 0.04, 1.04, 1.04),
+        Rectangle(0.05, 0.05, 1.05, 1.05),
+        Rectangle(0.06, 0.06, 1.06, 1.06),
+        Rectangle(0.07, 0.07, 1.07, 1.07),
+        Rectangle(0.08, 0.08, 1.08, 1.08),
+        Rectangle(0.09, 0.09, 1.09, 1.09),
+        Rectangle(0.10, 0.10, 1.10, 1.10),
+    ]
+    model = get_model("yolox_tiny")
+    elements = model.enhance_regions(elements, 0.5)
+    assert len(elements) == 1
+    # Check all four coordinates, including y2
+    assert (elements[0].x1, elements[0].y1, elements[0].x2, elements[0].y2) == (0, 0, 1.10, 1.10)
+
+
+def test_clean_type():
+    """Check that `clean_type` reduces nested "Table" detections to the big table.
+
+    The input holds three nested "Table" elements plus five untyped elements that
+    overlap the big table; a single element spanning (0, 0) to (1, 1) is expected.
+    """
+    from unstructured_inference.inference.layout import LayoutElement
+    from unstructured_inference.models.base import get_model
+
+    elements = [
+        LayoutElement(
+            0.6,
+            0.6,
+            0.65,
+            0.65,
+            type="Table",
+        ),  # One little table nested inside all the others
+        LayoutElement(0.5, 0.5, 0.7, 0.7, type="Table"),  # One nested table
+        LayoutElement(0, 0, 1, 1, type="Table"),  # Big table
+        LayoutElement(0.01, 0.01, 1.01, 1.01),
+        LayoutElement(0.02, 0.02, 1.02, 1.02),
+        LayoutElement(0.03, 0.03, 1.03, 1.03),
+        LayoutElement(0.04, 0.04, 1.04, 1.04),
+        LayoutElement(0.05, 0.05, 1.05, 1.05),
+    ]
+    model = get_model("yolox_tiny")
+    elements = model.clean_type(elements, type_to_clean="Table")
+    assert len(elements) == 1
+    # Check all four coordinates, including y2
+    assert (elements[0].x1, elements[0].y1, elements[0].x2, elements[0].y2) == (0, 0, 1, 1)
@@ -14,7 +14,9 @@ def test_layout_yolox_local_parsing_image():
     # NOTE(benjamin) The example image should result in one page result
     assert len(document_layout.pages) == 1
     # NOTE(benjamin) The example sent to the test contains 13 detections
-    assert len(document_layout.pages[0].elements) == 13
+    types_known = ["Text", "Section-header", "Page-header"]
+    known_regions = [e for e in document_layout.pages[0].elements if e.type in types_known]
+    assert len(known_regions) == 13
     assert hasattr(
         document_layout.pages[0].elements[0],
         "prob",
@@ -32,8 +34,9 @@ def test_layout_yolox_local_parsing_pdf():
     content = str(document_layout)
     assert "libero fringilla" in content
     assert len(document_layout.pages) == 1
-    # NOTE(benjamin) The example sent to the test contains 5 detections
-    assert len(document_layout.pages[0].elements) == 5
+    # NOTE(benjamin) The example sent to the test contains 5 text detections
+    text_elements = [e for e in document_layout.pages[0].elements if e.type == "Text"]
+    assert len(text_elements) == 5
     assert hasattr(
         document_layout.pages[0].elements[0],
         "prob",
@@ -59,10 +62,10 @@ def test_layout_yolox_local_parsing_empty_pdf():
 
 
 def test_layout_yolox_local_parsing_image_soft():
-    filename = os.path.join("sample-docs", "test-image.jpg")
+    filename = os.path.join("sample-docs", "example_table.jpg")
     # NOTE(benjamin) keep_output = True create a file for each image in
     # localstorage for visualization of the result
-    document_layout = process_file_with_model(filename, model_name="yolox_tiny", is_image=True)
+    document_layout = process_file_with_model(filename, model_name="yolox_quantized", is_image=True)
     # NOTE(benjamin) The example image should result in one page result
     assert len(document_layout.pages) == 1
     # NOTE(benjamin) Soft version of the test, run make test-long in order to run with full model