Unstructured-IO
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
Lines changed: 2 additions & 4 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 2 additions & 0 deletions
diff --git a/Makefile b/Makefile
Lines changed: 9 additions & 9 deletions
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
Lines changed: 36 additions & 10 deletions
diff --git a/test_unstructured_inference/models/test_model.py b/test_unstructured_inference/models/test_model.py
Lines changed: 3 additions & 2 deletions
diff --git a/test_unstructured_inference/models/test_yolox.py b/test_unstructured_inference/models/test_yolox.py
Lines changed: 7 additions & 6 deletions
diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py
Lines changed: 141 additions & 4 deletions
@@ -52,11 +52,10 @@ jobs:
       if: steps.virtualenv-cache.outputs.cache-hit != 'true'
       run: |
         python${{ env.PYTHON_VERSION }} -m venv .venv
-        source .venv/bin/activate
-        make install-ci
     - name: Lint
       run: |
         source .venv/bin/activate
+        make install-ci
         make check
 
   shellcheck:
@@ -83,8 +82,6 @@ jobs:
       if: steps.virtualenv-cache.outputs.cache-hit != 'true'
       run: |
         python${{ env.PYTHON_VERSION }} -m venv .venv
-        source .venv/bin/activate
-        make install-ci
     - name: Install Poppler
       run: |
         sudo apt-get update
@@ -100,6 +97,7 @@ jobs:
         UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
       run: |
         source .venv/bin/activate
+        make install-ci
         aws s3 cp s3://utic-dev-models/ci_test_model/test_ci_model.onnx test_unstructured_inference/models/
         CI=true make test
         make check-coverage
 
@@ -3,6 +3,8 @@
 * refactor: remove layout analysis related code
 * enhancement: Hide warning about table transformer weights not being loaded
 * fix(layout): Use TemporaryDirectory instead of NamedTemporaryFile for Windows support
+* refactor: use `numpy` array to store layout elements' information in one single `LayoutElements`
+  object instead of using a list of `LayoutElement`
 
 ## 0.7.36
 
 
@@ -15,7 +15,7 @@ help: Makefile
 ## install-base:            installs core requirements needed for text processing bricks
 .PHONY: install-base
 install-base: install-base-pip-packages
-	pip install -r requirements/base.txt
+	python3 -m pip install -r requirements/base.txt
 
 ## install:                 installs all test, dev, and experimental requirements
 .PHONY: install
@@ -30,11 +30,11 @@ install-base-pip-packages:
 
 .PHONY: install-test
 install-test: install-base
-	pip install -r requirements/test.txt
+	python3 -m pip install -r requirements/test.txt
 
 .PHONY: install-dev
 install-dev: install-test
-	pip install -r requirements/dev.txt
+	python3 -m pip install -r requirements/dev.txt
 
 ## pip-compile:             compiles all base/dev/test requirements
 .PHONY: pip-compile
@@ -66,14 +66,14 @@ check: check-src check-tests check-version
 .PHONY: check-src
 check-src:
 	ruff check ${PACKAGE_NAME} --line-length 100 --select C4,COM,E,F,I,PLR0402,PT,SIM,UP015,UP018,UP032,UP034 --ignore COM812,PT011,PT012,SIM117
-	black --line-length 100 ${PACKAGE_NAME} --check
-	flake8 ${PACKAGE_NAME}
-	mypy ${PACKAGE_NAME} --ignore-missing-imports
+	python3 -m black --line-length 100 ${PACKAGE_NAME} --check
+	python3 -m flake8 ${PACKAGE_NAME}
+	python3 -m mypy ${PACKAGE_NAME} --ignore-missing-imports
 
 .PHONY: check-tests
 check-tests:
-	black --line-length 100 test_${PACKAGE_NAME} --check
-	flake8 test_${PACKAGE_NAME}
+	python3 -m black --line-length 100 test_${PACKAGE_NAME} --check
+	python3 -m flake8 test_${PACKAGE_NAME}
 
 ## check-scripts:           run shellcheck
 .PHONY: check-scripts
@@ -105,7 +105,7 @@ version-sync:
 
 .PHONY: check-coverage
 check-coverage:
-	coverage report --fail-under=95
+	python3 -m coverage report --fail-under=95
 
 ##########
 # Docker #
 
@@ -71,7 +71,7 @@ def mock_final_layout():
         type="Title",
     )
 
-    return [text_block, title_block]
+    return layoutelement.LayoutElements.from_list([text_block, title_block])
 
 
 def test_pdf_page_converts_images_to_array(mock_image):
@@ -378,15 +378,41 @@ def initialize(self, *args, **kwargs):
         pass
 
     def predict(self, x):
-        return [
-            layout.LayoutElement.from_coords(x1=447.0, y1=315.0, x2=1275.7, y2=413.0, text="0"),
-            layout.LayoutElement.from_coords(x1=380.6, y1=473.4, x2=1334.8, y2=533.9, text="1"),
-            layout.LayoutElement.from_coords(x1=578.6, y1=556.8, x2=1109.0, y2=874.4, text="2"),
-            layout.LayoutElement.from_coords(x1=444.5, y1=942.3, x2=1261.1, y2=1584.1, text="3"),
-            layout.LayoutElement.from_coords(x1=444.8, y1=1609.4, x2=1257.2, y2=1665.2, text="4"),
-            layout.LayoutElement.from_coords(x1=414.0, y1=1718.8, x2=635.0, y2=1755.2, text="5"),
-            layout.LayoutElement.from_coords(x1=372.6, y1=1786.9, x2=1333.6, y2=1848.7, text="6"),
-        ]
+        return layoutelement.LayoutElements.from_list(
+            [
+                layout.LayoutElement.from_coords(x1=447.0, y1=315.0, x2=1275.7, y2=413.0, text="0"),
+                layout.LayoutElement.from_coords(x1=380.6, y1=473.4, x2=1334.8, y2=533.9, text="1"),
+                layout.LayoutElement.from_coords(x1=578.6, y1=556.8, x2=1109.0, y2=874.4, text="2"),
+                layout.LayoutElement.from_coords(
+                    x1=444.5,
+                    y1=942.3,
+                    x2=1261.1,
+                    y2=1584.1,
+                    text="3",
+                ),
+                layout.LayoutElement.from_coords(
+                    x1=444.8,
+                    y1=1609.4,
+                    x2=1257.2,
+                    y2=1665.2,
+                    text="4",
+                ),
+                layout.LayoutElement.from_coords(
+                    x1=414.0,
+                    y1=1718.8,
+                    x2=635.0,
+                    y2=1755.2,
+                    text="5",
+                ),
+                layout.LayoutElement.from_coords(
+                    x1=372.6,
+                    y1=1786.9,
+                    x2=1333.6,
+                    y2=1848.7,
+                    text="6",
+                ),
+            ],
+        )
 
 
 def test_layout_order(mock_image):
 
@@ -2,10 +2,11 @@
 from typing import Any
 from unittest import mock
 
+import numpy as np
 import pytest
 
 import unstructured_inference.models.base as models
-from unstructured_inference.inference.layoutelement import LayoutElement
+from unstructured_inference.inference.layoutelement import LayoutElement, LayoutElements
 from unstructured_inference.models.unstructuredmodel import (
     ModelNotInitializedError,
     UnstructuredObjectDetectionModel,
@@ -23,7 +24,7 @@ def initialize(self, *args, **kwargs):
         return self.initializer(self, *args, **kwargs)
 
     def predict(self, x: Any) -> Any:
-        return []
+        return LayoutElements(element_coords=np.array([]))
 
 
 MOCK_MODEL_TYPES = {
 
@@ -15,14 +15,15 @@ def test_layout_yolox_local_parsing_image():
     assert len(document_layout.pages) == 1
     # NOTE(benjamin) The example sent to the test contains 13 detections
     types_known = ["Text", "Section-header", "Page-header"]
-    known_regions = [e for e in document_layout.pages[0].elements if e.type in types_known]
+    elements = document_layout.pages[0].elements_array
+    known_regions = [
+        e for e in elements.element_class_ids if elements.element_class_id_map[e] in types_known
+    ]
     assert len(known_regions) == 13
-    assert hasattr(
-        document_layout.pages[0].elements[0],
-        "prob",
-    )  # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities
+    # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities
+    assert hasattr(elements, "element_probs")
     assert isinstance(
-        document_layout.pages[0].elements[0].prob,
+        elements.element_probs[0],
         float,
     )  # NOTE(pravin) New Assertion to Make Sure Populated Probability is Float
 
 
@@ -2,13 +2,22 @@
 from random import randint
 from unittest.mock import PropertyMock, patch
 
+import numpy as np
 import pytest
 
 from unstructured_inference.constants import ElementType
 from unstructured_inference.inference import elements
-from unstructured_inference.inference.elements import Rectangle, TextRegion, ImageTextRegion
+from unstructured_inference.inference.elements import (
+    ImageTextRegion,
+    Rectangle,
+    TextRegion,
+    TextRegions,
+)
 from unstructured_inference.inference.layoutelement import (
     LayoutElement,
+    LayoutElements,
+    clean_layoutelements,
+    clean_layoutelements_for_class,
     merge_inferred_layout_with_extracted_layout,
     partition_groups_from_regions,
     separate,
@@ -31,6 +40,30 @@ def rand_rect(size=10):
     return elements.Rectangle(x1, y1, x1 + size, y1 + size)
 
 
+@pytest.fixture()
+def test_layoutelements():
+    coords = np.array(
+        [
+            [0.6, 0.6, 0.65, 0.65],  # Smallest box; lies inside every box except the last
+            [0.5, 0.5, 0.7, 0.7],  # Mid-sized box; encloses the box above
+            [0, 0, 1, 1],  # Unit box enclosing the two boxes above
+            [0.01, 0.01, 1.01, 1.01],  # Unit-sized box shifted by 0.01 on both axes
+            [0.02, 0.02, 1.02, 1.02],  # ... shifted by 0.02
+            [0.03, 0.03, 1.03, 1.03],  # ... shifted by 0.03
+            [0.04, 0.04, 1.04, 1.04],  # ... shifted by 0.04
+            [0.05, 0.05, 1.05, 1.05],  # ... shifted by 0.05
+            [2, 2, 3, 3],  # Unit-sized box that overlaps none of the boxes above
+        ],
+    )
+    element_class_ids = np.array([1, 1, 1, 0, 0, 0, 0, 0, 2])  # one class id per coords row
+    class_map = {0: "type0", 1: "type1", 2: "type2"}  # class id -> type name
+    return LayoutElements(
+        element_coords=coords,
+        element_class_ids=element_class_ids,
+        element_class_id_map=class_map,
+    )
+
+
 @pytest.mark.parametrize(
     ("rect1", "rect2", "expected"),
     [
@@ -111,14 +144,23 @@ def test_minimal_containing_rect():
 
 
 def test_partition_groups_from_regions(mock_embedded_text_regions):
-    words = mock_embedded_text_regions
+    words = TextRegions.from_list(mock_embedded_text_regions)
     groups = partition_groups_from_regions(words)
     assert len(groups) == 1
-    sorted_groups = sorted(groups, key=lambda group: group[0].bbox.y1)
-    text = "".join([el.text for el in sorted_groups[-1]])
+    text = "".join(groups[-1].texts)
+    assert text.startswith("Layout")
+    # test backward compatibility
+    text = "".join([str(region) for region in groups[-1].as_list()])
     assert text.startswith("Layout")
 
 
+def test_rectangle_padding():
+    rect = Rectangle(x1=0, y1=1, x2=3, y2=4)
+    padded = rect.pad(1)
+    assert (padded.x1, padded.y1, padded.x2, padded.y2) == (-1, 0, 4, 5)
+    assert (rect.x1, rect.y1, rect.x2, rect.y2) == (0, 1, 3, 4)
+
+
 def test_rectangle_area(monkeypatch):
     for _ in range(1000):
         width = randint(0, 20)
@@ -284,3 +326,98 @@ def test_merge_inferred_layout_with_extracted_layout():
         page_image_size=(1700, 2200),
     )
     assert merged_layout == inferred_layout
+
+
+def test_clean_layoutelements(test_layoutelements):
+    elements = clean_layoutelements(test_layoutelements).as_list()
+    assert len(elements) == 2  # the nine fixture boxes are expected to reduce to two
+    assert (
+        elements[0].bbox.x1,
+        elements[0].bbox.y1,
+        elements[0].bbox.x2,
+        elements[0].bbox.y2,  # y2, so all four bbox coordinates are checked
+    ) == (0, 0, 1, 1)
+    assert (
+        elements[1].bbox.x1,
+        elements[1].bbox.y1,
+        elements[1].bbox.x2,
+        elements[1].bbox.y2,  # y2, so all four bbox coordinates are checked
+    ) == (2, 2, 3, 3)
+
+
+@pytest.mark.parametrize(
+    ("coords", "class_ids", "expected_coords", "expected_ids"),
+    [
+        ([[0, 0, 1, 1], [0, 0, 1, 1]], [0, 1], [[0, 0, 1, 1]], [0]),  # one box
+        (
+            [[0, 0, 1, 1], [0, 0, 1, 1], [1, 1, 2, 2]],
+            [0, 1, 0],
+            [[0, 0, 1, 1], [1, 1, 2, 2]],
+            [0, 0],
+        ),
+        (
+            [[0, 0, 1.4, 1.4], [0, 0, 1, 1], [0.4, 0, 1.4, 1], [1.2, 0, 1.4, 1]],
+            [0, 1, 1, 1],
+            [[0, 0, 1.4, 1.4]],
+            [0],
+        ),
+    ],
+)
+def test_clean_layoutelements_cases(
+    coords,
+    class_ids,
+    expected_coords,
+    expected_ids,
+):
+    coords = np.array(coords)
+    element_class_ids = np.array(class_ids)
+    elements = LayoutElements(element_coords=coords, element_class_ids=element_class_ids)
+
+    elements = clean_layoutelements(elements)
+    np.testing.assert_array_equal(elements.element_coords, expected_coords)
+    np.testing.assert_array_equal(elements.element_class_ids, expected_ids)
+
+
+@pytest.mark.parametrize(
+    ("coords", "class_ids", "class_to_filter", "expected_coords", "expected_ids"),
+    [
+        ([[0, 0, 1, 1], [0, 0, 1, 1]], [0, 1], 1, [[0, 0, 1, 1]], [1]),  # one box
+        (
+            [[0, 0, 1, 1], [0, 0, 1, 1], [1, 1, 2, 2]],  # duplicated box plus a second box; two remain
+            [0, 1, 0],
+            1,
+            [[0, 0, 1, 1], [1, 1, 2, 2]],
+            [1, 0],
+        ),
+        (
+            # a -> b, b -> c, but a not -> c
+            [[0, 0, 1.4, 1.4], [0, 0, 1, 1], [0.4, 0, 1.4, 1], [1.2, 0, 1.4, 1]],
+            [0, 1, 1, 1],
+            1,
+            [[0, 0, 1, 1], [1.2, 0, 1.4, 1], [0, 0, 1.4, 1.4]],
+            [1, 1, 0],
+        ),
+        (
+            # like the case above but a different filtering element type changes the results
+            [[0, 0, 1.4, 1.4], [0, 0, 1, 1], [0.4, 0, 1.4, 1], [1.2, 0, 1.4, 1]],
+            [0, 1, 1, 1],
+            0,
+            [[0, 0, 1.4, 1.4]],
+            [0],
+        ),
+    ],
+)
+def test_clean_layoutelements_for_class(
+    coords,
+    class_ids,
+    class_to_filter,
+    expected_coords,
+    expected_ids,
+):
+    coords = np.array(coords)
+    element_class_ids = np.array(class_ids)
+    elements = LayoutElements(element_coords=coords, element_class_ids=element_class_ids)
+
+    elements = clean_layoutelements_for_class(elements, element_class=class_to_filter)
+    np.testing.assert_array_equal(elements.element_coords, expected_coords)
+    np.testing.assert_array_equal(elements.element_class_ids, expected_ids)