Skip to content

Commit 9504649

Browse files
authored
vectorize layout elements (#384)
This PR adds two new vectorized page-level dataclasses:

- `TextRegions`: replaces a `list[TextRegion]` and stores coordinates and texts as numpy arrays for more efficient memory operations when the number of items is large
- `LayoutElements`: replaces a `list[LayoutElement]` and stores data in numpy arrays as above

In addition, this PR refactors `yolox` model inference to use the two new classes above internally, while keeping the list data structure available for backward compatibility (e.g., passing into a `PageLayout` object).

## test

Compare memory usage and runtime on a PDF image using:

```python
from unstructured_inference.inference.layout import process_file_with_model


def main():
    f = "/Users/yaoyou/Downloads/002489.pdf"
    layout = process_file_with_model(f, model_name="yolox")
    # replace elements_array with elements when using main branch
    print(f"found {len(layout.pages[0].elements_array)} elements")


if __name__ == "__main__":
    main()
```

The peak memory is smaller on this branch (the exact amount depends on the number of layout elements detected) and processing time is slightly faster (since this PR skips generating a list of `LayoutElement` objects from the numpy array output of the yolox model).
1 parent dc30178 commit 9504649

File tree

13 files changed

+565
-108
lines changed

13 files changed

+565
-108
lines changed

.github/workflows/ci.yml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,10 @@ jobs:
5252
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
5353
run: |
5454
python${{ env.PYTHON_VERSION }} -m venv .venv
55-
source .venv/bin/activate
56-
make install-ci
5755
- name: Lint
5856
run: |
5957
source .venv/bin/activate
58+
make install-ci
6059
make check
6160
6261
shellcheck:
@@ -83,8 +82,6 @@ jobs:
8382
if: steps.virtualenv-cache.outputs.cache-hit != 'true'
8483
run: |
8584
python${{ env.PYTHON_VERSION }} -m venv .venv
86-
source .venv/bin/activate
87-
make install-ci
8885
- name: Install Poppler
8986
run: |
9087
sudo apt-get update
@@ -100,6 +97,7 @@ jobs:
10097
UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }}
10198
run: |
10299
source .venv/bin/activate
100+
make install-ci
103101
aws s3 cp s3://utic-dev-models/ci_test_model/test_ci_model.onnx test_unstructured_inference/models/
104102
CI=true make test
105103
make check-coverage

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
* refactor: remove layout analysis related code
44
* enhancement: Hide warning about table transformer weights not being loaded
55
* fix(layout): Use TemporaryDirectory instead of NamedTemporaryFile for Windows support
6+
* refactor: use `numpy` array to store layout elements' information in one single `LayoutElements`
7+
object instead of using a list of `LayoutElement`
68

79
## 0.7.36
810

Makefile

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ help: Makefile
1515
## install-base: installs core requirements needed for text processing bricks
1616
.PHONY: install-base
1717
install-base: install-base-pip-packages
18-
pip install -r requirements/base.txt
18+
python3 -m pip install -r requirements/base.txt
1919

2020
## install: installs all test, dev, and experimental requirements
2121
.PHONY: install
@@ -30,11 +30,11 @@ install-base-pip-packages:
3030

3131
.PHONY: install-test
3232
install-test: install-base
33-
pip install -r requirements/test.txt
33+
python3 -m pip install -r requirements/test.txt
3434

3535
.PHONY: install-dev
3636
install-dev: install-test
37-
pip install -r requirements/dev.txt
37+
python3 -m pip install -r requirements/dev.txt
3838

3939
## pip-compile: compiles all base/dev/test requirements
4040
.PHONY: pip-compile
@@ -66,14 +66,14 @@ check: check-src check-tests check-version
6666
.PHONY: check-src
6767
check-src:
6868
ruff check ${PACKAGE_NAME} --line-length 100 --select C4,COM,E,F,I,PLR0402,PT,SIM,UP015,UP018,UP032,UP034 --ignore COM812,PT011,PT012,SIM117
69-
black --line-length 100 ${PACKAGE_NAME} --check
70-
flake8 ${PACKAGE_NAME}
71-
mypy ${PACKAGE_NAME} --ignore-missing-imports
69+
python -m black --line-length 100 ${PACKAGE_NAME} --check
70+
python -m flake8 ${PACKAGE_NAME}
71+
python -m mypy ${PACKAGE_NAME} --ignore-missing-imports
7272

7373
.PHONY: check-tests
7474
check-tests:
75-
black --line-length 100 test_${PACKAGE_NAME} --check
76-
flake8 test_${PACKAGE_NAME}
75+
python -m black --line-length 100 test_${PACKAGE_NAME} --check
76+
python -m flake8 test_${PACKAGE_NAME}
7777

7878
## check-scripts: run shellcheck
7979
.PHONY: check-scripts
@@ -105,7 +105,7 @@ version-sync:
105105

106106
.PHONY: check-coverage
107107
check-coverage:
108-
coverage report --fail-under=95
108+
python -m coverage report --fail-under=95
109109

110110
##########
111111
# Docker #

test_unstructured_inference/inference/test_layout.py

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def mock_final_layout():
7171
type="Title",
7272
)
7373

74-
return [text_block, title_block]
74+
return layoutelement.LayoutElements.from_list([text_block, title_block])
7575

7676

7777
def test_pdf_page_converts_images_to_array(mock_image):
@@ -378,15 +378,41 @@ def initialize(self, *args, **kwargs):
378378
pass
379379

380380
def predict(self, x):
381-
return [
382-
layout.LayoutElement.from_coords(x1=447.0, y1=315.0, x2=1275.7, y2=413.0, text="0"),
383-
layout.LayoutElement.from_coords(x1=380.6, y1=473.4, x2=1334.8, y2=533.9, text="1"),
384-
layout.LayoutElement.from_coords(x1=578.6, y1=556.8, x2=1109.0, y2=874.4, text="2"),
385-
layout.LayoutElement.from_coords(x1=444.5, y1=942.3, x2=1261.1, y2=1584.1, text="3"),
386-
layout.LayoutElement.from_coords(x1=444.8, y1=1609.4, x2=1257.2, y2=1665.2, text="4"),
387-
layout.LayoutElement.from_coords(x1=414.0, y1=1718.8, x2=635.0, y2=1755.2, text="5"),
388-
layout.LayoutElement.from_coords(x1=372.6, y1=1786.9, x2=1333.6, y2=1848.7, text="6"),
389-
]
381+
return layoutelement.LayoutElements.from_list(
382+
[
383+
layout.LayoutElement.from_coords(x1=447.0, y1=315.0, x2=1275.7, y2=413.0, text="0"),
384+
layout.LayoutElement.from_coords(x1=380.6, y1=473.4, x2=1334.8, y2=533.9, text="1"),
385+
layout.LayoutElement.from_coords(x1=578.6, y1=556.8, x2=1109.0, y2=874.4, text="2"),
386+
layout.LayoutElement.from_coords(
387+
x1=444.5,
388+
y1=942.3,
389+
x2=1261.1,
390+
y2=1584.1,
391+
text="3",
392+
),
393+
layout.LayoutElement.from_coords(
394+
x1=444.8,
395+
y1=1609.4,
396+
x2=1257.2,
397+
y2=1665.2,
398+
text="4",
399+
),
400+
layout.LayoutElement.from_coords(
401+
x1=414.0,
402+
y1=1718.8,
403+
x2=635.0,
404+
y2=1755.2,
405+
text="5",
406+
),
407+
layout.LayoutElement.from_coords(
408+
x1=372.6,
409+
y1=1786.9,
410+
x2=1333.6,
411+
y2=1848.7,
412+
text="6",
413+
),
414+
],
415+
)
390416

391417

392418
def test_layout_order(mock_image):

test_unstructured_inference/models/test_model.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@
22
from typing import Any
33
from unittest import mock
44

5+
import numpy as np
56
import pytest
67

78
import unstructured_inference.models.base as models
8-
from unstructured_inference.inference.layoutelement import LayoutElement
9+
from unstructured_inference.inference.layoutelement import LayoutElement, LayoutElements
910
from unstructured_inference.models.unstructuredmodel import (
1011
ModelNotInitializedError,
1112
UnstructuredObjectDetectionModel,
@@ -23,7 +24,7 @@ def initialize(self, *args, **kwargs):
2324
return self.initializer(self, *args, **kwargs)
2425

2526
def predict(self, x: Any) -> Any:
26-
return []
27+
return LayoutElements(element_coords=np.array([]))
2728

2829

2930
MOCK_MODEL_TYPES = {

test_unstructured_inference/models/test_yolox.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,15 @@ def test_layout_yolox_local_parsing_image():
1515
assert len(document_layout.pages) == 1
1616
# NOTE(benjamin) The example sent to the test contains 13 detections
1717
types_known = ["Text", "Section-header", "Page-header"]
18-
known_regions = [e for e in document_layout.pages[0].elements if e.type in types_known]
18+
elements = document_layout.pages[0].elements_array
19+
known_regions = [
20+
e for e in elements.element_class_ids if elements.element_class_id_map[e] in types_known
21+
]
1922
assert len(known_regions) == 13
20-
assert hasattr(
21-
document_layout.pages[0].elements[0],
22-
"prob",
23-
) # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities
23+
# NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities
24+
assert hasattr(elements, "element_probs")
2425
assert isinstance(
25-
document_layout.pages[0].elements[0].prob,
26+
elements.element_probs[0],
2627
float,
2728
) # NOTE(pravin) New Assertion to Make Sure Populated Probability is Float
2829

test_unstructured_inference/test_elements.py

Lines changed: 141 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,22 @@
22
from random import randint
33
from unittest.mock import PropertyMock, patch
44

5+
import numpy as np
56
import pytest
67

78
from unstructured_inference.constants import ElementType
89
from unstructured_inference.inference import elements
9-
from unstructured_inference.inference.elements import Rectangle, TextRegion, ImageTextRegion
10+
from unstructured_inference.inference.elements import (
11+
ImageTextRegion,
12+
Rectangle,
13+
TextRegion,
14+
TextRegions,
15+
)
1016
from unstructured_inference.inference.layoutelement import (
1117
LayoutElement,
18+
LayoutElements,
19+
clean_layoutelements,
20+
clean_layoutelements_for_class,
1221
merge_inferred_layout_with_extracted_layout,
1322
partition_groups_from_regions,
1423
separate,
@@ -31,6 +40,30 @@ def rand_rect(size=10):
3140
return elements.Rectangle(x1, y1, x1 + size, y1 + size)
3241

3342

43+
@pytest.fixture()
44+
def test_layoutelements():
45+
coords = np.array(
46+
[
47+
[0.6, 0.6, 0.65, 0.65], # One little table nested inside all the others
48+
[0.5, 0.5, 0.7, 0.7], # One nested table
49+
[0, 0, 1, 1], # Big table
50+
[0.01, 0.01, 1.01, 1.01],
51+
[0.02, 0.02, 1.02, 1.02],
52+
[0.03, 0.03, 1.03, 1.03],
53+
[0.04, 0.04, 1.04, 1.04],
54+
[0.05, 0.05, 1.05, 1.05],
55+
[2, 2, 3, 3], # Big table
56+
],
57+
)
58+
element_class_ids = np.array([1, 1, 1, 0, 0, 0, 0, 0, 2])
59+
class_map = {0: "type0", 1: "type1", 2: "type2"}
60+
return LayoutElements(
61+
element_coords=coords,
62+
element_class_ids=element_class_ids,
63+
element_class_id_map=class_map,
64+
)
65+
66+
3467
@pytest.mark.parametrize(
3568
("rect1", "rect2", "expected"),
3669
[
@@ -111,14 +144,23 @@ def test_minimal_containing_rect():
111144

112145

113146
def test_partition_groups_from_regions(mock_embedded_text_regions):
114-
words = mock_embedded_text_regions
147+
words = TextRegions.from_list(mock_embedded_text_regions)
115148
groups = partition_groups_from_regions(words)
116149
assert len(groups) == 1
117-
sorted_groups = sorted(groups, key=lambda group: group[0].bbox.y1)
118-
text = "".join([el.text for el in sorted_groups[-1]])
150+
text = "".join(groups[-1].texts)
151+
assert text.startswith("Layout")
152+
# test backward compatibility
153+
text = "".join([str(region) for region in groups[-1].as_list()])
119154
assert text.startswith("Layout")
120155

121156

157+
def test_rectangle_padding():
158+
rect = Rectangle(x1=0, y1=1, x2=3, y2=4)
159+
padded = rect.pad(1)
160+
assert (padded.x1, padded.y1, padded.x2, padded.y2) == (-1, 0, 4, 5)
161+
assert (rect.x1, rect.y1, rect.x2, rect.y2) == (0, 1, 3, 4)
162+
163+
122164
def test_rectangle_area(monkeypatch):
123165
for _ in range(1000):
124166
width = randint(0, 20)
@@ -284,3 +326,98 @@ def test_merge_inferred_layout_with_extracted_layout():
284326
page_image_size=(1700, 2200),
285327
)
286328
assert merged_layout == inferred_layout
329+
330+
331+
def test_clean_layoutelements(test_layoutelements):
332+
elements = clean_layoutelements(test_layoutelements).as_list()
333+
assert len(elements) == 2
334+
assert (
335+
elements[0].bbox.x1,
336+
elements[0].bbox.y1,
337+
elements[0].bbox.x2,
338+
elements[0].bbox.x2,
339+
) == (0, 0, 1, 1)
340+
assert (
341+
elements[1].bbox.x1,
342+
elements[1].bbox.y1,
343+
elements[1].bbox.x2,
344+
elements[1].bbox.x2,
345+
) == (2, 2, 3, 3)
346+
347+
348+
@pytest.mark.parametrize(
349+
("coords", "class_ids", "expected_coords", "expected_ids"),
350+
[
351+
([[0, 0, 1, 1], [0, 0, 1, 1]], [0, 1], [[0, 0, 1, 1]], [0]), # one box
352+
(
353+
[[0, 0, 1, 1], [0, 0, 1, 1], [1, 1, 2, 2]],
354+
[0, 1, 0],
355+
[[0, 0, 1, 1], [1, 1, 2, 2]],
356+
[0, 0],
357+
),
358+
(
359+
[[0, 0, 1.4, 1.4], [0, 0, 1, 1], [0.4, 0, 1.4, 1], [1.2, 0, 1.4, 1]],
360+
[0, 1, 1, 1],
361+
[[0, 0, 1.4, 1.4]],
362+
[0],
363+
),
364+
],
365+
)
366+
def test_clean_layoutelements_cases(
367+
coords,
368+
class_ids,
369+
expected_coords,
370+
expected_ids,
371+
):
372+
coords = np.array(coords)
373+
element_class_ids = np.array(class_ids)
374+
elements = LayoutElements(element_coords=coords, element_class_ids=element_class_ids)
375+
376+
elements = clean_layoutelements(elements)
377+
np.testing.assert_array_equal(elements.element_coords, expected_coords)
378+
np.testing.assert_array_equal(elements.element_class_ids, expected_ids)
379+
380+
381+
@pytest.mark.parametrize(
382+
("coords", "class_ids", "class_to_filter", "expected_coords", "expected_ids"),
383+
[
384+
([[0, 0, 1, 1], [0, 0, 1, 1]], [0, 1], 1, [[0, 0, 1, 1]], [1]), # one box
385+
(
386+
[[0, 0, 1, 1], [0, 0, 1, 1], [1, 1, 2, 2]], # one box
387+
[0, 1, 0],
388+
1,
389+
[[0, 0, 1, 1], [1, 1, 2, 2]],
390+
[1, 0],
391+
),
392+
(
393+
# a -> b, b -> c, but a not -> c
394+
[[0, 0, 1.4, 1.4], [0, 0, 1, 1], [0.4, 0, 1.4, 1], [1.2, 0, 1.4, 1]],
395+
[0, 1, 1, 1],
396+
1,
397+
[[0, 0, 1, 1], [1.2, 0, 1.4, 1], [0, 0, 1.4, 1.4]],
398+
[1, 1, 0],
399+
),
400+
(
401+
# like the case above but a different filtering element type changes the results
402+
[[0, 0, 1.4, 1.4], [0, 0, 1, 1], [0.4, 0, 1.4, 1], [1.2, 0, 1.4, 1]],
403+
[0, 1, 1, 1],
404+
0,
405+
[[0, 0, 1.4, 1.4]],
406+
[0],
407+
),
408+
],
409+
)
410+
def test_clean_layoutelements_for_class(
411+
coords,
412+
class_ids,
413+
class_to_filter,
414+
expected_coords,
415+
expected_ids,
416+
):
417+
coords = np.array(coords)
418+
element_class_ids = np.array(class_ids)
419+
elements = LayoutElements(element_coords=coords, element_class_ids=element_class_ids)
420+
421+
elements = clean_layoutelements_for_class(elements, element_class=class_to_filter)
422+
np.testing.assert_array_equal(elements.element_coords, expected_coords)
423+
np.testing.assert_array_equal(elements.element_class_ids, expected_ids)

0 commit comments

Comments
 (0)