Skip to content

Commit 8f2a719

Browse files
Feat/refactor layoutelement textregion to vectorized data structure (#3881)
This PR refactors the data structure for `list[LayoutElement]` and `list[TextRegion]` used in partitioning pdf/image files. - the new data structure replaces a list of objects with one object that uses `numpy` arrays to store data - this only affects internal partition steps and it doesn't change the input or output signature of the `partition` function itself, i.e., `partition` still returns `list[Element]` - internally `list[LayoutElement]` -> `LayoutElements`; `list[TextRegion]` -> `TextRegions` - the current refactor stops before the cleanup of pdfminer elements inside inferred layout elements -> the cleanup algorithm needs to be refactored before the data structure refactor can move forward. So the current refactor converts the array data structure into a list data structure with an `element_array.as_list()` call. This is the last step before turning `list[LayoutElement]` into `list[Element]` as the return value - a future PR will update this last step so that we build `list[Element]` from the `LayoutElements` data structure instead. The goal of this PR is to replace the data structure as much as possible without changing the underlying logic. There are a few places where the slicing or filtering logic was simple enough to be converted into vector data structure operations. Those are refactored to be vector based. As a result there are some small improvements observed in the ingest tests. This is likely because the vector operations cleaned up some previous inconsistencies in data types and operations. --------- Co-authored-by: ryannikolaidis <[email protected]> Co-authored-by: badGarnet <[email protected]>
1 parent 8d0b68a commit 8f2a719

File tree

21 files changed

+647
-429
lines changed

21 files changed

+647
-429
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
## 0.16.16-dev0
2+
3+
### Enhancements
4+
5+
### Features
6+
- **Vectorize layout (inferred, extracted, and OCR) data structure** Using `np.ndarray` to store a group of layout elements or text regions instead of using a list of objects. This improves the memory efficiency and compute speed around layout merging and deduplication.
7+
8+
### Fixes
9+
110
## 0.16.15
211

312
### Enhancements

Dockerfile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
FROM quay.io/unstructured-io/base-images:wolfi-base-latest AS base
22

33
ARG PYTHON=python3.11
4-
ARG PIP=pip3.11
4+
ARG PIP="${PYTHON} -m pip"
55

66
USER root
77

@@ -19,6 +19,9 @@ RUN chown -R notebook-user:notebook-user /app && \
1919

2020
USER notebook-user
2121

22+
# append PATH before pip install to avoid warning logs; it also avoids issues with packages that need compilation during installation
23+
ENV PATH="${PATH}:/home/notebook-user/.local/bin"
24+
ENV TESSDATA_PREFIX=/usr/local/share/tessdata
2225
ENV NLTK_DATA=/home/notebook-user/nltk_data
2326

2427
# Install Python dependencies and download required NLTK packages
@@ -28,7 +31,4 @@ RUN find requirements/ -type f -name "*.txt" -exec $PIP install --no-cache-dir -
2831
$PYTHON -c "from unstructured.partition.model_init import initialize; initialize()" && \
2932
$PYTHON -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
3033

31-
ENV PATH="${PATH}:/home/notebook-user/.local/bin"
32-
ENV TESSDATA_PREFIX=/usr/local/share/tessdata
33-
3434
CMD ["/bin/bash"]

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,7 +308,7 @@ docker-test:
308308
$(DOCKER_IMAGE) \
309309
bash -c "CI=$(CI) \
310310
UNSTRUCTURED_INCLUDE_DEBUG_METADATA=$(UNSTRUCTURED_INCLUDE_DEBUG_METADATA) \
311-
pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
311+
python3 -m pytest $(if $(TEST_FILE),$(TEST_FILE),test_unstructured)"
312312

313313
.PHONY: docker-smoke-test
314314
docker-smoke-test:

test_unstructured/partition/pdf_image/test_image.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def __init__(self, number: int, image: Image):
7979
text="Charlie Brown and the Great Pumpkin",
8080
),
8181
]
82+
self.elements_array = layout.LayoutElements.from_list(self.elements)
8283

8384

8485
class MockDocumentLayout(layout.DocumentLayout):
@@ -254,7 +255,10 @@ def test_partition_image_with_ocr_detects_korean():
254255
)
255256

256257
assert elements[0].text == "RULES AND INSTRUCTIONS"
257-
assert elements[3].text.replace(" ", "").startswith("안녕하세요")
258+
# FIXME (yao): revisit this lstrip after refactoring the merging logic; right now docker and
259+
# local testing yield different results and on docker there is a "," at the start of the Korean
260+
# text line
261+
assert elements[3].text.replace(" ", "").lstrip(",").startswith("안녕하세요")
258262

259263

260264
def test_partition_image_with_ocr_detects_korean_from_file():
@@ -267,7 +271,7 @@ def test_partition_image_with_ocr_detects_korean_from_file():
267271
)
268272

269273
assert elements[0].text == "RULES AND INSTRUCTIONS"
270-
assert elements[3].text.replace(" ", "").startswith("안녕하세요")
274+
assert elements[3].text.replace(" ", "").lstrip(",").startswith("안녕하세요")
271275

272276

273277
def test_partition_image_raises_with_bad_strategy():
@@ -579,6 +583,7 @@ def inference_results():
579583
image=mock.MagicMock(format="JPEG"),
580584
)
581585
page.elements = [layout.LayoutElement.from_coords(0, 0, 600, 800, text="hello")]
586+
page.elements_array = layout.LayoutElements.from_list(page.elements)
582587
doc = layout.DocumentLayout(pages=[page])
583588
return doc
584589

test_unstructured/partition/pdf_image/test_inference_utils.py

Lines changed: 69 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from unstructured_inference.inference.elements import TextRegion, TextRegions
2-
from unstructured_inference.inference.layoutelement import LayoutElement
2+
from unstructured_inference.inference.layoutelement import LayoutElement, LayoutElements
33

44
from unstructured.documents.elements import ElementType
55
from unstructured.partition.pdf_image.inference_utils import (
@@ -22,16 +22,72 @@ def test_merge_text_regions(mock_embedded_text_regions):
2222

2323

2424
def test_build_layout_elements_from_ocr_regions(mock_embedded_text_regions):
25-
expected = [
26-
LayoutElement.from_coords(
27-
x1=437.83888888888885,
28-
y1=317.319341111111,
29-
x2=1256.334784222222,
30-
y2=406.9837855555556,
31-
text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
32-
type=ElementType.UNCATEGORIZED_TEXT,
33-
),
34-
]
35-
36-
elements = build_layout_elements_from_ocr_regions(mock_embedded_text_regions)
25+
expected = LayoutElements.from_list(
26+
[
27+
LayoutElement.from_coords(
28+
x1=437.83888888888885,
29+
y1=317.319341111111,
30+
x2=1256.334784222222,
31+
y2=406.9837855555556,
32+
text="LayoutParser: A Unified Toolkit for Deep Learning Based Document Image",
33+
type=ElementType.UNCATEGORIZED_TEXT,
34+
),
35+
]
36+
)
37+
38+
elements = build_layout_elements_from_ocr_regions(
39+
TextRegions.from_list(mock_embedded_text_regions)
40+
)
3741
assert elements == expected
42+
43+
44+
def test_build_layout_elements_from_ocr_regions_with_text(mock_embedded_text_regions):
45+
text = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image"
46+
expected = LayoutElements.from_list(
47+
[
48+
LayoutElement.from_coords(
49+
x1=437.83888888888885,
50+
y1=317.319341111111,
51+
x2=1256.334784222222,
52+
y2=406.9837855555556,
53+
text=text,
54+
type=ElementType.UNCATEGORIZED_TEXT,
55+
),
56+
]
57+
)
58+
59+
elements = build_layout_elements_from_ocr_regions(
60+
TextRegions.from_list(mock_embedded_text_regions),
61+
text,
62+
group_by_ocr_text=True,
63+
)
64+
assert elements == expected
65+
66+
67+
def test_build_layout_elements_from_ocr_regions_with_multi_line_text(mock_embedded_text_regions):
68+
text = "LayoutParser: \n\nA Unified Toolkit for Deep Learning Based Document Image"
69+
elements = build_layout_elements_from_ocr_regions(
70+
TextRegions.from_list(mock_embedded_text_regions),
71+
text,
72+
group_by_ocr_text=True,
73+
)
74+
assert elements == LayoutElements.from_list(
75+
[
76+
LayoutElement.from_coords(
77+
x1=453.00277777777774,
78+
y1=317.319341111111,
79+
x2=711.5338541666665,
80+
y2=358.28571222222206,
81+
text="LayoutParser:",
82+
type=ElementType.UNCATEGORIZED_TEXT,
83+
),
84+
LayoutElement.from_coords(
85+
x1=437.83888888888885,
86+
y1=317.319341111111,
87+
x2=1256.334784222222,
88+
y2=406.9837855555556,
89+
text="A Unified Toolkit for Deep Learning Based Document Image",
90+
type=ElementType.UNCATEGORIZED_TEXT,
91+
),
92+
]
93+
)

0 commit comments

Comments
 (0)