Skip to content

Commit 9a3f925

Browse files
authored
chore: remove merging logic (#447)
The merging logic was moved to `unstructured` a long time ago; this PR removes the leftover logic here. <!-- CURSOR_SUMMARY --> --- > [!NOTE] > Removes obsolete layout merge logic and related tests, drops unused MERGED source, and bumps version to 1.0.10 with changelog update. > > - **Core**: > - Remove `merge_inferred_layout_with_extracted_layout` from `unstructured_inference/inference/layoutelement.py` and associated unused imports. > - **Tests**: > - Delete `test_merge_inferred_layout_with_extracted_layout` and related imports in `test_unstructured_inference/test_elements.py`. > - **Constants**: > - Drop unused `Source.MERGED` from `unstructured_inference/constants.py`. > - **Versioning/Docs**: > - Bump `__version__` to `1.0.10` and update `CHANGELOG.md`. > > <sup>Written by [Cursor Bugbot](https://cursor.com/dashboard?tab=bugbot) for commit 5d7f9a1. This will update automatically on new commits. Configure [here](https://cursor.com/dashboard?tab=bugbot).</sup> <!-- /CURSOR_SUMMARY -->
1 parent 02f6fcc commit 9a3f925

File tree

5 files changed

+6
-150
lines changed

5 files changed

+6
-150
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 1.0.10
2+
3+
* Remove merging logic that's no longer used
4+
15
## 1.0.9
26

37
* Make OD model loading thread safe

test_unstructured_inference/test_elements.py

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -5,20 +5,15 @@
55
import numpy as np
66
import pytest
77

8-
from unstructured_inference.constants import ElementType
98
from unstructured_inference.inference import elements
109
from unstructured_inference.inference.elements import (
11-
ImageTextRegion,
1210
Rectangle,
13-
TextRegion,
1411
TextRegions,
1512
)
1613
from unstructured_inference.inference.layoutelement import (
17-
LayoutElement,
1814
LayoutElements,
1915
clean_layoutelements,
2016
clean_layoutelements_for_class,
21-
merge_inferred_layout_with_extracted_layout,
2217
partition_groups_from_regions,
2318
separate,
2419
)
@@ -297,40 +292,6 @@ def test_separate(rect1, rect2):
297292
# assert not rect1.intersects(rect2) #TODO: fix this test
298293

299294

300-
def test_merge_inferred_layout_with_extracted_layout():
301-
inferred_layout = [
302-
LayoutElement.from_coords(453, 322, 1258, 408, text=None, type=ElementType.SECTION_HEADER),
303-
LayoutElement.from_coords(387, 477, 1320, 537, text=None, type=ElementType.TEXT),
304-
]
305-
306-
extracted_layout = [
307-
TextRegion.from_coords(438, 318, 1272, 407, text="Example Section Header"),
308-
TextRegion.from_coords(377, 469, 1335, 535, text="Example Title"),
309-
]
310-
311-
extracted_layout_with_full_page_image = [
312-
ImageTextRegion.from_coords(0, 0, 1700, 2200, text="Example Section Header"),
313-
]
314-
315-
merged_layout = merge_inferred_layout_with_extracted_layout(
316-
inferred_layout=inferred_layout,
317-
extracted_layout=extracted_layout,
318-
page_image_size=(1700, 2200),
319-
)
320-
assert merged_layout[0].type == ElementType.SECTION_HEADER
321-
assert merged_layout[0].text == "Example Section Header"
322-
assert merged_layout[1].type == ElementType.TEXT
323-
assert merged_layout[1].text == "Example Title"
324-
325-
# case: extracted layout with a full page image
326-
merged_layout = merge_inferred_layout_with_extracted_layout(
327-
inferred_layout=inferred_layout,
328-
extracted_layout=extracted_layout_with_full_page_image,
329-
page_image_size=(1700, 2200),
330-
)
331-
assert merged_layout == inferred_layout
332-
333-
334295
def test_clean_layoutelements(test_layoutelements):
335296
elements = clean_layoutelements(test_layoutelements).as_list()
336297
assert len(elements) == 2
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.0.9" # pragma: no cover
1+
__version__ = "1.0.10" # pragma: no cover

unstructured_inference/constants.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ class Source(Enum):
55
YOLOX = "yolox"
66
DETECTRON2_ONNX = "detectron2_onnx"
77
DETECTRON2_LP = "detectron2_lp"
8-
MERGED = "merged"
98

109

1110
class ElementType:

unstructured_inference/inference/layoutelement.py

Lines changed: 1 addition & 109 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,18 @@
11
from __future__ import annotations
22

33
from dataclasses import dataclass, field
4-
from typing import Any, Collection, Iterable, List, Optional
4+
from typing import Any, Iterable, List, Optional
55

66
import numpy as np
77
from pandas import DataFrame
88
from scipy.sparse.csgraph import connected_components
99

1010
from unstructured_inference.config import inference_config
11-
from unstructured_inference.constants import (
12-
FULL_PAGE_REGION_THRESHOLD,
13-
ElementType,
14-
)
1511
from unstructured_inference.inference.elements import (
16-
ImageTextRegion,
1712
Rectangle,
1813
TextRegion,
1914
TextRegions,
2015
coords_intersections,
21-
grow_region_to_match_region,
22-
region_bounding_boxes_are_almost_the_same,
2316
)
2417

2518
EPSILON_AREA = 1e-7
@@ -221,107 +214,6 @@ def from_region(cls, region: TextRegion):
221214
return cls(text=text, source=source, type=type, prob=prob, bbox=region.bbox)
222215

223216

224-
def merge_inferred_layout_with_extracted_layout(
225-
inferred_layout: Collection[LayoutElement],
226-
extracted_layout: Collection[TextRegion],
227-
page_image_size: tuple,
228-
same_region_threshold: float = inference_config.LAYOUT_SAME_REGION_THRESHOLD,
229-
subregion_threshold: float = inference_config.LAYOUT_SUBREGION_THRESHOLD,
230-
) -> List[LayoutElement]:
231-
"""Merge two layouts to produce a single layout."""
232-
extracted_elements_to_add: List[TextRegion] = []
233-
inferred_regions_to_remove = []
234-
w, h = page_image_size
235-
full_page_region = Rectangle(0, 0, w, h)
236-
for extracted_region in extracted_layout:
237-
extracted_is_image = isinstance(extracted_region, ImageTextRegion)
238-
if extracted_is_image:
239-
# Skip extracted images for this purpose, we don't have the text from them and they
240-
# don't provide good text bounding boxes.
241-
242-
is_full_page_image = region_bounding_boxes_are_almost_the_same(
243-
extracted_region.bbox,
244-
full_page_region,
245-
FULL_PAGE_REGION_THRESHOLD,
246-
)
247-
248-
if is_full_page_image:
249-
continue
250-
region_matched = False
251-
for inferred_region in inferred_layout:
252-
253-
if inferred_region.bbox.intersects(extracted_region.bbox):
254-
same_bbox = region_bounding_boxes_are_almost_the_same(
255-
inferred_region.bbox,
256-
extracted_region.bbox,
257-
same_region_threshold,
258-
)
259-
inferred_is_subregion_of_extracted = inferred_region.bbox.is_almost_subregion_of(
260-
extracted_region.bbox,
261-
subregion_threshold=subregion_threshold,
262-
)
263-
inferred_is_text = inferred_region.type not in (
264-
ElementType.FIGURE,
265-
ElementType.IMAGE,
266-
ElementType.PAGE_BREAK,
267-
ElementType.TABLE,
268-
)
269-
extracted_is_subregion_of_inferred = extracted_region.bbox.is_almost_subregion_of(
270-
inferred_region.bbox,
271-
subregion_threshold=subregion_threshold,
272-
)
273-
either_region_is_subregion_of_other = (
274-
inferred_is_subregion_of_extracted or extracted_is_subregion_of_inferred
275-
)
276-
if same_bbox:
277-
# Looks like these represent the same region
278-
if extracted_is_image:
279-
# keep extracted region, remove inferred region
280-
inferred_regions_to_remove.append(inferred_region)
281-
else:
282-
# keep inferred region, remove extracted region
283-
grow_region_to_match_region(inferred_region.bbox, extracted_region.bbox)
284-
inferred_region.text = extracted_region.text
285-
region_matched = True
286-
elif extracted_is_subregion_of_inferred and inferred_is_text:
287-
if extracted_is_image:
288-
# keep both extracted and inferred regions
289-
region_matched = False
290-
else:
291-
# keep inferred region, remove extracted region
292-
grow_region_to_match_region(inferred_region.bbox, extracted_region.bbox)
293-
region_matched = True
294-
elif (
295-
either_region_is_subregion_of_other
296-
and inferred_region.type != ElementType.TABLE
297-
):
298-
# keep extracted region, remove inferred region
299-
inferred_regions_to_remove.append(inferred_region)
300-
if not region_matched:
301-
extracted_elements_to_add.append(extracted_region)
302-
# Need to classify the extracted layout elements we're keeping.
303-
categorized_extracted_elements_to_add = [
304-
LayoutElement(
305-
text=el.text,
306-
type=(
307-
ElementType.IMAGE
308-
if isinstance(el, ImageTextRegion)
309-
else ElementType.UNCATEGORIZED_TEXT
310-
),
311-
source=el.source,
312-
bbox=el.bbox,
313-
)
314-
for el in extracted_elements_to_add
315-
]
316-
inferred_regions_to_add = [
317-
region for region in inferred_layout if region not in inferred_regions_to_remove
318-
]
319-
320-
final_layout = categorized_extracted_elements_to_add + inferred_regions_to_add
321-
322-
return final_layout
323-
324-
325217
def separate(region_a: Rectangle, region_b: Rectangle):
326218
"""Reduce leftmost rectangle to don't overlap with the other"""
327219

0 commit comments

Comments
 (0)