Skip to content

Commit d51fb13

Browse files
authored
Feat/improve iou speed (#3582)
This PR vectorizes the computation of element overlap to speed up the deduplication of extracted elements. ## test This PR adds unit tests for the new vectorized IOU and subregion computation functions. In addition, running partition on large files with many elements, like this slide: [002489.pdf](https://github.com/user-attachments/files/16823176/002489.pdf), shows a reduction of runtime from around 15 min on the main branch to less than 4 min with this branch. Profiling results show that the new implementation greatly reduces the time cost of the computation, and most of the time is now spent on getting the coordinates from a list of bboxes. ![Screenshot 2024-08-30 at 9 29 27 PM](https://github.com/user-attachments/assets/6c186838-54c7-483b-ac3e-7342c23ff3a6)
1 parent 404f780 commit d51fb13

File tree

4 files changed

+170
-31
lines changed

4 files changed

+170
-31
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1-
## 0.15.10-dev1
1+
## 0.15.10-dev2
22

33
### Enhancements
4+
45
* **Modified analysis drawing tools to dump to files and draw from dumps** If the parameter `analysis` of the `partition_pdf` function is set to `True`, the layout for Object Detection, Pdfminer Extraction, OCR and final layouts will be dumped as json files. The drawers now accept dict (dump) objects instead of internal classes instances.
6+
* **Vectorize pdfminer elements deduplication computation**. Use `numpy` operations to compute IOU and sub-region membership instead of simple Python loops. This improves the speed of deduplicating elements for pages with a lot of elements.
57

68
### Features
79

test_unstructured/partition/pdf_image/test_pdfminer_processing.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import numpy as np
12
import pytest
23
from PIL import Image
34
from unstructured_inference.constants import Source as InferenceSource
@@ -6,6 +7,8 @@
67

78
from unstructured.partition.pdf_image.pdfminer_processing import (
89
aggregate_embedded_text_by_block,
10+
bboxes1_is_almost_subregion_of_bboxes2,
11+
boxes_self_iou,
912
clean_pdfminer_duplicate_image_elements,
1013
clean_pdfminer_inner_elements,
1114
)
@@ -153,3 +156,56 @@ def test_aggregate_by_block():
153156

154157
text = aggregate_embedded_text_by_block(target_region, embedded_regions)
155158
assert text == expected
159+
160+
161+
@pytest.mark.parametrize(
    ("coords1", "coords2", "expected"),
    [
        # first box is contained in both candidates; the other two only touch them
        (
            [[0, 0, 10, 10], [10, 0, 20, 10], [10, 10, 20, 20]],
            [[0, 0, 10, 10], [0, 0, 12, 12]],
            [[True, True], [False, False], [False, False]],
        ),
        # a candidate smaller than the box ([0, 0, 5, 5]) never counts as its container
        (
            [[0, 0, 10, 10], [10, 0, 20, 10], [10, 10, 20, 20]],
            [[0, 0, 10, 10], [10, 10, 22, 22], [0, 0, 5, 5]],
            [[True, False, False], [False, False, False], [False, True, False]],
        ),
        # degenerate box with x1 == x2 and y1 == y2 sitting on a shared corner
        (
            [[0, 0, 10, 10], [10, 10, 10, 10]],
            [[0, 0, 10, 10], [10, 10, 22, 22], [0, 0, 5, 5]],
            [[True, False, False], [True, True, False]],
        ),
    ],
)
def test_bboxes1_is_almost_subregion_of_bboxes2(coords1, coords2, expected):
    """Check the pairwise subregion mask: one row per box in `coords1`, one column per box in
    `coords2`."""
    bboxes1 = [Rectangle(*row) for row in coords1]
    bboxes2 = [Rectangle(*row) for row in coords2]
    np.testing.assert_array_equal(
        bboxes1_is_almost_subregion_of_bboxes2(bboxes1, bboxes2), expected
    )
187+
188+
189+
@pytest.mark.parametrize(
    ("coords", "threshold", "expected"),
    [
        # the first two boxes overlap heavily enough to match at a 0.5 threshold
        (
            [[0, 0, 10, 10], [2, 2, 12, 12], [10, 10, 20, 20]],
            0.5,
            [[True, True, False], [True, True, False], [False, False, True]],
        ),
        # same boxes, stricter threshold: each box only matches itself
        (
            [[0, 0, 10, 10], [2, 2, 12, 12], [10, 10, 20, 20]],
            0.9,
            [[True, False, False], [False, True, False], [False, False, True]],
        ),
        # degenerate box with x1 == x2 and y1 == y2 on the corner of a regular box
        (
            [[0, 0, 10, 10], [10, 10, 10, 10]],
            0.5,
            [[True, False], [False, True]],
        ),
    ],
)
def test_boxes_self_iou(coords, threshold, expected):
    """Check the square boolean mask of box pairs whose IOU exceeds `threshold`."""
    bboxes = [Rectangle(*row) for row in coords]
    np.testing.assert_array_equal(boxes_self_iou(bboxes, threshold), expected)

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.15.10-dev1" # pragma: no cover
1+
__version__ = "0.15.10-dev2" # pragma: no cover

unstructured/partition/pdf_image/pdfminer_processing.py

Lines changed: 110 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from typing import TYPE_CHECKING, BinaryIO, List, Optional, Union, cast
22

3+
import numpy as np
34
from pdfminer.utils import open_filename
45

56
from unstructured.documents.elements import ElementType
@@ -19,6 +20,9 @@
1920
from unstructured_inference.inference.layout import DocumentLayout
2021

2122

23+
EPSILON_AREA = 0.01
24+
25+
2226
def process_file_with_pdfminer(
2327
filename: str = "",
2428
dpi: int = 200,
@@ -96,6 +100,57 @@ def _create_text_region(x1, y1, x2, y2, coef, text, source, region_class):
96100
)
97101

98102

103+
def get_coords_from_bboxes(bboxes) -> np.ndarray:
    """Convert a list of boxes' coordinates into a 2-D numpy array.

    Args:
        bboxes: sized sequence of objects exposing `x1`, `y1`, `x2` and `y2` attributes.

    Returns:
        Float array of shape `(len(bboxes), 4)`; row `i` holds `[x1, y1, x2, y2]` of
        `bboxes[i]`. An empty input yields an array of shape `(0, 4)`.
    """
    # preallocate memory
    coords = np.zeros((len(bboxes), 4))

    for i, bbox in enumerate(bboxes):
        coords[i, :] = [bbox.x1, bbox.y1, bbox.x2, bbox.y2]

    return coords
112+
113+
114+
def areas_of_boxes_and_intersection_area(
    coords1: np.ndarray, coords2: np.ndarray, threshold: float = 0.5
):
    """Compute pairwise intersection areas and own areas for two groups of bounding boxes.

    Widths and heights are measured as `x2 - x1 + 1` (and likewise for y), so a box with
    `x1 == x2` and `y1 == y2` has area 1 rather than 0.

    Args:
        coords1: array of shape `(n, 4)` whose columns are `x1, y1, x2, y2`.
        coords2: array of shape `(m, 4)` with the same column layout.
        threshold: not used by this function; kept so callers that pass it keep working.

    Returns:
        A tuple `(inter_area, boxa_area, boxb_area)`:
        `inter_area` has shape `(n, m)` and holds the intersection area of every box in
        `coords1` with every box in `coords2` (0 where they do not overlap);
        `boxa_area` has shape `(n, 1)` and holds the areas of the boxes in `coords1`;
        `boxb_area` has shape `(m, 1)` and holds the areas of the boxes in `coords2`.
    """
    # each piece is a column vector, so pairing one with a transposed (row) vector broadcasts
    # to the full n x m grid of box pairs
    x11, y11, x12, y12 = np.split(coords1, 4, axis=1)
    x21, y21, x22, y22 = np.split(coords2, 4, axis=1)

    # corners of the intersection rectangle for every pair
    xa = np.maximum(x11, np.transpose(x21))
    ya = np.maximum(y11, np.transpose(y21))
    xb = np.minimum(x12, np.transpose(x22))
    yb = np.minimum(y12, np.transpose(y22))

    # clamp at 0 so that non-overlapping pairs contribute no area
    inter_area = np.maximum((xb - xa + 1), 0) * np.maximum((yb - ya + 1), 0)
    boxa_area = (x12 - x11 + 1) * (y12 - y11 + 1)
    boxb_area = (x22 - x21 + 1) * (y22 - y21 + 1)

    return inter_area, boxa_area, boxb_area
131+
132+
133+
def bboxes1_is_almost_subregion_of_bboxes2(bboxes1, bboxes2, threshold: float = 0.5) -> np.ndarray:
    """Compute if each element from bboxes1 is almost a subregion of one or more elements in
    bboxes2.

    A box from `bboxes1` counts as almost a subregion of a box from `bboxes2` when their
    intersection covers more than `threshold` of the first box's own area and the first box's
    area is not larger than the second's.

    Args:
        bboxes1: boxes to test; each exposes `x1`, `y1`, `x2`, `y2`.
        bboxes2: candidate containing boxes with the same attributes.
        threshold: minimum fraction (exclusive) of a `bboxes1` box that must be covered.

    Returns:
        Boolean array with one row per box in `bboxes1` and one column per box in `bboxes2`.
    """
    coords1, coords2 = get_coords_from_bboxes(bboxes1), get_coords_from_bboxes(bboxes2)

    inter_area, boxa_area, boxb_area = areas_of_boxes_and_intersection_area(coords1, coords2)

    # EPSILON_AREA keeps the denominator strictly positive
    covered_fraction = inter_area / np.maximum(boxa_area, EPSILON_AREA)

    return (covered_fraction > threshold) & (boxa_area <= boxb_area.T)
143+
144+
145+
def boxes_self_iou(bboxes, threshold: float = 0.5) -> np.ndarray:
    """Flag the pairs of elements in a group whose IOU exceeds `threshold`.

    Args:
        bboxes: boxes to compare against each other; each exposes `x1`, `y1`, `x2`, `y2`.
        threshold: IOU value a pair must exceed (exclusive) to be flagged.

    Returns:
        Square boolean array with one row and one column per box in `bboxes`; an entry is
        True when the intersection-over-union of the corresponding pair is greater than
        `threshold`. The IOU values themselves are not returned.
    """
    coords = get_coords_from_bboxes(bboxes)

    inter_area, boxa_area, boxb_area = areas_of_boxes_and_intersection_area(coords, coords)

    # union = area(a) + area(b) - intersection; EPSILON_AREA keeps the denominator positive
    union_area = np.maximum(EPSILON_AREA, boxa_area + boxb_area.T - inter_area)

    return (inter_area / union_area) > threshold
152+
153+
99154
@requires_dependencies("unstructured_inference")
100155
def merge_inferred_with_extracted_layout(
101156
inferred_document_layout: "DocumentLayout",
@@ -168,45 +223,71 @@ def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout
168223
"""
169224

170225
for page in document.pages:
171-
tables = [e for e in page.elements if e.type == ElementType.TABLE]
226+
table_boxes = [e.bbox for e in page.elements if e.type == ElementType.TABLE]
227+
element_boxes = []
228+
element_to_subregion_map = {}
229+
subregion_indice = 0
172230
for i, element in enumerate(page.elements):
173231
if element.source != Source.PDFMINER:
174232
continue
175-
subregion_threshold = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
176-
element_inside_table = [
177-
element.bbox.is_almost_subregion_of(t.bbox, subregion_threshold) for t in tables
178-
]
179-
if sum(element_inside_table) == 1:
180-
page.elements[i] = None
181-
page.elements = [e for e in page.elements if e]
233+
element_boxes.append(element.bbox)
234+
element_to_subregion_map[i] = subregion_indice
235+
subregion_indice += 1
236+
237+
is_element_subregion_of_tables = (
238+
bboxes1_is_almost_subregion_of_bboxes2(
239+
element_boxes,
240+
table_boxes,
241+
env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
242+
).sum(axis=1)
243+
== 1
244+
)
245+
246+
page.elements = [
247+
e
248+
for i, e in enumerate(page.elements)
249+
if (
250+
(i not in element_to_subregion_map)
251+
or not is_element_subregion_of_tables[element_to_subregion_map[i]]
252+
)
253+
]
182254

183255
return document
184256

185257

186258
def clean_pdfminer_duplicate_image_elements(document: "DocumentLayout") -> "DocumentLayout":
187259
"""Removes duplicate image elements extracted by PDFMiner from a document layout."""
188260

189-
from unstructured_inference.inference.elements import (
190-
region_bounding_boxes_are_almost_the_same,
191-
)
192-
193261
for page in document.pages:
194-
image_elements = []
262+
image_bboxes = []
263+
texts = []
264+
bbox_to_iou_mapping = {}
265+
current_idx = 0
195266
for i, element in enumerate(page.elements):
196267
if element.source != Source.PDFMINER or element.type != ElementType.IMAGE:
197268
continue
269+
image_bboxes.append(element.bbox)
270+
texts.append(element.text)
271+
bbox_to_iou_mapping[i] = current_idx
272+
current_idx += 1
273+
274+
iou = boxes_self_iou(image_bboxes, env_config.EMBEDDED_IMAGE_SAME_REGION_THRESHOLD)
198275

199-
# check if this element is a duplicate
276+
filtered_elements = []
277+
for i, element in enumerate(page.elements[:-1]):
278+
if element.source != Source.PDFMINER or element.type != ElementType.IMAGE:
279+
filtered_elements.append(element)
280+
continue
281+
text = element.text
282+
this_idx = bbox_to_iou_mapping[i]
200283
if any(
201-
e.text == element.text
202-
and region_bounding_boxes_are_almost_the_same(
203-
e.bbox, element.bbox, env_config.EMBEDDED_IMAGE_SAME_REGION_THRESHOLD
204-
)
205-
for e in image_elements
284+
text == texts[potential_match + this_idx + 1]
285+
for potential_match in np.where(iou[this_idx, this_idx + 1 :])[0]
206286
):
207-
page.elements[i] = None
208-
image_elements.append(element)
209-
page.elements = [e for e in page.elements if e]
287+
continue
288+
else:
289+
filtered_elements.append(element)
290+
page.elements[:-1] = filtered_elements
210291

211292
return document
212293

@@ -218,11 +299,11 @@ def aggregate_embedded_text_by_block(
218299
"""Extracts the text aggregated from the elements of the given layout that lie within the given
219300
block."""
220301

221-
subregion_threshold = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
222-
filtered_blocks = [
223-
obj
224-
for obj in pdf_objects
225-
if obj.bbox.is_almost_subregion_of(text_region.bbox, subregion_threshold)
226-
]
227-
text = " ".join([x.text for x in filtered_blocks if x.text])
302+
mask = bboxes1_is_almost_subregion_of_bboxes2(
303+
[obj.bbox for obj in pdf_objects],
304+
[text_region.bbox],
305+
env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
306+
).sum(axis=1)
307+
308+
text = " ".join([obj.text for i, obj in enumerate(pdf_objects) if (mask[i] and obj.text)])
228309
return text

0 commit comments

Comments
 (0)