|
1 | 1 | from __future__ import annotations |
2 | 2 |
|
3 | 3 | from dataclasses import dataclass, field |
4 | | -from typing import Any, Collection, Iterable, List, Optional |
| 4 | +from typing import Any, Iterable, List, Optional |
5 | 5 |
|
6 | 6 | import numpy as np |
7 | 7 | from pandas import DataFrame |
8 | 8 | from scipy.sparse.csgraph import connected_components |
9 | 9 |
|
10 | 10 | from unstructured_inference.config import inference_config |
11 | | -from unstructured_inference.constants import ( |
12 | | - FULL_PAGE_REGION_THRESHOLD, |
13 | | - ElementType, |
14 | | -) |
15 | 11 | from unstructured_inference.inference.elements import ( |
16 | | - ImageTextRegion, |
17 | 12 | Rectangle, |
18 | 13 | TextRegion, |
19 | 14 | TextRegions, |
20 | 15 | coords_intersections, |
21 | | - grow_region_to_match_region, |
22 | | - region_bounding_boxes_are_almost_the_same, |
23 | 16 | ) |
24 | 17 |
|
25 | 18 | EPSILON_AREA = 1e-7 |
@@ -221,107 +214,6 @@ def from_region(cls, region: TextRegion): |
221 | 214 | return cls(text=text, source=source, type=type, prob=prob, bbox=region.bbox) |
222 | 215 |
|
223 | 216 |
|
def merge_inferred_layout_with_extracted_layout(
    inferred_layout: Collection[LayoutElement],
    extracted_layout: Collection[TextRegion],
    page_image_size: tuple,
    same_region_threshold: float = inference_config.LAYOUT_SAME_REGION_THRESHOLD,
    subregion_threshold: float = inference_config.LAYOUT_SUBREGION_THRESHOLD,
) -> List[LayoutElement]:
    """Merge an inferred layout and an extracted layout into a single layout.

    Every extracted region is compared against every inferred region whose bounding box
    intersects it, and for each such pair one of the following applies:

    * Boxes are almost the same: an extracted image replaces the inferred region; an
      extracted text region is absorbed into the inferred region, which also takes over
      the extracted text.
    * The extracted region is almost a subregion of an inferred region whose type is not
      figure/image/page-break/table: extracted text is absorbed into the inferred region;
      an extracted image is kept alongside the inferred region.
    * Otherwise, if either region is almost a subregion of the other and the inferred
      region is not a table: the inferred region is dropped in favour of the extracted one.

    Extracted regions that were not absorbed are converted to ``LayoutElement`` objects
    (typed ``IMAGE`` for ``ImageTextRegion`` instances, ``UNCATEGORIZED_TEXT`` otherwise).

    Args:
        inferred_layout: Layout elements to merge into. Elements may be modified: their
            ``text`` is overwritten when they absorb an almost-identical extracted region.
        extracted_layout: Extracted regions to merge in.
        page_image_size: ``(width, height)`` of the page image; used to build the
            full-page rectangle that full-page extracted images are compared against.
        same_region_threshold: Threshold passed to
            ``region_bounding_boxes_are_almost_the_same`` for the inferred/extracted pair.
        subregion_threshold: Threshold passed to ``Rectangle.is_almost_subregion_of``.

    Returns:
        The converted extracted elements that are kept, followed by the inferred
        elements that were not marked for removal.
    """
    page_width, page_height = page_image_size
    full_page_region = Rectangle(0, 0, page_width, page_height)
    # Inferred types that are not treated as text when deciding whether an extracted
    # region can be absorbed into the inferred one.
    non_text_types = (
        ElementType.FIGURE,
        ElementType.IMAGE,
        ElementType.PAGE_BREAK,
        ElementType.TABLE,
    )

    extracted_elements_to_add: List[TextRegion] = []
    inferred_regions_to_remove: List[LayoutElement] = []

    for extracted_region in extracted_layout:
        extracted_is_image = isinstance(extracted_region, ImageTextRegion)
        # Only extracted images that cover (almost) the whole page are skipped outright;
        # every other extracted image still goes through the matching below.
        if extracted_is_image and region_bounding_boxes_are_almost_the_same(
            extracted_region.bbox,
            full_page_region,
            FULL_PAGE_REGION_THRESHOLD,
        ):
            continue

        # Becomes True only when an inferred region absorbs this (non-image) extracted
        # region, in which case the extracted region is not added to the output.
        region_matched = False
        for inferred_region in inferred_layout:
            if not inferred_region.bbox.intersects(extracted_region.bbox):
                continue

            same_bbox = region_bounding_boxes_are_almost_the_same(
                inferred_region.bbox,
                extracted_region.bbox,
                same_region_threshold,
            )
            inferred_is_subregion_of_extracted = inferred_region.bbox.is_almost_subregion_of(
                extracted_region.bbox,
                subregion_threshold=subregion_threshold,
            )
            inferred_is_text = inferred_region.type not in non_text_types
            extracted_is_subregion_of_inferred = extracted_region.bbox.is_almost_subregion_of(
                inferred_region.bbox,
                subregion_threshold=subregion_threshold,
            )

            if same_bbox:
                # Looks like these represent the same region.
                if extracted_is_image:
                    # Keep extracted region, remove inferred region.
                    inferred_regions_to_remove.append(inferred_region)
                else:
                    # Keep inferred region, remove extracted region.
                    # NOTE(review): the return value is unused, so this presumably grows
                    # the inferred bbox in place -- confirm against the helper.
                    grow_region_to_match_region(inferred_region.bbox, extracted_region.bbox)
                    inferred_region.text = extracted_region.text
                    region_matched = True
            elif extracted_is_subregion_of_inferred and inferred_is_text:
                # An extracted image inside inferred text is kept together with the
                # inferred region, so nothing is recorded for it here.
                if not extracted_is_image:
                    # Keep inferred region, remove extracted region.
                    grow_region_to_match_region(inferred_region.bbox, extracted_region.bbox)
                    region_matched = True
            elif (
                inferred_is_subregion_of_extracted or extracted_is_subregion_of_inferred
            ) and inferred_region.type != ElementType.TABLE:
                # Keep extracted region, remove inferred region.
                inferred_regions_to_remove.append(inferred_region)

        if not region_matched:
            extracted_elements_to_add.append(extracted_region)

    # The extracted regions being kept still need an element type.
    categorized_extracted_elements_to_add = [
        LayoutElement(
            text=el.text,
            type=(
                ElementType.IMAGE
                if isinstance(el, ImageTextRegion)
                else ElementType.UNCATEGORIZED_TEXT
            ),
            source=el.source,
            bbox=el.bbox,
        )
        for el in extracted_elements_to_add
    ]
    # NOTE(review): `not in` on a list compares with `==`, so an inferred element that
    # compares equal to a removed one is dropped as well -- confirm this is intended.
    inferred_regions_to_add = [
        region for region in inferred_layout if region not in inferred_regions_to_remove
    ]

    return categorized_extracted_elements_to_add + inferred_regions_to_add
323 | | - |
324 | | - |
325 | 217 | def separate(region_a: Rectangle, region_b: Rectangle): |
326 | 218 | """Reduce leftmost rectangle to don't overlap with the other""" |
327 | 219 |
|
|
0 commit comments