Skip to content

Commit 00b4936

Browse files
Feat/219 keep extracted image elements (#225)
update `merge_inferred_layout_with_extracted_layout` to keep extracted image elements
1 parent f4236c8 commit 00b4936

File tree

7 files changed

+22
-8
lines changed

7 files changed

+22
-8
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
## 0.6.4
77

8+
* Add functionality to keep extracted image elements while merging inferred layout with extracted layout
89
* add a function to automatically scale table crop images based on text height so that the text height is optimal for the `tesseract` OCR task
910
* add the new image auto-scaling parameters to `config.py`
1011

examples/image-extraction/requirements.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.
File renamed without changes.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
unstructured-inference
2+
pymupdf
3+
pypdf2

unstructured_inference/inference/layout.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -441,7 +441,7 @@ def annotate(
441441
width = style["width"]
442442
for region in getattr(self, attribute):
443443
if isinstance(region, Rectangle):
444-
required_source = getattr(el, "source", None)
444+
required_source = getattr(region, "source", None)
445445
if "all" in sources or required_source in sources:
446446
img = draw_bbox(
447447
img,

unstructured_inference/inference/layoutelement.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -148,13 +148,24 @@ def merge_inferred_layout_with_extracted_layout(
148148
)
149149
if same_bbox:
150150
# Looks like these represent the same region
151-
grow_region_to_match_region(inferred_region, extracted_region)
152-
inferred_region.text = extracted_region.text
153-
region_matched = True
154-
elif extracted_is_subregion_of_inferred and inferred_is_text and extracted_is_image:
155-
grow_region_to_match_region(inferred_region, extracted_region)
156-
region_matched = True
151+
if extracted_is_image:
152+
# keep extracted region, remove inferred region
153+
inferred_regions_to_remove.append(inferred_region)
154+
else:
155+
# keep inferred region, remove extracted region
156+
grow_region_to_match_region(inferred_region, extracted_region)
157+
inferred_region.text = extracted_region.text
158+
region_matched = True
159+
elif extracted_is_subregion_of_inferred and inferred_is_text:
160+
if extracted_is_image:
161+
# keep both extracted and inferred regions
162+
region_matched = False
163+
else:
164+
# keep inferred region, remove extracted region
165+
grow_region_to_match_region(inferred_region, extracted_region)
166+
region_matched = True
157167
elif either_region_is_subregion_of_other and inferred_region.type != "Table":
168+
# keep extracted region, remove inferred region
158169
inferred_regions_to_remove.append(inferred_region)
159170
if not region_matched:
160171
extracted_elements_to_add.append(extracted_region)

0 commit comments

Comments
 (0)