Skip to content

Commit b0d8a77

Browse files
feat: partition_pdf() set inferred elements text (#3061)
This PR adds the ability to fill inferred elements' text from embedded text (`pdfminer`) without depending on the `unstructured-inference` library. This PR is the second part of moving embedded-text-related code from `unstructured-inference` to `unstructured` and works together with Unstructured-IO/unstructured-inference#349.
1 parent 059fc64 commit b0d8a77

File tree

9 files changed

+100
-15
lines changed

9 files changed

+100
-15
lines changed

CHANGELOG.md

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
1-
## 0.14.1-dev1
1+
## 0.14.1
22

3-
* **Add support for Python 3.12**. `unstructured` now works with Python 3.12!
3+
### Enhancements
4+
5+
* **Refactor code related to embedded text extraction**. The embedded text extraction code is moved from `unstructured-inference` to `unstructured`.
46

57
### Features
8+
69
* **Large improvements to the ingest process:**
710
* Support for multiprocessing and async, with limits for both.
811
* Streamlined the process of mapping CLI invocations to the underlying code
912
* More granular steps introduced to give better control over the process (e.g. a dedicated step to uncompress files already in the local filesystem, and a new optional staging step before upload)
1013
* Use the python client when calling the unstructured api for partitioning or chunking
1114
* Saving the final content is now a dedicated destination connector (local) set as the default if none are provided. Avoids adding new files locally if uploading elsewhere.
1215
* Leverage last modified date when deciding if new files should be downloaded and reprocessed.
16+
* **Add support for Python 3.12**. `unstructured` now works with Python 3.12!
1317

1418
### Fixes
1519

test_unstructured/partition/pdf_image/test_pdf_image_utils.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,3 +343,11 @@ def test_annotate_layout_elements_file_not_found_error():
343343
pdf_image_dpi=200,
344344
is_image=True,
345345
)
346+
347+
348+
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        # Control characters are dropped; only the newline is turned into a space.
        pytest.param("c\to\x0cn\ftrol\ncharacter\rs\b", "control characters"),
        # Quotes and backslashes are ordinary printable characters and must survive.
        pytest.param("\"'\\", "\"'\\"),
    ],
)
def test_remove_control_characters(text, expected):
    """Check the cleaned string for each (text, expected) pair."""
    cleaned = pdf_image_utils.remove_control_characters(text)
    assert cleaned == expected

test_unstructured/partition/pdf_image/test_pdfminer_processing.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import pytest
22
from PIL import Image
33
from unstructured_inference.constants import Source as InferenceSource
4-
from unstructured_inference.inference.elements import Rectangle
4+
from unstructured_inference.inference.elements import Rectangle, TextRegion
55
from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
66

77
from unstructured.partition.pdf_image.pdfminer_processing import (
8+
aggregate_embedded_text_by_block,
89
clean_pdfminer_duplicate_image_elements,
910
clean_pdfminer_inner_elements,
1011
)
@@ -139,3 +140,16 @@ def test_clean_pdfminer_duplicate_image_elements(elements, expected_document_len
139140
cleaned_doc = clean_pdfminer_duplicate_image_elements(document)
140141

141142
assert len(cleaned_doc.pages[0].elements) == expected_document_length
143+
144+
145+
def test_aggregate_by_block():
    """Only the text of regions lying inside the target block is aggregated."""
    block = TextRegion.from_coords(0, 0, 300, 300)

    inside_first = TextRegion.from_coords(0, 0, 20, 20, "Inside region1")
    inside_second = TextRegion.from_coords(50, 50, 150, 150, "Inside region2")
    # Extends past the block's lower-right corner, so it only partially overlaps the block.
    partly_outside = TextRegion.from_coords(250, 250, 350, 350, "Outside region")

    aggregated = aggregate_embedded_text_by_block(
        block,
        [inside_first, inside_second, partly_outside],
    )

    assert aggregated == "Inside region1 Inside region2"

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.14.1-dev1" # pragma: no cover
1+
__version__ = "0.14.1" # pragma: no cover

unstructured/partition/pdf.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -573,6 +573,7 @@ def _partition_pdf_or_image_local(
573573
merged_document_layout = merge_inferred_with_extracted_layout(
574574
inferred_document_layout=inferred_document_layout,
575575
extracted_layout=extracted_layout,
576+
hi_res_model_name=hi_res_model_name,
576577
)
577578

578579
final_document_layout = process_file_with_ocr(
@@ -611,6 +612,7 @@ def _partition_pdf_or_image_local(
611612
merged_document_layout = merge_inferred_with_extracted_layout(
612613
inferred_document_layout=inferred_document_layout,
613614
extracted_layout=extracted_layout,
615+
hi_res_model_name=hi_res_model_name,
614616
)
615617

616618
if hasattr(file, "seek"):

unstructured/partition/pdf_image/pdf_image_utils.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import os
55
import re
66
import tempfile
7+
import unicodedata
78
from copy import deepcopy
89
from io import BytesIO
910
from pathlib import Path, PurePath
@@ -420,3 +421,13 @@ def get_the_last_modification_date_pdf_or_img(
420421
get_last_modified_date_from_file(file) if date_from_file_object else None
421422
)
422423
return last_modification_date
424+
425+
426+
def remove_control_characters(text: str) -> str:
    """Return `text` with newlines turned into spaces and control characters removed.

    Each newline ("\\n") is replaced by a single space so the words it separated stay
    apart. Every remaining character whose Unicode general category starts with "C" is
    then dropped. That covers more than the ASCII control codes (tab, carriage return,
    form feed, backspace, ...): format characters such as the zero-width space (U+200B),
    surrogates, private-use and unassigned code points are removed as well. No space is
    inserted for the dropped characters, so "a\\tb" becomes "ab".

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, with all other characters kept in their original order.
    """
    # Newlines are the only control characters that get a replacement, so convert them
    # before filtering; otherwise they would be dropped along with the rest.
    spaced = text.replace("\n", " ")
    return "".join(ch for ch in spaced if not unicodedata.category(ch).startswith("C"))

unstructured/partition/pdf_image/pdfminer_processing.py

Lines changed: 42 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
from pdfminer.utils import open_filename
44

55
from unstructured.documents.elements import ElementType
6+
from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters
67
from unstructured.partition.pdf_image.pdfminer_utils import (
78
get_images_from_pdf_element,
89
open_pdfminer_pages_generator,
910
rect_to_bbox,
1011
)
1112
from unstructured.partition.utils.config import env_config
12-
from unstructured.partition.utils.constants import Source
13+
from unstructured.partition.utils.constants import SORT_MODE_BASIC, Source
1314
from unstructured.partition.utils.sorting import sort_text_regions
1415
from unstructured.utils import requires_dependencies
1516

@@ -43,7 +44,6 @@ def process_data_with_pdfminer(
4344
EmbeddedTextRegion,
4445
ImageTextRegion,
4546
)
46-
from unstructured_inference.inference.ordering import order_layout
4747

4848
layouts = []
4949
# Coefficient to rescale bounding box to be compatible with images
@@ -80,7 +80,7 @@ def process_data_with_pdfminer(
8080

8181
# NOTE(christine): always do the basic sort first for deterministic order across
8282
# python versions.
83-
layout = order_layout(layout)
83+
layout = sort_text_regions(layout, SORT_MODE_BASIC)
8484

8585
# apply the current default sorting to the layout elements extracted by pdfminer
8686
layout = sort_text_regions(layout)
@@ -94,6 +94,7 @@ def process_data_with_pdfminer(
9494
def merge_inferred_with_extracted_layout(
9595
inferred_document_layout: "DocumentLayout",
9696
extracted_layout: List[List["TextRegion"]],
97+
hi_res_model_name: str,
9798
) -> "DocumentLayout":
9899
"""Merge an inferred layout with an extracted layout"""
99100

@@ -102,6 +103,10 @@ def merge_inferred_with_extracted_layout(
102103
)
103104
from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
104105

106+
# If the model is a chipper model, we don't want to order the
107+
# elements, as they are already ordered
108+
order_elements = not hi_res_model_name.startswith("chipper")
109+
105110
inferred_pages = inferred_document_layout.pages
106111
for i, (inferred_page, extracted_page_layout) in enumerate(
107112
zip(inferred_pages, extracted_layout)
@@ -128,31 +133,40 @@ def merge_inferred_with_extracted_layout(
128133
**threshold_kwargs,
129134
)
130135

131-
elements = inferred_page.get_elements_from_layout(
132-
layout=cast(List["TextRegion"], merged_layout),
133-
pdf_objects=extracted_page_layout,
134-
)
136+
if order_elements:
137+
merged_layout = sort_text_regions(
138+
cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC
139+
)
140+
141+
elements = []
142+
for layout_el in merged_layout:
143+
if layout_el.text is None:
144+
text = aggregate_embedded_text_by_block(
145+
text_region=cast("TextRegion", layout_el),
146+
pdf_objects=extracted_page_layout,
147+
)
148+
else:
149+
text = layout_el.text
150+
layout_el.text = remove_control_characters(text)
151+
elements.append(layout_el)
135152

136153
inferred_page.elements[:] = elements
137154

138155
return inferred_document_layout
139156

140157

141-
@requires_dependencies("unstructured_inference")
142158
def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout":
143159
"""Clean pdfminer elements from inside tables.
144160
145161
This function removes elements sourced from PDFMiner that are subregions within table elements.
146162
"""
147163

148-
from unstructured_inference.config import inference_config
149-
150164
for page in document.pages:
151165
tables = [e for e in page.elements if e.type == ElementType.TABLE]
152166
for i, element in enumerate(page.elements):
153167
if element.source != Source.PDFMINER:
154168
continue
155-
subregion_threshold = inference_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
169+
subregion_threshold = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
156170
element_inside_table = [
157171
element.bbox.is_almost_subregion_of(t.bbox, subregion_threshold) for t in tables
158172
]
@@ -189,3 +203,20 @@ def clean_pdfminer_duplicate_image_elements(document: "DocumentLayout") -> "Docu
189203
page.elements = [e for e in page.elements if e]
190204

191205
return document
206+
207+
208+
def aggregate_embedded_text_by_block(
    text_region: "TextRegion",
    pdf_objects: list["TextRegion"],
) -> str:
    """Join the text of the embedded objects that lie (almost) entirely inside a block.

    An object from `pdf_objects` is kept when its bounding box is an almost-subregion of
    `text_region`'s bounding box, judged with the threshold read from
    `env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD`. Kept objects whose text is
    empty or missing contribute nothing.

    Returns the texts of the kept objects, in their original order, separated by single
    spaces; an empty string when nothing qualifies.
    """
    threshold = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD

    enclosed_texts = []
    for candidate in pdf_objects:
        # Skip objects that are not (almost) fully contained in the block.
        if not candidate.bbox.is_almost_subregion_of(text_region.bbox, threshold):
            continue
        if candidate.text:
            enclosed_texts.append(candidate.text)

    return " ".join(enclosed_texts)

unstructured/partition/utils/config.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,16 @@ def EMBEDDED_IMAGE_SAME_REGION_THRESHOLD(self) -> float:
131131
"""threshold to consider the bounding boxes of two embedded images as the same region"""
132132
return self._get_float("EMBEDDED_IMAGE_SAME_REGION_THRESHOLD", 0.6)
133133

134+
@property
def EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD(self) -> float:
    """Threshold for deciding that an embedded region is a sub-region of a given block
    when aggregating the text from embedded elements that lie within that block.

    A candidate region counts as a sub-region of an enclosing region when the area of
    their intersection divided by the candidate region's own area is larger than this
    threshold.

    The value is looked up through `_get_float` under the name
    `EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD`, with 0.99 passed as the default.
    """
    return self._get_float("EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD", 0.99)
143+
134144
@property
135145
def PDF_ANNOTATION_THRESHOLD(self) -> float:
136146
"""The threshold value (between 0.0 and 1.0) that determines the minimum overlap required

unstructured/partition/utils/sorting.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,11 @@ def _bboxes_ok(strict_points: bool):
261261
xy_cut_primary_direction=xy_cut_primary_direction,
262262
)
263263
sorted_elements = [elements[i] for i in res]
264+
elif sort_mode == SORT_MODE_BASIC:
265+
sorted_elements = sorted(
266+
elements,
267+
key=lambda el: (el.bbox.y1, el.bbox.x1, el.bbox.y2, el.bbox.x2),
268+
)
264269
else:
265270
sorted_elements = elements
266271

0 commit comments

Comments
 (0)