feat: partition_pdf() set inferred elements text (#3061)

christinestraub · web-flow · commit b0d8a779da65 · 2024-05-21T19:43:38.000Z
This PR adds the ability to fill inferred elements' text from embedded text (`pdfminer`) without depending on the `unstructured-inference` library. This PR is the second part of moving embedded-text-related code from `unstructured-inference` to `unstructured` and works together with Unstructured-IO/unstructured-inference#349.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,15 +1,19 @@
-## 0.14.1-dev1
+## 0.14.1
 
-* **Add support for Python 3.12**. `unstructured` now works with Python 3.12!
+### Enhancements
+
+* **Refactor code related to embedded text extraction**. The embedded text extraction code is moved from `unstructured-inference` to `unstructured`.
 
 ### Features
+
 * **Large improvements to the ingest process:**
   * Support for multiprocessing and async, with limits for both.
   * Streamlined to process when mapping CLI invocations to the underlying code
   * More granular steps introduced to give better control over process (i.e. dedicated step to uncompress files already in the local filesystem, new optional staging step before upload)
   * Use the python client when calling the unstructured api for partitioning or chunking
   * Saving the final content is now a dedicated destination connector (local) set as the default if none are provided. Avoids adding new files locally if uploading elsewhere.
   * Leverage last modified date when deciding if new files should be downloaded and reprocessed.
+* **Add support for Python 3.12**. `unstructured` now works with Python 3.12!
 
 ### Fixes
 
diff --git a/test_unstructured/partition/pdf_image/test_pdf_image_utils.py b/test_unstructured/partition/pdf_image/test_pdf_image_utils.py
@@ -343,3 +343,11 @@ def test_annotate_layout_elements_file_not_found_error():
             pdf_image_dpi=200,
             is_image=True,
         )
+
+
+@pytest.mark.parametrize(
+    ("text", "expected"),
+    [("c\to\x0cn\ftrol\ncharacter\rs\b", "control characters"), ("\"'\\", "\"'\\")],
+)
+def test_remove_control_characters(text, expected):
+    assert pdf_image_utils.remove_control_characters(text) == expected
diff --git a/test_unstructured/partition/pdf_image/test_pdfminer_processing.py b/test_unstructured/partition/pdf_image/test_pdfminer_processing.py
@@ -1,10 +1,11 @@
 import pytest
 from PIL import Image
 from unstructured_inference.constants import Source as InferenceSource
-from unstructured_inference.inference.elements import Rectangle
+from unstructured_inference.inference.elements import Rectangle, TextRegion
 from unstructured_inference.inference.layout import DocumentLayout, LayoutElement, PageLayout
 
 from unstructured.partition.pdf_image.pdfminer_processing import (
+    aggregate_embedded_text_by_block,
     clean_pdfminer_duplicate_image_elements,
     clean_pdfminer_inner_elements,
 )
@@ -139,3 +140,16 @@ def test_clean_pdfminer_duplicate_image_elements(elements, expected_document_len
     cleaned_doc = clean_pdfminer_duplicate_image_elements(document)
 
     assert len(cleaned_doc.pages[0].elements) == expected_document_length
+
+
+def test_aggregate_by_block():
+    expected = "Inside region1 Inside region2"
+    embedded_regions = [
+        TextRegion.from_coords(0, 0, 20, 20, "Inside region1"),
+        TextRegion.from_coords(50, 50, 150, 150, "Inside region2"),
+        TextRegion.from_coords(250, 250, 350, 350, "Outside region"),
+    ]
+    target_region = TextRegion.from_coords(0, 0, 300, 300)
+
+    text = aggregate_embedded_text_by_block(target_region, embedded_regions)
+    assert text == expected
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.14.1-dev1"  # pragma: no cover
+__version__ = "0.14.1"  # pragma: no cover
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -573,6 +573,7 @@ def _partition_pdf_or_image_local(
             merged_document_layout = merge_inferred_with_extracted_layout(
                 inferred_document_layout=inferred_document_layout,
                 extracted_layout=extracted_layout,
+                hi_res_model_name=hi_res_model_name,
             )
 
             final_document_layout = process_file_with_ocr(
@@ -611,6 +612,7 @@ def _partition_pdf_or_image_local(
             merged_document_layout = merge_inferred_with_extracted_layout(
                 inferred_document_layout=inferred_document_layout,
                 extracted_layout=extracted_layout,
+                hi_res_model_name=hi_res_model_name,
             )
 
             if hasattr(file, "seek"):
diff --git a/unstructured/partition/pdf_image/pdf_image_utils.py b/unstructured/partition/pdf_image/pdf_image_utils.py
@@ -4,6 +4,7 @@
 import os
 import re
 import tempfile
+import unicodedata
 from copy import deepcopy
 from io import BytesIO
 from pathlib import Path, PurePath
@@ -420,3 +421,13 @@ def get_the_last_modification_date_pdf_or_img(
             get_last_modified_date_from_file(file) if date_from_file_object else None
         )
     return last_modification_date
+
+
+def remove_control_characters(text: str) -> str:
+    """Removes control characters from text."""
+
+    # Replace newline character with a space
+    text = text.replace("\n", " ")
+    # Remove other control characters
+    out_text = "".join(c for c in text if unicodedata.category(c)[0] != "C")
+    return out_text
diff --git a/unstructured/partition/pdf_image/pdfminer_processing.py b/unstructured/partition/pdf_image/pdfminer_processing.py
@@ -3,13 +3,14 @@
 from pdfminer.utils import open_filename
 
 from unstructured.documents.elements import ElementType
+from unstructured.partition.pdf_image.pdf_image_utils import remove_control_characters
 from unstructured.partition.pdf_image.pdfminer_utils import (
     get_images_from_pdf_element,
     open_pdfminer_pages_generator,
     rect_to_bbox,
 )
 from unstructured.partition.utils.config import env_config
-from unstructured.partition.utils.constants import Source
+from unstructured.partition.utils.constants import SORT_MODE_BASIC, Source
 from unstructured.partition.utils.sorting import sort_text_regions
 from unstructured.utils import requires_dependencies
 
@@ -43,7 +44,6 @@ def process_data_with_pdfminer(
         EmbeddedTextRegion,
         ImageTextRegion,
     )
-    from unstructured_inference.inference.ordering import order_layout
 
     layouts = []
     # Coefficient to rescale bounding box to be compatible with images
@@ -80,7 +80,7 @@ def process_data_with_pdfminer(
 
         # NOTE(christine): always do the basic sort first for deterministic order across
         # python versions.
-        layout = order_layout(layout)
+        layout = sort_text_regions(layout, SORT_MODE_BASIC)
 
         # apply the current default sorting to the layout elements extracted by pdfminer
         layout = sort_text_regions(layout)
@@ -94,6 +94,7 @@ def process_data_with_pdfminer(
 def merge_inferred_with_extracted_layout(
     inferred_document_layout: "DocumentLayout",
     extracted_layout: List[List["TextRegion"]],
+    hi_res_model_name: str,
 ) -> "DocumentLayout":
     """Merge an inferred layout with an extracted layout"""
 
@@ -102,6 +103,10 @@ def merge_inferred_with_extracted_layout(
     )
     from unstructured_inference.models.detectron2onnx import UnstructuredDetectronONNXModel
 
+    # If the model is a chipper model, we don't want to order the
+    # elements, as they are already ordered
+    order_elements = not hi_res_model_name.startswith("chipper")
+
     inferred_pages = inferred_document_layout.pages
     for i, (inferred_page, extracted_page_layout) in enumerate(
         zip(inferred_pages, extracted_layout)
@@ -128,31 +133,40 @@ def merge_inferred_with_extracted_layout(
             **threshold_kwargs,
         )
 
-        elements = inferred_page.get_elements_from_layout(
-            layout=cast(List["TextRegion"], merged_layout),
-            pdf_objects=extracted_page_layout,
-        )
+        if order_elements:
+            merged_layout = sort_text_regions(
+                cast(List["TextRegion"], merged_layout), SORT_MODE_BASIC
+            )
+
+        elements = []
+        for layout_el in merged_layout:
+            if layout_el.text is None:
+                text = aggregate_embedded_text_by_block(
+                    text_region=cast("TextRegion", layout_el),
+                    pdf_objects=extracted_page_layout,
+                )
+            else:
+                text = layout_el.text
+            layout_el.text = remove_control_characters(text)
+            elements.append(layout_el)
 
         inferred_page.elements[:] = elements
 
     return inferred_document_layout
 
 
-@requires_dependencies("unstructured_inference")
 def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout":
     """Clean pdfminer elements from inside tables.
 
     This function removes elements sourced from PDFMiner that are subregions within table elements.
     """
 
-    from unstructured_inference.config import inference_config
-
     for page in document.pages:
         tables = [e for e in page.elements if e.type == ElementType.TABLE]
         for i, element in enumerate(page.elements):
             if element.source != Source.PDFMINER:
                 continue
-            subregion_threshold = inference_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
+            subregion_threshold = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
             element_inside_table = [
                 element.bbox.is_almost_subregion_of(t.bbox, subregion_threshold) for t in tables
             ]
@@ -189,3 +203,20 @@ def clean_pdfminer_duplicate_image_elements(document: "DocumentLayout") -> "Docu
         page.elements = [e for e in page.elements if e]
 
     return document
+
+
+def aggregate_embedded_text_by_block(
+    text_region: "TextRegion",
+    pdf_objects: list["TextRegion"],
+) -> str:
+    """Extracts the text aggregated from the elements of the given layout that lie within the given
+    block."""
+
+    subregion_threshold = env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD
+    filtered_blocks = [
+        obj
+        for obj in pdf_objects
+        if obj.bbox.is_almost_subregion_of(text_region.bbox, subregion_threshold)
+    ]
+    text = " ".join([x.text for x in filtered_blocks if x.text])
+    return text
diff --git a/unstructured/partition/utils/config.py b/unstructured/partition/utils/config.py
@@ -131,6 +131,16 @@ def EMBEDDED_IMAGE_SAME_REGION_THRESHOLD(self) -> float:
         """threshold to consider the bounding boxes of two embedded images as the same region"""
         return self._get_float("EMBEDDED_IMAGE_SAME_REGION_THRESHOLD", 0.6)
 
+    @property
+    def EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD(self) -> float:
+        """threshold to determine if an embedded region is a sub-region of a given block
+        when aggregating the text from embedded elements that lie within the given block
+
+        When the intersection region area divided by self area is larger than this threshold self is
+        considered a subregion of the other
+        """
+        return self._get_float("EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD", 0.99)
+
     @property
     def PDF_ANNOTATION_THRESHOLD(self) -> float:
         """The threshold value (between 0.0 and 1.0) that determines the minimum overlap required
diff --git a/unstructured/partition/utils/sorting.py b/unstructured/partition/utils/sorting.py
@@ -261,6 +261,11 @@ def _bboxes_ok(strict_points: bool):
             xy_cut_primary_direction=xy_cut_primary_direction,
         )
         sorted_elements = [elements[i] for i in res]
+    elif sort_mode == SORT_MODE_BASIC:
+        sorted_elements = sorted(
+            elements,
+            key=lambda el: (el.bbox.y1, el.bbox.x1, el.bbox.y2, el.bbox.x2),
+        )
     else:
         sorted_elements = elements
 

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.14.1-dev1" # pragma: no cover`
	`1`	`+__version__ = "0.14.1" # pragma: no cover`
Original file line number	Diff line number	Diff line change
`@@ -573,6 +573,7 @@ def _partition_pdf_or_image_local(`
`573`	`573`	`merged_document_layout = merge_inferred_with_extracted_layout(`
`574`	`574`	`inferred_document_layout=inferred_document_layout,`
`575`	`575`	`extracted_layout=extracted_layout,`
	`576`	`+ hi_res_model_name=hi_res_model_name,`
`576`	`577`	`)`
`577`	`578`
`578`	`579`	`final_document_layout = process_file_with_ocr(`
`@@ -611,6 +612,7 @@ def _partition_pdf_or_image_local(`
`611`	`612`	`merged_document_layout = merge_inferred_with_extracted_layout(`
`612`	`613`	`inferred_document_layout=inferred_document_layout,`
`613`	`614`	`extracted_layout=extracted_layout,`
	`615`	`+ hi_res_model_name=hi_res_model_name,`
`614`	`616`	`)`
`615`	`617`
`616`	`618`	`if hasattr(file, "seek"):`