Skip to content

Commit 887e6c9

Browse files
refactor: use env_config instead of SUBREGION_THRESHOLD_FOR_OCR constant (#2697)
The purpose of this PR is to introduce a new `env_config` setting for the OCR subregion threshold.

### Testing

CI should pass.
1 parent c8cf8f3 commit 887e6c9

File tree

5 files changed

+18
-9
lines changed

5 files changed

+18
-9
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
### Fixes
1616

17-
* **Fix `clean_pdfminer_inner_elements()` to remove only pdfminer (embedded) elements merged with inferred elements** Previously, some embedded elements were removed even if they were not merged with inferred elements. Now, only embedded elements that are already merged with inferred elements are removed.
17+
* **Fix `clean_pdfminer_inner_elements()` to remove only pdfminer (embedded) elements merged with inferred elements**. Previously, some embedded elements were removed even if they were not merged with inferred elements. Now, only embedded elements that are already merged with inferred elements are removed.
1818
* **Clarify IAM Role Requirement for GCS Platform Connectors**. The GCS Source Connector requires Storage Object Viewer and GCS Destination Connector requires Storage Object Creator IAM roles.
1919
* **Change table extraction defaults**. Change table extraction defaults in favor of using `skip_infer_table_types` parameter and reflect these changes in documentation.
2020
* **Fix OneDrive dates with inconsistent formatting**. Adds logic to conditionally support dates returned by office365 that may vary in date formatting or may be a datetime rather than a string. See previous fix for SharePoint.

test_unstructured/partition/pdf_image/test_ocr.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from unstructured.documents.elements import ElementType
1616
from unstructured.partition.pdf_image import ocr
1717
from unstructured.partition.pdf_image.ocr import pad_element_bboxes
18+
from unstructured.partition.utils.config import env_config
1819
from unstructured.partition.utils.constants import (
1920
Source,
2021
)
@@ -267,7 +268,7 @@ def test_supplement_layout_with_ocr_elements(mock_layout, mock_ocr_regions):
267268
for ocr_element in ocr_elements:
268269
if ocr_element.bbox.is_almost_subregion_of(
269270
element.bbox,
270-
ocr.SUBREGION_THRESHOLD_FOR_OCR,
271+
env_config.OCR_LAYOUT_SUBREGION_THRESHOLD,
271272
):
272273
assert ocr_element not in final_layout
273274

unstructured/partition/pdf_image/ocr.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
OCR_AGENT_PADDLE_OLD,
1919
OCR_AGENT_TESSERACT,
2020
OCR_AGENT_TESSERACT_OLD,
21-
SUBREGION_THRESHOLD_FOR_OCR,
2221
OCRMode,
2322
)
2423
from unstructured.partition.utils.ocr_models.ocr_interface import (
@@ -349,7 +348,6 @@ def merge_out_layout_with_ocr_layout(
349348
out_region.text = aggregate_ocr_text_by_block(
350349
ocr_layout,
351350
out_region,
352-
SUBREGION_THRESHOLD_FOR_OCR,
353351
)
354352

355353
final_layout = (
@@ -364,7 +362,7 @@ def merge_out_layout_with_ocr_layout(
364362
def aggregate_ocr_text_by_block(
365363
ocr_layout: List["TextRegion"],
366364
region: "TextRegion",
367-
subregion_threshold: float,
365+
subregion_threshold: float = env_config.OCR_LAYOUT_SUBREGION_THRESHOLD,
368366
) -> Optional[str]:
369367
"""Extracts the text aggregated from the regions of the ocr layout that lie within the given
370368
block."""
@@ -374,7 +372,7 @@ def aggregate_ocr_text_by_block(
374372
for ocr_region in ocr_layout:
375373
ocr_region_is_subregion_of_given_region = ocr_region.bbox.is_almost_subregion_of(
376374
region.bbox,
377-
subregion_threshold=subregion_threshold,
375+
subregion_threshold,
378376
)
379377
if ocr_region_is_subregion_of_given_region and ocr_region.text:
380378
extracted_texts.append(ocr_region.text)
@@ -386,6 +384,7 @@ def aggregate_ocr_text_by_block(
386384
def supplement_layout_with_ocr_elements(
387385
layout: List["LayoutElement"],
388386
ocr_layout: List["TextRegion"],
387+
subregion_threshold: float = env_config.OCR_LAYOUT_SUBREGION_THRESHOLD,
389388
) -> List["LayoutElement"]:
390389
"""
391390
Supplement the existing layout with additional OCR-derived elements.
@@ -410,7 +409,7 @@ def supplement_layout_with_ocr_elements(
410409
is a subregion of an existing layout element.
411410
- It also relies on `build_layout_elements_from_ocr_regions()` to convert OCR regions to
412411
layout elements.
413-
- The `SUBREGION_THRESHOLD_FOR_OCR` constant is used to specify the subregion matching
412+
- The env_config `OCR_LAYOUT_SUBREGION_THRESHOLD` is used to specify the subregion matching
414413
threshold.
415414
"""
416415

@@ -423,7 +422,7 @@ def supplement_layout_with_ocr_elements(
423422
for el in layout:
424423
ocr_region_is_subregion_of_out_el = ocr_region.bbox.is_almost_subregion_of(
425424
el.bbox,
426-
SUBREGION_THRESHOLD_FOR_OCR,
425+
subregion_threshold,
427426
)
428427
if ocr_region_is_subregion_of_out_el:
429428
ocr_regions_to_remove.append(ocr_region)

unstructured/partition/utils/config.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,5 +94,15 @@ def EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD(self) -> int:
9494
"""
9595
return self._get_int("EXTRACT_IMAGE_BLOCK_CROP_VERTICAL_PAD", 0)
9696

97+
@property
98+
def OCR_LAYOUT_SUBREGION_THRESHOLD(self) -> float:
99+
"""Threshold to determine if an OCR region is a sub-region of a given block
100+
when aggregating the text from OCR'd elements that lie within the given block.
101+
102+
When the intersection region area divided by self area is larger than this threshold self is
103+
considered a subregion of the other
104+
"""
105+
return self._get_float("OCR_LAYOUT_SUBREGION_THRESHOLD", 0.5)
106+
97107

98108
env_config = ENVConfig()

unstructured/partition/utils/constants.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ class PartitionStrategy:
3636
"unstructured.partition.utils.ocr_models.paddle_ocr",
3737
).split(",")
3838

39-
SUBREGION_THRESHOLD_FOR_OCR = 0.5
4039
UNSTRUCTURED_INCLUDE_DEBUG_METADATA = os.getenv("UNSTRUCTURED_INCLUDE_DEBUG_METADATA", False)
4140

4241
# Note(yuming): Default language for paddle OCR

0 commit comments

Comments
 (0)