rfctr: extract OCRAgent.get_agent() out of PDF subtree (#2965)

scanny · web-flow · commit cb55245f7015 · 2024-05-03T19:39:22.000Z
**Summary**
File types other than PDF also need to run OCR on extracted images. Extract
`OCRAgent.get_agent()` out of the PDF-specific subtree so that any file-type
partitioner can use it without risking a dependency on PDF-only extras.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,8 +1,9 @@
-## 0.13.7-dev5
+## 0.13.7-dev6
 
 ### Enhancements
 
 * **Remove `page_number` metadata fields** for HTML partition until we have a better strategy to decide page counting.
+* **Extract OCRAgent.get_agent().** Generalize access to the configured OCRAgent instance beyond its use for PDFs.
 
 ### Features
 
diff --git a/test_unstructured/partition/utils/ocr_models/test_ocr_interface.py b/test_unstructured/partition/utils/ocr_models/test_ocr_interface.py
@@ -0,0 +1,94 @@
+# pyright: reportPrivateUsage=false
+
+"""Unit-test suite for the `unstructured.partition.utils.ocr_models.ocr_interface` module."""
+
+from __future__ import annotations
+
+import pytest
+
+from test_unstructured.unit_utils import (
+    FixtureRequest,
+    LogCaptureFixture,
+    Mock,
+    instance_mock,
+    method_mock,
+    property_mock,
+)
+from unstructured.partition.utils.config import ENVConfig
+from unstructured.partition.utils.constants import (
+    OCR_AGENT_PADDLE,
+    OCR_AGENT_PADDLE_OLD,
+    OCR_AGENT_TESSERACT,
+    OCR_AGENT_TESSERACT_OLD,
+)
+from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent
+
+
+class DescribeOCRAgent:
+    """Unit-test suite for `unstructured.partition.utils...ocr_interface.OCRAgent` class."""
+
+    def it_provides_access_to_the_configured_OCR_agent(
+        self, _get_ocr_agent_cls_qname_: Mock, get_instance_: Mock, ocr_agent_: Mock
+    ):
+        _get_ocr_agent_cls_qname_.return_value = OCR_AGENT_TESSERACT
+        get_instance_.return_value = ocr_agent_
+
+        ocr_agent = OCRAgent.get_agent()
+
+        _get_ocr_agent_cls_qname_.assert_called_once_with()
+        get_instance_.assert_called_once_with(OCR_AGENT_TESSERACT)
+        assert ocr_agent is ocr_agent_
+
+    @pytest.mark.parametrize("ExceptionCls", [ImportError, AttributeError])
+    def but_it_raises_whan_no_such_ocr_agent_class_is_found(
+        self, ExceptionCls: type, _get_ocr_agent_cls_qname_: Mock, get_instance_: Mock
+    ):
+        _get_ocr_agent_cls_qname_.return_value = "Invalid.Ocr.Agent.Qname"
+        get_instance_.side_effect = ExceptionCls
+
+        with pytest.raises(ValueError, match="OCR_AGENT must be set to an existing OCR agent "):
+            OCRAgent.get_agent()
+
+        _get_ocr_agent_cls_qname_.assert_called_once_with()
+        get_instance_.assert_called_once_with("Invalid.Ocr.Agent.Qname")
+
+    @pytest.mark.parametrize(
+        ("OCR_AGENT", "expected_value"),
+        [
+            (OCR_AGENT_PADDLE, OCR_AGENT_PADDLE),
+            (OCR_AGENT_PADDLE_OLD, OCR_AGENT_PADDLE),
+            (OCR_AGENT_TESSERACT, OCR_AGENT_TESSERACT),
+            (OCR_AGENT_TESSERACT_OLD, OCR_AGENT_TESSERACT),
+        ],
+    )
+    def it_computes_the_OCR_agent_qualified_module_name(
+        self, OCR_AGENT: str, expected_value: str, OCR_AGENT_prop_: Mock
+    ):
+        OCR_AGENT_prop_.return_value = OCR_AGENT
+        assert OCRAgent._get_ocr_agent_cls_qname() == expected_value
+
+    @pytest.mark.parametrize("OCR_AGENT", [OCR_AGENT_PADDLE_OLD, OCR_AGENT_TESSERACT_OLD])
+    def and_it_logs_a_warning_when_the_OCR_AGENT_module_name_is_obsolete(
+        self, caplog: LogCaptureFixture, OCR_AGENT: str, OCR_AGENT_prop_: Mock
+    ):
+        OCR_AGENT_prop_.return_value = OCR_AGENT
+        OCRAgent._get_ocr_agent_cls_qname()
+        assert f"OCR agent name {OCR_AGENT} is outdated " in caplog.text
+
+    # -- fixtures --------------------------------------------------------------------------------
+
+    @pytest.fixture()
+    def get_instance_(self, request: FixtureRequest):
+        return method_mock(request, OCRAgent, "get_instance")
+
+    @pytest.fixture()
+    def _get_ocr_agent_cls_qname_(self, request: FixtureRequest):
+        return method_mock(request, OCRAgent, "_get_ocr_agent_cls_qname")
+
+    @pytest.fixture()
+    def ocr_agent_(self, request: FixtureRequest):
+        return instance_mock(request, OCRAgent)
+
+    @pytest.fixture()
+    def OCR_AGENT_prop_(self, request: FixtureRequest):
+        return property_mock(request, ENVConfig, "OCR_AGENT")
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.13.7-dev5"  # pragma: no cover
+__version__ = "0.13.7-dev6"  # pragma: no cover
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -937,21 +937,19 @@ def _partition_pdf_or_image_with_ocr(
 
 
 def _partition_pdf_or_image_with_ocr_from_image(
-    image: PILImage,
+    image: PILImage.Image,
     languages: Optional[list[str]] = None,
     page_number: int = 1,
     include_page_breaks: bool = False,
     metadata_last_modified: Optional[str] = None,
     sort_mode: str = SORT_MODE_XY_CUT,
-    **kwargs,
+    **kwargs: Any,
 ) -> list[Element]:
     """Extract `unstructured` elements from an image using OCR and perform partitioning."""
 
-    from unstructured.partition.pdf_image.ocr import (
-        get_ocr_agent,
-    )
+    from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent
 
-    ocr_agent = get_ocr_agent()
+    ocr_agent = OCRAgent.get_agent()
     ocr_languages = prepare_languages_for_tesseract(languages)
 
     # NOTE(christine): `unstructured_pytesseract.image_to_string()` returns sorted text
diff --git a/unstructured/partition/pdf_image/ocr.py b/unstructured/partition/pdf_image/ocr.py
@@ -12,20 +12,11 @@
 from PIL import ImageSequence
 
 from unstructured.documents.elements import ElementType
-from unstructured.logger import logger
 from unstructured.metrics.table.table_formats import SimpleTableCell
 from unstructured.partition.pdf_image.pdf_image_utils import pad_element_bboxes, valid_text
 from unstructured.partition.utils.config import env_config
-from unstructured.partition.utils.constants import (
-    OCR_AGENT_PADDLE,
-    OCR_AGENT_PADDLE_OLD,
-    OCR_AGENT_TESSERACT,
-    OCR_AGENT_TESSERACT_OLD,
-    OCRMode,
-)
-from unstructured.partition.utils.ocr_models.ocr_interface import (
-    OCRAgent,
-)
+from unstructured.partition.utils.constants import OCR_AGENT_TESSERACT, OCRMode
+from unstructured.partition.utils.ocr_models.ocr_interface import OCRAgent
 from unstructured.utils import requires_dependencies
 
 if TYPE_CHECKING:
@@ -35,12 +26,6 @@
     from unstructured_inference.models.tables import UnstructuredTableTransformerModel
 
 
-# Force tesseract to be single threaded,
-# otherwise we see major performance problems
-if "OMP_THREAD_LIMIT" not in os.environ:
-    os.environ["OMP_THREAD_LIMIT"] = "1"
-
-
 def process_data_with_ocr(
     data: bytes | IO[bytes],
     out_layout: "DocumentLayout",
@@ -200,7 +185,7 @@ def supplement_page_layout_with_ocr(
     with no text and add text from OCR to each element.
     """
 
-    ocr_agent = get_ocr_agent()
+    ocr_agent = OCRAgent.get_agent()
     if ocr_mode == OCRMode.FULL_PAGE.value:
         ocr_layout = ocr_agent.get_layout_from_image(
             image,
@@ -453,34 +438,3 @@ def supplement_layout_with_ocr_elements(
         final_layout = layout
 
     return final_layout
-
-
-def get_ocr_agent() -> OCRAgent:
-    ocr_agent_module = env_config.OCR_AGENT
-    message = (
-        "OCR agent name %s is outdated and will be deprecated in a future release; please use %s "
-        "instead"
-    )
-    # deal with compatibility with origin way to set OCR
-    if ocr_agent_module.lower() == OCR_AGENT_TESSERACT_OLD:
-        logger.warning(
-            message,
-            ocr_agent_module,
-            OCR_AGENT_TESSERACT,
-        )
-        ocr_agent_module = OCR_AGENT_TESSERACT
-    elif ocr_agent_module.lower() == OCR_AGENT_PADDLE_OLD:
-        logger.warning(
-            message,
-            ocr_agent_module,
-            OCR_AGENT_PADDLE,
-        )
-        ocr_agent_module = OCR_AGENT_PADDLE
-    try:
-        ocr_agent = OCRAgent.get_instance(ocr_agent_module)
-    except (ImportError, AttributeError):
-        raise ValueError(
-            "Environment variable OCR_AGENT",
-            f" must be set to an existing ocr agent module, not {ocr_agent_module}.",
-        )
-    return ocr_agent
diff --git a/unstructured/partition/utils/ocr_models/ocr_interface.py b/unstructured/partition/utils/ocr_models/ocr_interface.py
@@ -5,38 +5,39 @@
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING
 
-from unstructured.partition.utils.constants import OCR_AGENT_MODULES_WHITELIST
+from unstructured.logger import logger
+from unstructured.partition.utils.config import env_config
+from unstructured.partition.utils.constants import (
+    OCR_AGENT_MODULES_WHITELIST,
+    OCR_AGENT_PADDLE,
+    OCR_AGENT_PADDLE_OLD,
+    OCR_AGENT_TESSERACT,
+    OCR_AGENT_TESSERACT_OLD,
+)
 
 if TYPE_CHECKING:
     from PIL import Image as PILImage
     from unstructured_inference.inference.elements import TextRegion
-    from unstructured_inference.inference.layoutelement import (
-        LayoutElement,
-    )
+    from unstructured_inference.inference.layoutelement import LayoutElement
 
 
 class OCRAgent(ABC):
     """Defines the interface for an Optical Character Recognition (OCR) service."""
 
-    @abstractmethod
-    def is_text_sorted(self) -> bool:
-        pass
-
-    @abstractmethod
-    def get_text_from_image(self, image: PILImage.Image, ocr_languages: str = "eng") -> str:
-        pass
-
-    @abstractmethod
-    def get_layout_from_image(
-        self, image: PILImage.Image, ocr_languages: str = "eng"
-    ) -> list[TextRegion]:
-        pass
+    @classmethod
+    def get_agent(cls) -> OCRAgent:
+        """Get the configured OCRAgent instance.
 
-    @abstractmethod
-    def get_layout_elements_from_image(
-        self, image: PILImage.Image, ocr_languages: str = "eng"
-    ) -> list[LayoutElement]:
-        pass
+        The OCR package used by the agent is determined by the `OCR_AGENT` environment variable.
+        """
+        ocr_agent_cls_qname = cls._get_ocr_agent_cls_qname()
+        try:
+            return cls.get_instance(ocr_agent_cls_qname)
+        except (ImportError, AttributeError):
+            raise ValueError(
+                f"Environment variable OCR_AGENT must be set to an existing OCR agent module,"
+                f" not {ocr_agent_cls_qname}."
+            )
 
     @staticmethod
     @functools.lru_cache(maxsize=None)
@@ -51,3 +52,48 @@ def get_instance(ocr_agent_module: str) -> "OCRAgent":
                 f"Environment variable OCR_AGENT module name {module_name}, must be set to a"
                 f" whitelisted module part of {OCR_AGENT_MODULES_WHITELIST}.",
             )
+
+    @abstractmethod
+    def get_layout_elements_from_image(
+        self, image: PILImage.Image, ocr_languages: str = "eng"
+    ) -> list[LayoutElement]:
+        pass
+
+    @abstractmethod
+    def get_layout_from_image(
+        self, image: PILImage.Image, ocr_languages: str = "eng"
+    ) -> list[TextRegion]:
+        pass
+
+    @abstractmethod
+    def get_text_from_image(self, image: PILImage.Image, ocr_languages: str = "eng") -> str:
+        pass
+
+    @abstractmethod
+    def is_text_sorted(self) -> bool:
+        pass
+
+    @staticmethod
+    def _get_ocr_agent_cls_qname() -> str:
+        """Get the fully-qualified class name of the configured OCR agent.
+
+        The qualified name (qname) looks like:
+            "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
+
+        The qname provides the full module address and class name of the OCR agent.
+        """
+        ocr_agent_qname = env_config.OCR_AGENT
+
+        # -- map legacy method of setting OCR agent by key-name to full qname --
+        qnames_by_keyname = {
+            OCR_AGENT_TESSERACT_OLD: OCR_AGENT_TESSERACT,
+            OCR_AGENT_PADDLE_OLD: OCR_AGENT_PADDLE,
+        }
+        if qname_mapped_from_keyname := qnames_by_keyname.get(ocr_agent_qname.lower()):
+            logger.warning(
+                f"OCR agent name {ocr_agent_qname} is outdated and will be removed in a future"
+                f" release; please use {qname_mapped_from_keyname} instead"
+            )
+            return qname_mapped_from_keyname
+
+        return ocr_agent_qname
diff --git a/unstructured/partition/utils/ocr_models/tesseract_ocr.py b/unstructured/partition/utils/ocr_models/tesseract_ocr.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 from typing import TYPE_CHECKING, List
 
 import cv2
@@ -22,9 +23,11 @@
 
 if TYPE_CHECKING:
     from unstructured_inference.inference.elements import TextRegion
-    from unstructured_inference.inference.layoutelement import (
-        LayoutElement,
-    )
+    from unstructured_inference.inference.layoutelement import LayoutElement
+
+# -- force tesseract to be single threaded, otherwise we see major performance problems --
+if "OMP_THREAD_LIMIT" not in os.environ:
+    os.environ["OMP_THREAD_LIMIT"] = "1"
 
 
 class OCRAgentTesseract(OCRAgent):

Original file line number	Diff line number	Diff line change
--- | --- | ---
 | | `@@ -1 +1 @@`
`1` | | `-__version__ = "0.13.7-dev5" # pragma: no cover`
 | `1` | `+__version__ = "0.13.7-dev6" # pragma: no cover`