
Commit f986f13

feat: add ability to pass languages to OCR agent (#86)
* add language parameter to tesseract
* pass language into elements and layout
* enable loading multiple language agents
* update tests to include ocr languages
* test ocr load
* changelog and version
* version bump for release
1 parent f704f26 commit f986f13
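A minimal usage sketch (not part of the commit) of the new `ocr_languages` parameter at the top-level API. The file name, the `None` model choice, and the "eng+swe" language string are placeholder assumptions; Tesseract joins language codes with "+", and each non-English code requires its language pack to be installed.

from unstructured_inference.inference.layout import process_file_with_model

# Hypothetical example: "example.pdf" and the language string are placeholders.
layout = process_file_with_model(
    "example.pdf",
    model_name=None,          # fall back to the default layout model
    ocr_strategy="auto",      # OCR only where embedded text is unavailable
    ocr_languages="eng+swe",  # assumes the Swedish Tesseract language pack is installed
)

# Iterate the resulting document layout and print the extracted text.
for page in layout.pages:
    for element in page.elements:
        print(element.text)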

File tree

8 files changed: +81 additions, -24 deletions


CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -1,3 +1,8 @@
+## 0.4.1
+
+* Added the ability to pass `ocr_languages` to the OCR agent for users who need
+  non-English language packs.
+
 ## 0.4.0
 
 * Added logic to partition granular elements (words, characters) by proximity

test_unstructured_inference/inference/test_layout.py

Lines changed: 11 additions & 3 deletions
@@ -44,7 +44,7 @@ class MockOCRAgent:
        def detect(self, *args):
            return mock_text
 
-    monkeypatch.setattr(tesseract, "ocr_agent", MockOCRAgent)
+    monkeypatch.setattr(tesseract, "ocr_agents", {"eng": MockOCRAgent})
     monkeypatch.setattr(tesseract, "is_pytesseract_available", lambda *args: True)
 
     image = Image.fromarray(np.random.randint(12, 24, (40, 40)), mode="RGB")
@@ -96,7 +96,7 @@ def test_get_page_elements_with_ocr(monkeypatch):
     doc_layout = [text_block, image_block]
 
     monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
-    monkeypatch.setattr(elements, "ocr", lambda *args: "An Even Catchier Title")
+    monkeypatch.setattr(elements, "ocr", lambda *args, **kwargs: "An Even Catchier Title")
 
     image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB")
     page = layout.PageLayout(
@@ -187,11 +187,19 @@ def points(self):
 
 
 class MockPageLayout(layout.PageLayout):
-    def __init__(self, layout=None, model=None, ocr_strategy="auto", extract_tables=False):
+    def __init__(
+        self,
+        layout=None,
+        model=None,
+        ocr_strategy="auto",
+        ocr_languages="eng",
+        extract_tables=False,
+    ):
         self.image = None
         self.layout = layout
         self.model = model
         self.ocr_strategy = ocr_strategy
+        self.ocr_languages = ocr_languages
         self.extract_tables = extract_tables
 
     def ocr(self, text_block: MockEmbeddedTextRegion):

test_unstructured_inference/models/test_tesseract.py

Lines changed: 3 additions & 3 deletions
@@ -11,12 +11,12 @@ def __init__(self, languages):
 
 def test_load_agent(monkeypatch):
     monkeypatch.setattr(tesseract, "TesseractAgent", MockTesseractAgent)
-    monkeypatch.setattr(tesseract, "ocr_agent", None)
+    monkeypatch.setattr(tesseract, "ocr_agents", {})
 
     with patch.object(tesseract, "is_pytesseract_available", return_value=True):
-        tesseract.load_agent()
+        tesseract.load_agent(languages="eng+swe")
 
-    assert isinstance(tesseract.ocr_agent, MockTesseractAgent)
+    assert isinstance(tesseract.ocr_agents["eng+swe"], MockTesseractAgent)
 
 
 def test_load_agent_raises_when_not_available():

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-__version__ = "0.4.0"  # pragma: no cover
+__version__ = "0.4.1"  # pragma: no cover

unstructured_inference/inference/elements.py

Lines changed: 14 additions & 7 deletions
@@ -162,6 +162,7 @@ def extract_text(
         image: Optional[Image.Image] = None,
         extract_tables: bool = False,
         ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
     ) -> str:
         """Extracts text contained in region."""
         if self.text is not None:
@@ -172,7 +173,7 @@ def extract_text(
         elif image is not None:
             if ocr_strategy != "never":
                 # We don't have anything to go on but the image itself, so we use OCR
-                text = ocr(self, image)
+                text = ocr(self, image, languages=ocr_languages)
             else:
                 text = ""
         else:
@@ -190,6 +191,7 @@ def extract_text(
         image: Optional[Image.Image] = None,
         extract_tables: bool = False,
         ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
     ) -> str:
         """Extracts text contained in region."""
         if self.text is None:
@@ -205,24 +207,28 @@ def extract_text(
         image: Optional[Image.Image] = None,
         extract_tables: bool = False,
         ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
     ) -> str:
         """Extracts text contained in region."""
         if self.text is None:
             if ocr_strategy == "never" or image is None:
                 return ""
             else:
-                return ocr(self, image)
+                return ocr(self, image, languages=ocr_languages)
         else:
             return super().extract_text(objects, image, extract_tables, ocr_strategy)
 
 
-def ocr(text_block: TextRegion, image: Image.Image) -> str:
+def ocr(text_block: TextRegion, image: Image.Image, languages: str = "eng") -> str:
     """Runs a cropped text block image through an OCR agent."""
     logger.debug("Running OCR on text block ...")
-    tesseract.load_agent()
+    tesseract.load_agent(languages=languages)
     padded_block = text_block.pad(12)
     cropped_image = image.crop((padded_block.x1, padded_block.y1, padded_block.x2, padded_block.y2))
-    return tesseract.ocr_agent.detect(cropped_image)
+    agent = tesseract.ocr_agents.get(languages)
+    if agent is None:
+        raise RuntimeError(f"OCR agent is not loaded for {languages}.")
+    return agent.detect(cropped_image)
 
 
 def needs_ocr(
@@ -263,16 +269,17 @@ def aggregate_by_block(
     image: Optional[Image.Image],
     pdf_objects: List[TextRegion],
     ocr_strategy: str = "auto",
+    ocr_languages: str = "eng",
 ) -> str:
     """Extracts the text aggregated from the elements of the given layout that lie within the given
     block."""
     if image is not None and needs_ocr(text_region, pdf_objects, ocr_strategy):
-        text = ocr(text_region, image)
+        text = ocr(text_region, image, languages=ocr_languages)
     else:
         filtered_blocks = [obj for obj in pdf_objects if obj.is_in(text_region, error_margin=5)]
         for little_block in filtered_blocks:
             if image is not None and needs_ocr(little_block, pdf_objects, ocr_strategy):
-                little_block.text = ocr(little_block, image)
+                little_block.text = ocr(little_block, image, languages=ocr_languages)
         text = " ".join([x.text for x in filtered_blocks if x.text])
     text = remove_control_characters(text)
     return text

unstructured_inference/inference/layout.py

Lines changed: 29 additions & 3 deletions
@@ -55,6 +55,7 @@ def from_file(
         model: Optional[UnstructuredModel] = None,
         fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
         ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
         extract_tables: bool = False,
     ) -> DocumentLayout:
         """Creates a DocumentLayout from a pdf file."""
@@ -75,6 +76,7 @@ def from_file(
                 model=model,
                 layout=layout,
                 ocr_strategy=ocr_strategy,
+                ocr_languages=ocr_languages,
                 fixed_layout=fixed_layout,
                 extract_tables=extract_tables,
             )
@@ -87,6 +89,7 @@ def from_image_file(
         filename: str,
         model: Optional[UnstructuredModel] = None,
         ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
         fixed_layout: Optional[List[TextRegion]] = None,
         extract_tables: bool = False,
     ) -> DocumentLayout:
@@ -104,6 +107,7 @@ def from_image_file(
             model=model,
             layout=None,
             ocr_strategy=ocr_strategy,
+            ocr_languages=ocr_languages,
             fixed_layout=fixed_layout,
             extract_tables=extract_tables,
         )
@@ -120,6 +124,7 @@ def __init__(
         layout: Optional[List[TextRegion]],
         model: Optional[UnstructuredModel] = None,
         ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
         extract_tables: bool = False,
     ):
         self.image = image
@@ -131,6 +136,7 @@ def __init__(
         if ocr_strategy not in VALID_OCR_STRATEGIES:
             raise ValueError(f"ocr_strategy must be one of {VALID_OCR_STRATEGIES}.")
         self.ocr_strategy = ocr_strategy
+        self.ocr_languages = ocr_languages
         self.extract_tables = extract_tables
 
     def __str__(self) -> str:
@@ -159,7 +165,12 @@ def get_elements_from_layout(self, layout: List[TextRegion]) -> List[LayoutEleme
         layout.sort(key=lambda element: element.y1)
         elements = [
             get_element_from_block(
-                e, self.image, self.layout, self.ocr_strategy, self.extract_tables
+                block=e,
+                image=self.image,
+                pdf_objects=self.layout,
+                ocr_strategy=self.ocr_strategy,
+                ocr_languages=self.ocr_languages,
+                extract_tables=self.extract_tables,
             )
             for e in layout
         ]
@@ -178,6 +189,7 @@ def from_image(
         model: Optional[UnstructuredModel] = None,
         layout: Optional[List[TextRegion]] = None,
         ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
         extract_tables: bool = False,
         fixed_layout: Optional[List[TextRegion]] = None,
     ):
@@ -188,6 +200,7 @@ def from_image(
             layout=layout,
             model=model,
             ocr_strategy=ocr_strategy,
+            ocr_languages=ocr_languages,
             extract_tables=extract_tables,
         )
         if fixed_layout is None:
@@ -202,6 +215,7 @@ def process_data_with_model(
     model_name: Optional[str],
     is_image: bool = False,
     ocr_strategy: str = "auto",
+    ocr_languages: str = "eng",
     fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
     extract_tables: bool = False,
 ) -> DocumentLayout:
@@ -214,6 +228,7 @@ def process_data_with_model(
             model_name,
             is_image=is_image,
             ocr_strategy=ocr_strategy,
+            ocr_languages=ocr_languages,
             fixed_layouts=fixed_layouts,
             extract_tables=extract_tables,
         )
@@ -226,6 +241,7 @@ def process_file_with_model(
     model_name: Optional[str],
     is_image: bool = False,
     ocr_strategy: str = "auto",
+    ocr_languages: str = "eng",
     fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
     extract_tables: bool = False,
 ) -> DocumentLayout:
@@ -234,13 +250,18 @@ def process_file_with_model(
     model = get_model(model_name)
     layout = (
         DocumentLayout.from_image_file(
-            filename, model=model, ocr_strategy=ocr_strategy, extract_tables=extract_tables
+            filename,
+            model=model,
+            ocr_strategy=ocr_strategy,
+            ocr_languages=ocr_languages,
+            extract_tables=extract_tables,
         )
         if is_image
         else DocumentLayout.from_file(
            filename,
            model=model,
            ocr_strategy=ocr_strategy,
+            ocr_languages=ocr_languages,
            fixed_layouts=fixed_layouts,
            extract_tables=extract_tables,
        )
@@ -253,13 +274,18 @@ def get_element_from_block(
     image: Optional[Image.Image] = None,
     pdf_objects: Optional[List[TextRegion]] = None,
     ocr_strategy: str = "auto",
+    ocr_languages: str = "eng",
     extract_tables: bool = False,
 ) -> LayoutElement:
     """Creates a LayoutElement from a given layout or image by finding all the text that lies within
     a given block."""
     element = LayoutElement.from_region(block)
     element.text = block.extract_text(
-        objects=pdf_objects, image=image, extract_tables=extract_tables, ocr_strategy=ocr_strategy
+        objects=pdf_objects,
+        image=image,
+        extract_tables=extract_tables,
+        ocr_strategy=ocr_strategy,
+        ocr_languages=ocr_languages,
     )
     return element
 
unstructured_inference/inference/layoutelement.py

Lines changed: 2 additions & 0 deletions
@@ -19,6 +19,7 @@ def extract_text(
         image: Optional[Image.Image] = None,
         extract_tables: bool = False,
         ocr_strategy: str = "auto",
+        ocr_languages: str = "eng",
     ):
         """Extracts text contained in region"""
         if self.text is not None:
@@ -32,6 +33,7 @@ def extract_text(
                 image=image,
                 extract_tables=extract_tables,
                 ocr_strategy=ocr_strategy,
+                ocr_languages=ocr_languages,
             )
         return text
 

Lines changed: 16 additions & 7 deletions
@@ -1,20 +1,29 @@
+from typing import Dict
+
 from layoutparser.ocr.tesseract_agent import is_pytesseract_available, TesseractAgent
 
 from unstructured_inference.logger import logger
 
-ocr_agent: TesseractAgent = None
+ocr_agents: Dict[str, TesseractAgent] = {}
+
 
+def load_agent(languages: str = "eng"):
+    """Loads the Tesseract OCR agent as a global variable to ensure that we only load it once.
 
-def load_agent():
-    """Loads the Tesseract OCR agent as a global variable to ensure that we only load it once."""
-    global ocr_agent
+    Parameters
+    ----------
+    languages
+        The languages to use for the Tesseract agent. To use a language, you'll first need
+        to install the appropriate Tesseract language pack.
+    """
+    global ocr_agents
 
     if not is_pytesseract_available():
         raise ImportError(
            "Failed to load Tesseract. Ensure that Tesseract is installed. Example command: \n"
            "    >>> sudo apt install -y tesseract-ocr"
        )
 
-    if ocr_agent is None:
-        logger.info("Loading the Tesseract OCR agent ...")
-        ocr_agent = TesseractAgent(languages="eng")
+    if languages not in ocr_agents:
+        logger.info(f"Loading the Tesseract OCR agent for {languages} ...")
+        ocr_agents[languages] = TesseractAgent(languages=languages)
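
A small sketch (not part of the commit) of the caching behavior added above: `load_agent` keeps one `TesseractAgent` per distinct language string in the module-level `ocr_agents` dict. The import path is assumed from the test module path (`test_unstructured_inference/models/test_tesseract.py`), and the example presumes the relevant Tesseract language packs are installed.

from unstructured_inference.models import tesseract

# Each distinct language string gets its own agent, loaded at most once.
tesseract.load_agent(languages="eng")
tesseract.load_agent(languages="eng+swe")  # "+"-joined codes produce one combined agent
tesseract.load_agent(languages="eng")      # already cached, so nothing is reloaded
print(sorted(tesseract.ocr_agents))        # ['eng', 'eng+swe']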
