fix: handle potential errors from tesseract call (#183)

awalker4 · web-flow · commit ea0831fd2a01 · 2023-08-22T13:30:03.000-04:00
We've seen a 500 error in `unstructured-api` due to an uncaught TesseractError in the `entire_page` path. I can't reproduce it, but we can at least add a try/except. The last fix was too aggressive, which we're tracking [here](Unstructured-IO/unstructured#1086), so we may need to adjust this fix as well. Closes #179
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.5.15
+
+* Handle an uncaught TesseractError
+
 ## 0.5.14
 
 * Add TIFF test file and TIFF filetype to `test_from_image_file` in `test_layout`
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -127,6 +127,26 @@ def test_get_page_elements(monkeypatch, mock_final_layout):
     assert elements == page.elements
 
 
+def test_get_page_elements_with_tesseract_error(monkeypatch, mock_final_layout):
+    def mock_image_to_data(*args, **kwargs):
+        raise tesseract.TesseractError(-2, "Estimating resolution as 1023")
+
+    monkeypatch.setattr(layout.pytesseract, "image_to_data", mock_image_to_data)
+
+    image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB")
+    page = layout.PageLayout(
+        number=0,
+        image=image,
+        layout=mock_final_layout,
+        detection_model=MockLayoutModel(mock_final_layout),
+    )
+
+    elements = page.get_elements_with_detection_model(inplace=False)
+
+    assert str(elements[0]) == "A Catchy Title"
+    assert str(elements[1]).startswith("A very repetitive narrative.")
+
+
 class MockPool:
     def map(self, f, xs):
         return [f(x) for x in xs]
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.14"  # pragma: no cover
+__version__ = "0.5.15"  # pragma: no cover
diff --git a/unstructured_inference/inference/elements.py b/unstructured_inference/inference/elements.py
@@ -273,6 +273,7 @@ def ocr(text_block: TextRegion, image: Image.Image, languages: str = "eng") -> s
     try:
         return agent.detect(cropped_image)
     except tesseract.TesseractError:
+        logger.warning("TesseractError: Skipping region", exc_info=True)
         return ""
 
 
diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -246,12 +246,16 @@ def get_elements_with_detection_model(
         if self.ocr_mode == "individual_blocks":
             ocr_layout = None
         elif self.ocr_mode == "entire_page":
-            ocr_data = pytesseract.image_to_data(
-                self.image,
-                lang=self.ocr_languages,
-                output_type=Output.DICT,
-            )
-            ocr_layout = parse_ocr_data(ocr_data)
+            ocr_layout = None
+            try:
+                ocr_data = pytesseract.image_to_data(
+                    self.image,
+                    lang=self.ocr_languages,
+                    output_type=Output.DICT,
+                )
+                ocr_layout = parse_ocr_data(ocr_data)
+            except pytesseract.pytesseract.TesseractError:
+                logger.warning("TesseractError: Skipping page", exc_info=True)
         else:
             raise ValueError("Invalid OCR mode")
 

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.5.14" # pragma: no cover`
	`1`	`+__version__ = "0.5.15" # pragma: no cover`