Skip to content

Commit 203f7ab

Browse files
awalker4, cragwolfe, and shreyanid
authored
fix: handle errors from Tesseract (#165)
* fix: handle errors from Tesseract

  Certain regions of a document are failing OCR with this error:
  `pytesseract.pytesseract.TesseractError: (-8, 'Estimating resolution as 1250')`

  When I try the same region on the CLI, I get:

  ```
  $ tesseract bad_tile.jpeg output
  Estimating resolution as 1813
  Floating point exception
  ```

  Whatever the root cause, let's catch this error and return an empty string.

* fix lint error
* more lint stuff
* temporary bump to dev0 version for debugging
* release version

---------

Co-authored-by: Crag Wolfe <[email protected]>
Co-authored-by: shreyanid <[email protected]>
1 parent 9a53178 commit 203f7ab

File tree

5 files changed

+28
-2
lines changed

5 files changed

+28
-2
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.5.9
2+
3+
* Handle exceptions from Tesseract
4+
15
## 0.5.8
26

37
* Add alternative architecture for detectron2 (but default is unchanged)

test_unstructured_inference/inference/test_layout.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,21 @@ def detect(self, *args):
8383
assert elements.ocr(text_block, image=image) == mock_text
8484

8585

86+
def test_ocr_with_error(monkeypatch):
87+
class MockOCRAgent:
88+
def detect(self, *args):
89+
# We sometimes get this error on very small images
90+
raise tesseract.TesseractError(-8, "Estimating resolution as 1023")
91+
92+
monkeypatch.setattr(tesseract, "ocr_agents", {"eng": MockOCRAgent})
93+
monkeypatch.setattr(tesseract, "is_pytesseract_available", lambda *args: True)
94+
95+
image = Image.fromarray(np.random.randint(12, 24, (40, 40)), mode="RGB")
96+
text_block = layout.TextRegion(1, 2, 3, 4, text=None)
97+
98+
assert elements.ocr(text_block, image=image) == ""
99+
100+
86101
class MockLayoutModel:
87102
def __init__(self, layout):
88103
self.layout_return = layout
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.8" # pragma: no cover
1+
__version__ = "0.5.9" # pragma: no cover

unstructured_inference/inference/elements.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,11 @@ def ocr(text_block: TextRegion, image: Image.Image, languages: str = "eng") -> s
269269
agent = tesseract.ocr_agents.get(languages)
270270
if agent is None:
271271
raise RuntimeError("OCR agent is not loaded for {languages}.")
272-
return agent.detect(cropped_image)
272+
273+
try:
274+
return agent.detect(cropped_image)
275+
except tesseract.TesseractError:
276+
return ""
273277

274278

275279
def needs_ocr(

unstructured_inference/models/tesseract.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
from typing import Dict
22

3+
import pytesseract
34
from layoutparser.ocr.tesseract_agent import TesseractAgent, is_pytesseract_available
45

56
from unstructured_inference.logger import logger
67

78
ocr_agents: Dict[str, TesseractAgent] = {}
89

10+
TesseractError = pytesseract.pytesseract.TesseractError
11+
912

1013
def load_agent(languages: str = "eng"):
1114
"""Loads the Tesseract OCR agent as a global variable to ensure that we only load it once.

0 commit comments

Comments
 (0)