Skip to content

Commit ea0831f

Browse files
authored
fix: handle potential errors from tesseract call (#183)
We've seen a 500 error in `unstructured-api` due to an uncaught TesseractError in the `entire_page` path. I can't reproduce it, but we can at least add a try/except. The last fix was too aggressive, which we're tracking [here](Unstructured-IO/unstructured#1086), so we may need to adjust this fix as well. Closes #179
1 parent d162c56 commit ea0831f

File tree

5 files changed

+36
-7
lines changed

5 files changed

+36
-7
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.5.15
2+
3+
* Handle an uncaught TesseractError
4+
15
## 0.5.14
26

37
* Add TIFF test file and TIFF filetype to `test_from_image_file` in `test_layout`

test_unstructured_inference/inference/test_layout.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,26 @@ def test_get_page_elements(monkeypatch, mock_final_layout):
127127
assert elements == page.elements
128128

129129

130+
def test_get_page_elements_with_tesseract_error(monkeypatch, mock_final_layout):
131+
def mock_image_to_data(*args, **kwargs):
132+
raise tesseract.TesseractError(-2, "Estimating resolution as 1023")
133+
134+
monkeypatch.setattr(layout.pytesseract, "image_to_data", mock_image_to_data)
135+
136+
image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB")
137+
page = layout.PageLayout(
138+
number=0,
139+
image=image,
140+
layout=mock_final_layout,
141+
detection_model=MockLayoutModel(mock_final_layout),
142+
)
143+
144+
elements = page.get_elements_with_detection_model(inplace=False)
145+
146+
assert str(elements[0]) == "A Catchy Title"
147+
assert str(elements[1]).startswith("A very repetitive narrative.")
148+
149+
130150
class MockPool:
131151
def map(self, f, xs):
132152
return [f(x) for x in xs]
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.14" # pragma: no cover
1+
__version__ = "0.5.15" # pragma: no cover

unstructured_inference/inference/elements.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ def ocr(text_block: TextRegion, image: Image.Image, languages: str = "eng") -> s
273273
try:
274274
return agent.detect(cropped_image)
275275
except tesseract.TesseractError:
276+
logger.warning("TesseractError: Skipping region", exc_info=True)
276277
return ""
277278

278279

unstructured_inference/inference/layout.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -246,12 +246,16 @@ def get_elements_with_detection_model(
246246
if self.ocr_mode == "individual_blocks":
247247
ocr_layout = None
248248
elif self.ocr_mode == "entire_page":
249-
ocr_data = pytesseract.image_to_data(
250-
self.image,
251-
lang=self.ocr_languages,
252-
output_type=Output.DICT,
253-
)
254-
ocr_layout = parse_ocr_data(ocr_data)
249+
ocr_layout = None
250+
try:
251+
ocr_data = pytesseract.image_to_data(
252+
self.image,
253+
lang=self.ocr_languages,
254+
output_type=Output.DICT,
255+
)
256+
ocr_layout = parse_ocr_data(ocr_data)
257+
except pytesseract.pytesseract.TesseractError:
258+
logger.warning("TesseractError: Skipping page", exc_info=True)
255259
else:
256260
raise ValueError("Invalid OCR mode")
257261

0 commit comments

Comments
 (0)