Skip to content

Commit 173f633

Browse files
authored
Chore: add env ENTIRE_PAGE_OCR to specify paddle/tesseract for entire page ocr (#209)
### Summary We need a way to use paddle for the entire page OCR since the OCR result could be better than tesseract, which has shown on some image files with tables. This PR adds an environment variable `ENTIRE_PAGE_OCR` that can be set to `paddle` or `tesseract`. We still use tesseract as default since paddle performs poorly on entire-page English PDF files. ### Test if you are on x86 arch, please run this snippet to install paddle (paddle still doesn't work on m1/m2 chip locally): ``` pip install paddlepaddle #or pip install unstructured.paddlepaddle if on aarch64 arch pip install unstructured_paddleocr export LD_LIBRARY_PATH=/usr/local/lib:/usr/local/lib64 ``` run the following script to see different entire page result from paddle and tesseract ``` from unstructured_inference.inference.layout import DocumentLayout import os def get_layout_from_image(ocr_languages): layout = DocumentLayout.from_image_file("sample-docs/table-multi-row-column-cells.png", ocr_languages=ocr_languages) # Create a list to store the layout elements with only "text" and "type" fields elements_dict_list = [] for page in layout.pages: for element in page.elements: element_dict = { "text": element.text, "type": element.type } elements_dict_list.append(element_dict) return elements_dict_list # default is tesseract os.environ['ENTIRE_PAGE_OCR'] = "tesseract" tesseract_elements = get_layout_from_image(ocr_languages="eng") # set env to use paddle and call function agin os.environ['ENTIRE_PAGE_OCR'] = "paddle" paddle_elements = get_layout_from_image(ocr_languages="en") # should expect difference assert tesseract_elements != paddle_elements # compare result print(tesseract_elements) print(paddle_elements) ``` ### Note There are different language code between tesseract and paddle on the same language i.e, `en` in paddle and `eng` in tesseract for English. This can be addressed once we introduce the language mappings from standard language code to tesseract and to paddle respectively. However, unlike tesseract, paddle does support passing in multiple languages, and we will fallback to tesseract if thats the case (future PR).
1 parent d6ccdc1 commit 173f633

File tree

9 files changed

+174
-48
lines changed

9 files changed

+174
-48
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.5.28
2+
3+
* add env variable `ENTIRE_PAGE_OCR` to specify using paddle or tesseract on entire page OCR
4+
15
## 0.5.27
26

37
* table structure detection now pads the input image by 25 pixels in all 4 directions to improve its recall

test_unstructured_inference/inference/test_layout.py

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,9 @@ def join(self):
158158
pass
159159

160160

161-
def test_get_page_elements_with_ocr(monkeypatch):
161+
@pytest.mark.parametrize("entire_page_ocr", ["paddle", "tesseract"])
162+
def test_get_page_elements_with_ocr(monkeypatch, entire_page_ocr):
163+
monkeypatch.setenv("ENTIRE_PAGE_OCR", entire_page_ocr)
162164
text_block = layout.TextRegion(2, 4, 6, 8, text=None)
163165
image_block = layout.ImageTextRegion(8, 14, 16, 18)
164166
doc_initial_layout = [text_block, image_block]
@@ -182,12 +184,45 @@ def test_get_page_elements_with_ocr(monkeypatch):
182184
image=image,
183185
layout=doc_initial_layout,
184186
detection_model=MockLayoutModel(doc_final_layout),
187+
# Note(yuming): there are differnt language codes for same language
188+
# between paddle and tesseract
189+
ocr_languages="en" if entire_page_ocr == "paddle" else "eng",
185190
)
186191
page.get_elements_with_detection_model()
187192

188193
assert str(page) == "\n\nAn Even Catchier Title"
189194

190195

196+
def test_get_page_elements_with_ocr_invalid_entrie_page_ocr(monkeypatch):
197+
monkeypatch.setenv("ENTIRE_PAGE_OCR", "invalid_entire_page_ocr")
198+
text_block = layout.TextRegion(2, 4, 6, 8, text=None)
199+
image_block = layout.ImageTextRegion(8, 14, 16, 18)
200+
doc_initial_layout = [text_block, image_block]
201+
text_layoutelement = layoutelement.LayoutElement(
202+
2,
203+
4,
204+
6,
205+
8,
206+
text=None,
207+
type="UncategorizedText",
208+
)
209+
image_layoutelement = layoutelement.LayoutElement(8, 14, 16, 18, text=None, type="Image")
210+
doc_final_layout = [text_layoutelement, image_layoutelement]
211+
212+
monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
213+
monkeypatch.setattr(elements, "ocr", lambda *args, **kwargs: "An Even Catchier Title")
214+
215+
image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB")
216+
page = layout.PageLayout(
217+
number=0,
218+
image=image,
219+
layout=doc_initial_layout,
220+
detection_model=MockLayoutModel(doc_final_layout),
221+
)
222+
with pytest.raises(ValueError):
223+
page.get_elements_with_detection_model()
224+
225+
191226
def test_read_pdf(monkeypatch, mock_initial_layout, mock_final_layout, mock_image):
192227
with tempfile.TemporaryDirectory() as tmpdir:
193228
image_path1 = os.path.join(tmpdir, "mock1.jpg")

test_unstructured_inference/models/test_tables.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -355,8 +355,8 @@ def test_table_prediction_paddle(monkeypatch):
355355
img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB")
356356
prediction = table_model.predict(img)
357357
# Note(yuming): lossen paddle table prediction output test since performance issue
358-
# assert rows spans two rows are detected
359-
assert '<table><thead><th rowspan="2">' in prediction
358+
# and results are different in different platforms (i.e., gpu vs cpu)
359+
assert len(prediction)
360360

361361

362362
def test_table_prediction_invalid_table_ocr(monkeypatch):

test_unstructured_inference/test_elements.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
import logging
12
from random import randint
23
from unittest.mock import PropertyMock, patch
34

45
import pytest
6+
from PIL import Image
57

68
from unstructured_inference.inference import elements
79

@@ -184,3 +186,15 @@ def test_intersection_over_min(
184186
assert (
185187
rect1.intersection_over_minimum(rect2) == rect2.intersection_over_minimum(rect1) == expected
186188
)
189+
190+
191+
def test_ocr_paddle(monkeypatch, caplog):
192+
monkeypatch.setenv("ENTIRE_PAGE_OCR", "paddle")
193+
image = Image.new("RGB", (100, 100), (255, 255, 255))
194+
text_block = elements.TextRegion(0, 0, 50, 50)
195+
# Note(yuming): paddle result is currently non-deterministic on ci
196+
# so don't check result like `assert result == ""`
197+
# use logger info to confirm we are using paddle instead
198+
with caplog.at_level(logging.INFO):
199+
_ = elements.ocr(text_block, image, languages="en")
200+
assert "paddle" in caplog.text
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.27" # pragma: no cover
1+
__version__ = "0.5.28" # pragma: no cover

unstructured_inference/inference/elements.py

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
import os
34
import re
45
import unicodedata
56
from copy import deepcopy
@@ -267,15 +268,27 @@ def ocr(text_block: TextRegion, image: Image.Image, languages: str = "eng") -> s
267268
tesseract.load_agent(languages=languages)
268269
padded_block = text_block.pad(12)
269270
cropped_image = image.crop((padded_block.x1, padded_block.y1, padded_block.x2, padded_block.y2))
270-
agent = tesseract.ocr_agents.get(languages)
271-
if agent is None:
272-
raise RuntimeError("OCR agent is not loaded for {languages}.")
273-
274-
try:
275-
return agent.detect(cropped_image)
276-
except tesseract.TesseractError:
277-
logger.warning("TesseractError: Skipping region", exc_info=True)
278-
return ""
271+
entrie_page_ocr = os.getenv("ENTIRE_PAGE_OCR", "tesseract").lower()
272+
if entrie_page_ocr == "paddle":
273+
from unstructured_inference.models import paddle_ocr
274+
275+
paddle_result = paddle_ocr.load_agent().ocr(np.array(cropped_image), cls=True)
276+
recognized_text = ""
277+
for idx in range(len(paddle_result)):
278+
res = paddle_result[idx]
279+
for line in res:
280+
recognized_text += line[1][0]
281+
return recognized_text
282+
else:
283+
agent = tesseract.ocr_agents.get(languages)
284+
if agent is None:
285+
raise RuntimeError("OCR agent is not loaded for {languages}.")
286+
287+
try:
288+
return agent.detect(cropped_image)
289+
except tesseract.TesseractError:
290+
logger.warning("TesseractError: Skipping region", exc_info=True)
291+
return ""
279292

280293

281294
def needs_ocr(

unstructured_inference/inference/layout.py

Lines changed: 66 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -264,17 +264,34 @@ def get_elements_with_detection_model(
264264
ocr_layout = None
265265
elif self.ocr_mode == OCRMode.FULL_PAGE.value:
266266
ocr_layout = None
267-
try:
268-
ocr_data = pytesseract.image_to_data(
269-
self.image,
270-
lang=self.ocr_languages,
271-
output_type=Output.DICT,
267+
entrie_page_ocr = os.getenv("ENTIRE_PAGE_OCR", "tesseract").lower()
268+
if entrie_page_ocr not in ["paddle", "tesseract"]:
269+
raise ValueError(
270+
"Environment variable ENTIRE_PAGE_OCR must be set to 'tesseract' or 'paddle'.",
272271
)
273-
ocr_layout = parse_ocr_data(ocr_data)
274-
except pytesseract.pytesseract.TesseractError:
275-
logger.warning("TesseractError: Skipping page", exc_info=True)
276-
else:
277-
raise ValueError("Invalid OCR mode")
272+
273+
if entrie_page_ocr == "paddle":
274+
logger.info("Processing entrie page OCR with paddle...")
275+
from unstructured_inference.models import paddle_ocr
276+
277+
# TODO(yuming): paddle only support one language at once,
278+
# change ocr to tesseract if passed in multilanguages.
279+
ocr_data = paddle_ocr.load_agent(language=self.ocr_languages).ocr(
280+
np.array(self.image),
281+
cls=True,
282+
)
283+
ocr_layout = parse_ocr_data_paddle(ocr_data)
284+
else:
285+
logger.info("Processing entrie page OCR with tesseract...")
286+
try:
287+
ocr_data = pytesseract.image_to_data(
288+
self.image,
289+
lang=self.ocr_languages,
290+
output_type=Output.DICT,
291+
)
292+
ocr_layout = parse_ocr_data_tesseract(ocr_data)
293+
except pytesseract.pytesseract.TesseractError:
294+
logger.warning("TesseractError: Skipping page", exc_info=True)
278295

279296
if self.layout is not None:
280297
threshold_kwargs = {}
@@ -626,9 +643,10 @@ def load_pdf(
626643
return layouts, images
627644

628645

629-
def parse_ocr_data(ocr_data: dict) -> List[TextRegion]:
646+
def parse_ocr_data_tesseract(ocr_data: dict) -> List[TextRegion]:
630647
"""
631-
Parse the OCR result data to extract a list of TextRegion objects.
648+
Parse the OCR result data to extract a list of TextRegion objects from
649+
tesseract.
632650
633651
The function processes the OCR result dictionary, looking for bounding
634652
box information and associated text to create instances of the TextRegion
@@ -664,3 +682,39 @@ def parse_ocr_data(ocr_data: dict) -> List[TextRegion]:
664682
text_regions.append(text_region)
665683

666684
return text_regions
685+
686+
687+
def parse_ocr_data_paddle(ocr_data: list) -> List[TextRegion]:
688+
"""
689+
Parse the OCR result data to extract a list of TextRegion objects from
690+
paddle.
691+
692+
The function processes the OCR result dictionary, looking for bounding
693+
box information and associated text to create instances of the TextRegion
694+
class, which are then appended to a list.
695+
696+
Parameters:
697+
- ocr_data (list): A list containing the OCR result data
698+
699+
Returns:
700+
- List[TextRegion]: A list of TextRegion objects, each representing a
701+
detected text region within the OCR-ed image.
702+
703+
Note:
704+
- An empty string or a None value for the 'text' key in the input
705+
dictionary will result in its associated bounding box being ignored.
706+
"""
707+
text_regions = []
708+
for idx in range(len(ocr_data)):
709+
res = ocr_data[idx]
710+
for line in res:
711+
x1 = min([i[0] for i in line[0]])
712+
y1 = min([i[1] for i in line[0]])
713+
x2 = max([i[0] for i in line[0]])
714+
y2 = max([i[1] for i in line[0]])
715+
text = line[1][0]
716+
if text:
717+
text_region = TextRegion(x1, y1, x2, y2, text)
718+
text_regions.append(text_region)
719+
720+
return text_regions
Lines changed: 27 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
1+
import functools
2+
13
import paddle
24
from unstructured_paddleocr import PaddleOCR
35

4-
paddle_ocr = None # type: ignore
6+
from unstructured_inference.logger import logger
57

68

9+
@functools.lru_cache(maxsize=None)
710
def load_agent(language: str = "en"):
811
"""Loads the PaddleOCR agent as a global variable to ensure that we only load it once."""
912

@@ -13,26 +16,27 @@ def load_agent(language: str = "en"):
1316
paddle.disable_signal_handler()
1417
# Use paddlepaddle-gpu if there is gpu device available
1518
gpu_available = paddle.device.cuda.device_count() > 0
16-
17-
global paddle_ocr
18-
if paddle_ocr is None:
19-
try:
20-
# Enable MKL-DNN for paddle to speed up OCR if OS supports it
21-
# ref: https://paddle-inference.readthedocs.io/en/master/
22-
# api_reference/cxx_api_doc/Config/CPUConfig.html
23-
paddle_ocr = PaddleOCR(
24-
use_angle_cls=True,
25-
use_gpu=gpu_available,
26-
lang=language,
27-
enable_mkldnn=True,
28-
show_log=False,
29-
)
30-
except AttributeError:
31-
paddle_ocr = PaddleOCR(
32-
use_angle_cls=True,
33-
use_gpu=gpu_available,
34-
lang=language,
35-
enable_mkldnn=False,
36-
show_log=False,
37-
)
19+
if gpu_available:
20+
logger.info(f"Loading paddle with GPU on language={language}...")
21+
else:
22+
logger.info(f"Loading paddle with CPU on language={language}...")
23+
try:
24+
# Enable MKL-DNN for paddle to speed up OCR if OS supports it
25+
# ref: https://paddle-inference.readthedocs.io/en/master/
26+
# api_reference/cxx_api_doc/Config/CPUConfig.html
27+
paddle_ocr = PaddleOCR(
28+
use_angle_cls=True,
29+
use_gpu=gpu_available,
30+
lang=language,
31+
enable_mkldnn=True,
32+
show_log=False,
33+
)
34+
except AttributeError:
35+
paddle_ocr = PaddleOCR(
36+
use_angle_cls=True,
37+
use_gpu=gpu_available,
38+
lang=language,
39+
enable_mkldnn=False,
40+
show_log=False,
41+
)
3842
return paddle_ocr

unstructured_inference/models/tables.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ def get_tokens(self, x: Image):
6363
"Environment variable TABLE_OCR must be set to 'tesseract' or 'paddle'.",
6464
)
6565
if table_ocr == "paddle":
66+
logger.info("Processing table OCR with paddleocr...")
6667
from unstructured_inference.models import paddle_ocr
6768

6869
paddle_result = paddle_ocr.load_agent().ocr(np.array(x), cls=True)
@@ -78,6 +79,7 @@ def get_tokens(self, x: Image):
7879
tokens.append({"bbox": [xmin, ymin, xmax, ymax], "text": line[1][0]})
7980
return tokens
8081
else:
82+
logger.info("Processing table OCR with tesseract...")
8183
ocr_df: pd.DataFrame = pytesseract.image_to_data(
8284
x,
8385
output_type="data.frame",

0 commit comments

Comments
 (0)