Commit 5bc67b5
chore: support paddle with both cpu and gpu if it is installed (#207)
Addresses #200

### Summary

* Before this PR, we only used paddle if the platform was `x86_64`. Remove this platform check and assume that paddle and PaddleOCR are already installed on the user's platform.
* Introduce an environment variable `TABLE_OCR` that can be used to specify whether paddle or tesseract is used for table extraction; the default is tesseract.
* Add logic for the `use_gpu` parameter when setting up the PaddleOCR instance: if GPU devices are available, paddle will use the GPU.

### Test

* paddle on CPU: see the unit test `test_table_prediction_paddle`. The prediction quality is not good (you can use gdb to print the value of `prediction`), but the test proves that PaddleOCR runs in CI.
* paddle on GPU: on a GPU instance,
  * rerun the test with `PYTHONPATH=. pytest test_unstructured_inference/models/test_tables.py::test_table_prediction_paddle`, and
  * from another terminal run `nvidia-smi -l 1` to monitor GPU usage; you can see the python test consuming GPU memory.
1 parent 52c5bea commit 5bc67b5
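For reference, here is a minimal sketch of how the `TABLE_OCR` switch described above can be exercised end to end. The model name and sample image path are copied from the tests added in this commit; setting the variable via `os.environ` (instead of `monkeypatch.setenv`, as the tests do) is an assumption about how you run it, not part of the library API.

```python
import os

from PIL import Image

from unstructured_inference.models import tables

# Select the OCR backend used for table extraction; the default is "tesseract".
os.environ["TABLE_OCR"] = "paddle"

table_model = tables.UnstructuredTableTransformerModel()
table_model.initialize(model="microsoft/table-transformer-structure-recognition")

img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB")
prediction = table_model.predict(img)  # an HTML string describing the table structure
print(prediction)
```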

File tree

8 files changed (+127, -96 lines)

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,7 @@
+## 0.5.26
+
+* support paddle with both cpu and gpu and assumed it is pre-installed
+
 ## 0.5.25
 
 * fix a bug where `cells_to_html` doesn't handle cells spanning multiple rows properly

Makefile

Lines changed: 2 additions & 0 deletions
@@ -34,6 +34,8 @@ install-detectron2:
 
 .PHONY: install-paddleocr
 install-paddleocr:
+	pip install paddlepaddle
+	pip install paddlepaddle-gpu
 	pip install "unstructured.PaddleOCR"
 
 .PHONY: install-test

README.md

Lines changed: 8 additions & 4 deletions
@@ -36,17 +36,21 @@ tips on installing Detectron2 on Windows.
 
 ### PaddleOCR
 
-[PaddleOCR](https://github.com/Unstructured-IO/unstructured.PaddleOCR) is suggested for table processing for `x86_64` architectures.
-It **should not be installed under MacOS running Apple Silicon**.
+[PaddleOCR](https://github.com/Unstructured-IO/unstructured.PaddleOCR) is suggested for table processing. Please set
+the environment variable `TABLE_OCR`
+to `paddle` if you wish to use paddle for table processing instead of the default `tesseract`.
 
 PaddleOCR may be installed with:
 
 ```shell
-# x86_64 only!
+pip install paddlepaddle
 pip install "unstructured.PaddleOCR"
 ```
 
-If paddle is not available, OCR is handled by tesseract instead.
+We suggest that you install paddlepaddle-gpu with `pip install paddlepaddle-gpu` if you have GPU devices available, for better OCR performance.
+
+Please note that **paddlepaddle does not work on MacOS with Apple Silicon**. If you want it running on an Apple M1/M2 chip, we provide a custom wheel of paddlepaddle for the aarch64 architecture; you can install it with `pip install unstructured.paddlepaddle` and run it inside a Docker container.
 
 ### Repository
 
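As a quick check after installation (not part of this commit), the snippet below reports whether the GPU path in the new `load_agent()` would be taken. `paddle.device.cuda.device_count()` is the same call the commit adds; `paddle.utils.run_check()` is paddlepaddle's standard install self-test and is assumed to be available in your paddle version.

```python
import paddle

# Optional self-test shipped with paddlepaddle; compiles and runs a tiny program.
paddle.utils.run_check()

# The same check the new load_agent() uses to decide between CPU and GPU inference.
gpu_available = paddle.device.cuda.device_count() > 0
print(f"paddle {paddle.__version__}, GPU available: {gpu_available}")
```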

setup.py

Lines changed: 0 additions & 17 deletions
@@ -74,21 +74,4 @@ def load_text_from_file(filename: str):
     version=__version__,
     entry_points={},
     install_requires=load_requirements(),
-    extras_require={
-        "tables": [
-            'unstructured.PaddleOCR ; platform_machine=="x86_64"',
-            # NOTE(crag): workaround issue for error output below
-            # ERROR test_unstructured/partition/test_common.py - TypeError: Descriptors cannot not
-            # be created directly.
-            # If this call came from a _pb2.py file, your generated code is out of date and must be
-            # regenerated with protoc >= 3.19.0.
-            # If you cannot immediately regenerate your protos, some other possible workarounds are:
-            # 1. Downgrade the protobuf package to 3.20.x or lower.
-            # 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python
-            #    parsing and will be much slower).
-            'protobuf<3.21 ; platform_machine=="x86_64"',
-            # NOTE(alan): Pin to get around error: undefined symbol: _dl_sym, version GLIBC_PRIVATE
-            'paddlepaddle>=2.4 ; platform_machine=="x86_64"',
-        ],
-    },
 )

test_unstructured_inference/models/test_tables.py

Lines changed: 40 additions & 27 deletions
@@ -1,5 +1,3 @@
-from unittest.mock import patch
-
 import pytest
 from transformers.models.table_transformer.modeling_table_transformer import (
     TableTransformerDecoder,
@@ -326,35 +324,50 @@ def test_align_rows(rows, bbox, output):
     assert postprocess.align_rows(rows, bbox) == output
 
 
-# TODO: break this test down so it doesn't account for nearly 8% of test coverage
-@pytest.mark.parametrize(
-    ("model_path", "platform_type"),
-    [
-        ("microsoft/table-transformer-structure-recognition", "arm64"),
-        ("microsoft/table-transformer-structure-recognition", "x86_64"),
-    ],
-)
-def test_table_prediction(model_path, platform_type):
-    with patch("platform.machine", return_value=platform_type):
+def test_table_prediction_tesseract():
+    table_model = tables.UnstructuredTableTransformerModel()
+    from PIL import Image
+
+    table_model.initialize(model="microsoft/table-transformer-structure-recognition")
+    img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB")
+    prediction = table_model.predict(img)
+    # assert rows spans two rows are detected
+    assert '<table><thead><th rowspan="2">' in prediction
+    # one of the safest rows to detect should be present
+    assert (
+        "<tr>"
+        "<td>Blind</td>"
+        "<td>5</td>"
+        "<td>1</td>"
+        "<td>4</td>"
+        "<td>34.5%, n=1</td>"
+        "<td>1199 sec, n=1</td>"
+        "</tr>"
+    ) in prediction
+
+
+def test_table_prediction_paddle(monkeypatch):
+    monkeypatch.setenv("TABLE_OCR", "paddle")
+    table_model = tables.UnstructuredTableTransformerModel()
+    from PIL import Image
+
+    table_model.initialize(model="microsoft/table-transformer-structure-recognition")
+    img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB")
+    prediction = table_model.predict(img)
+    # Note(yuming): lossen paddle table prediction output test since performance issue
+    # assert rows spans two rows are detected
+    assert '<table><thead><th rowspan="2">' in prediction
+
+
+def test_table_prediction_invalid_table_ocr(monkeypatch):
+    monkeypatch.setenv("TABLE_OCR", "invalid_table_ocr")
+    with pytest.raises(ValueError):
         table_model = tables.UnstructuredTableTransformerModel()
         from PIL import Image
 
-        table_model.initialize(model=model_path)
+        table_model.initialize(model="microsoft/table-transformer-structure-recognition")
         img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB")
-        prediction = table_model.predict(img)
-        # assert rows spans two rows are detected
-        assert '<table><thead><th rowspan="2">' in prediction
-        # one of the safest rows to detect should be present
-        assert (
-            "<tr>"
-            "<td>Blind</td>"
-            "<td>5</td>"
-            "<td>1</td>"
-            "<td>4</td>"
-            "<td>34.5%, n=1</td>"
-            "<td>1199 sec, n=1</td>"
-            "</tr>"
-        ) in prediction
+        _ = table_model.predict(img)
 
 
 def test_intersect():
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.5.25" # pragma: no cover
+__version__ = "0.5.26" # pragma: no cover
Lines changed: 30 additions & 4 deletions
@@ -1,12 +1,38 @@
+import paddle
+from unstructured_paddleocr import PaddleOCR
+
 paddle_ocr = None # type: ignore
 
 
-def load_agent():
+def load_agent(language: str = "en"):
     """Loads the PaddleOCR agent as a global variable to ensure that we only load it once."""
 
-    from unstructured_paddleocr import PaddleOCR
+    # Disable signal handlers at C++ level upon failing
+    # ref: https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/
+    # disable_signal_handler_en.html#disable-signal-handler
+    paddle.disable_signal_handler()
+    # Use paddlepaddle-gpu if there is gpu device available
+    gpu_available = paddle.device.cuda.device_count() > 0
 
     global paddle_ocr
-    paddle_ocr = PaddleOCR(use_angle_cls=True, lang="en", mkl_dnn=True, show_log=False)
-
+    if paddle_ocr is None:
+        try:
+            # Enable MKL-DNN for paddle to speed up OCR if OS supports it
+            # ref: https://paddle-inference.readthedocs.io/en/master/
+            # api_reference/cxx_api_doc/Config/CPUConfig.html
+            paddle_ocr = PaddleOCR(
+                use_angle_cls=True,
+                use_gpu=gpu_available,
+                lang=language,
+                enable_mkldnn=True,
+                show_log=False,
+            )
+        except AttributeError:
+            paddle_ocr = PaddleOCR(
+                use_angle_cls=True,
+                use_gpu=gpu_available,
+                lang=language,
+                enable_mkldnn=False,
+                show_log=False,
+            )
     return paddle_ocr
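To show how the cached agent above is consumed, here is a small sketch that mirrors the paddle branch added to `tables.py` in this same commit; the image path is illustrative only.

```python
import numpy as np
from PIL import Image

from unstructured_inference.models import paddle_ocr

image = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB")

# load_agent() stores the PaddleOCR instance in a module-level global,
# so repeated calls reuse the same (possibly GPU-backed) agent.
result = paddle_ocr.load_agent().ocr(np.array(image), cls=True)

tokens = []
for page in result:
    for line in page:
        # line[0] is the detected quadrilateral; line[1] is (text, confidence)
        xs = [point[0] for point in line[0]]
        ys = [point[1] for point in line[0]]
        tokens.append({"bbox": [min(xs), min(ys), max(xs), max(ys)], "text": line[1][0]})

print(tokens[:3])
```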

unstructured_inference/models/tables.py

Lines changed: 42 additions & 43 deletions
@@ -1,7 +1,7 @@
 # https://github.com/microsoft/table-transformer/blob/main/src/inference.py
 # https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Table%20Transformer/Using_Table_Transformer_for_table_detection_and_table_structure_recognition.ipynb
 import logging
-import platform
+import os
 import xml.etree.ElementTree as ET
 from collections import defaultdict
 from pathlib import Path
@@ -56,49 +56,48 @@ def initialize(
 
     def get_tokens(self, x: Image):
         """Get OCR tokens from either paddleocr or tesseract"""
-        if platform.machine() == "x86_64":
-            try:
-                from unstructured_inference.models import paddle_ocr
-
-                paddle_result = paddle_ocr.load_agent().ocr(np.array(x), cls=True)
-
-                tokens = []
-                for idx in range(len(paddle_result)):
-                    res = paddle_result[idx]
-                    for line in res:
-                        xmin = min([i[0] for i in line[0]])
-                        ymin = min([i[1] for i in line[0]])
-                        xmax = max([i[0] for i in line[0]])
-                        ymax = max([i[1] for i in line[0]])
-                        tokens.append({"bbox": [xmin, ymin, xmax, ymax], "text": line[1][0]})
-                return tokens
-            except ModuleNotFoundError:
-                logging.warning(
-                    "No module named 'unstructured_paddleocr', falling back to tesseract",
-                )
-                pass
-
-        ocr_df: pd.DataFrame = pytesseract.image_to_data(
-            x,
-            output_type="data.frame",
-        )
-
-        ocr_df = ocr_df.dropna()
-
-        tokens = []
-        for idtx in ocr_df.itertuples():
-            tokens.append(
-                {
-                    "bbox": [
-                        idtx.left,
-                        idtx.top,
-                        idtx.left + idtx.width,
-                        idtx.top + idtx.height,
-                    ],
-                    "text": idtx.text,
-                },
+        table_ocr = os.getenv("TABLE_OCR", "tesseract").lower()
+        if table_ocr not in ["paddle", "tesseract"]:
+            raise ValueError(
+                "Environment variable TABLE_OCR must be set to 'tesseract' or 'paddle'.",
+            )
+        if table_ocr == "paddle":
+            from unstructured_inference.models import paddle_ocr
+
+            paddle_result = paddle_ocr.load_agent().ocr(np.array(x), cls=True)
+
+            tokens = []
+            for idx in range(len(paddle_result)):
+                res = paddle_result[idx]
+                for line in res:
+                    xmin = min([i[0] for i in line[0]])
+                    ymin = min([i[1] for i in line[0]])
+                    xmax = max([i[0] for i in line[0]])
+                    ymax = max([i[1] for i in line[0]])
+                    tokens.append({"bbox": [xmin, ymin, xmax, ymax], "text": line[1][0]})
+            return tokens
+        else:
+            ocr_df: pd.DataFrame = pytesseract.image_to_data(
+                x,
+                output_type="data.frame",
             )
-        return tokens
+
+            ocr_df = ocr_df.dropna()
+
+            tokens = []
+            for idtx in ocr_df.itertuples():
+                tokens.append(
+                    {
+                        "bbox": [
+                            idtx.left,
+                            idtx.top,
+                            idtx.left + idtx.width,
+                            idtx.top + idtx.height,
+                        ],
+                        "text": idtx.text,
+                    },
+                )
+            return tokens
 
     def run_prediction(self, x: Image):
         """Predict table structure"""

0 commit comments