Skip to content

Commit 47c116a

Browse files
rbiseck3 and cragwolfe authored
Fall back to pytesseract if paddleocr fails (#127)
* Fall back to pytesseract if paddleocr fails
* Add docstring and bump version
* Change new label in changelog
* Update changelog to fold current change into existing version
* Set the version to 0.5.2
* Add generated pycharm files to gitignore

---------

Co-authored-by: cragwolfe <[email protected]>
1 parent a8fefc2 commit 47c116a

File tree

3 files changed

+63
-48
lines changed

3 files changed

+63
-48
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,9 @@ venv.bak/
118118
# Rope project settings
119119
.ropeproject
120120

121+
# Pycharm
122+
.idea/
123+
121124
# mkdocs documentation
122125
/site
123126

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
* Combine inferred elements with extracted elements
44
* Add ruff to keep code consistent with unstructured
5+
* Fall back to tesseract for OCR tokens if paddleocr doesn't work
56

67
## 0.5.1
78

unstructured_inference/models/tables.py

Lines changed: 59 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -55,60 +55,71 @@ def initialize(
5555
)
5656
self.model.to(device)
5757

58+
def get_tokens(self, x: Image):
    """Get OCR tokens from either paddleocr or tesseract.

    On ``x86_64`` machines paddleocr is tried first. If importing or loading it
    raises ``ModuleNotFoundError``, a warning is logged and tesseract is used
    instead. On every other architecture tesseract is used directly.

    Args:
        x: The image to run OCR on.

    Returns:
        A list of dicts, one per recognised token, each of the form
        ``{"bbox": [xmin, ymin, xmax, ymax], "text": <token text>}``.
    """
    if platform.machine() == "x86_64":
        try:
            from unstructured_inference.models import paddle_ocr

            paddle_result = paddle_ocr.load_agent().ocr(np.array(x), cls=True)
        except ModuleNotFoundError:
            logging.warning(
                "No module named 'unstructured_paddleocr', falling back to tesseract",
            )
        else:
            tokens = []
            # NOTE(review): paddleocr is reported to yield ``None`` for a page on
            # which no text was detected; skip such pages instead of crashing
            # with ``TypeError`` while iterating them.
            for page in paddle_result or []:
                if not page:
                    continue
                for line in page:
                    # line[0] holds the corner points of the detected text box,
                    # line[1][0] the recognised text.
                    points = line[0]
                    xs = [point[0] for point in points]
                    ys = [point[1] for point in points]
                    tokens.append(
                        {"bbox": [min(xs), min(ys), max(xs), max(ys)], "text": line[1][0]},
                    )
            return tokens

    # Tesseract path: the image is upscaled by `zoom` before OCR, and the
    # resulting boxes are divided by `zoom` again further down.
    zoom = 6
    img = cv2.resize(
        cv2.cvtColor(np.array(x), cv2.COLOR_RGB2BGR),
        None,
        fx=zoom,
        fy=zoom,
        interpolation=cv2.INTER_CUBIC,
    )

    kernel = np.ones((1, 1), np.uint8)
    img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.erode(img, kernel, iterations=1)

    ocr_df: pd.DataFrame = pytesseract.image_to_data(
        Image.fromarray(img),
        output_type="data.frame",
    )

    # Drop every row that contains a missing value in any column.
    ocr_df = ocr_df.dropna()

    # Divide by `zoom` to undo the upscaling applied to the image above.
    return [
        {
            "bbox": [
                row.left / zoom,
                row.top / zoom,
                (row.left + row.width) / zoom,
                (row.top + row.height) / zoom,
            ],
            "text": row.text,
        }
        for row in ocr_df.itertuples()
    ]
115+
58116
def run_prediction(self, x: Image):
59117
"""Predict table structure"""
60118
with torch.no_grad():
61119
encoding = self.feature_extractor(x, return_tensors="pt").to(self.device)
62120
outputs_structure = self.model(**encoding)
63121

64-
if platform.machine() == "x86_64":
65-
from unstructured_inference.models import paddle_ocr
66-
67-
paddle_result = paddle_ocr.load_agent().ocr(np.array(x), cls=True)
68-
69-
tokens = []
70-
for idx in range(len(paddle_result)):
71-
res = paddle_result[idx]
72-
for line in res:
73-
xmin = min([i[0] for i in line[0]])
74-
ymin = min([i[1] for i in line[0]])
75-
xmax = max([i[0] for i in line[0]])
76-
ymax = max([i[1] for i in line[0]])
77-
tokens.append({"bbox": [xmin, ymin, xmax, ymax], "text": line[1][0]})
78-
else:
79-
zoom = 6
80-
img = cv2.resize(
81-
cv2.cvtColor(np.array(x), cv2.COLOR_RGB2BGR),
82-
None,
83-
fx=zoom,
84-
fy=zoom,
85-
interpolation=cv2.INTER_CUBIC,
86-
)
87-
88-
kernel = np.ones((1, 1), np.uint8)
89-
img = cv2.dilate(img, kernel, iterations=1)
90-
img = cv2.erode(img, kernel, iterations=1)
91-
92-
ocr_df: pd.DataFrame = pytesseract.image_to_data(
93-
Image.fromarray(img),
94-
output_type="data.frame",
95-
)
96-
97-
ocr_df = ocr_df.dropna()
98-
99-
tokens = []
100-
for idtx in ocr_df.itertuples():
101-
tokens.append(
102-
{
103-
"bbox": [
104-
idtx.left / zoom,
105-
idtx.top / zoom,
106-
(idtx.left + idtx.width) / zoom,
107-
(idtx.top + idtx.height) / zoom,
108-
],
109-
"text": idtx.text,
110-
},
111-
)
122+
tokens = self.get_tokens(x=x)
112123

113124
sorted(tokens, key=lambda x: x["bbox"][1] * 10000 + x["bbox"][0])
114125

0 commit comments

Comments
 (0)