Skip to content

Commit c4d3e8b

Browse files
authored
feat: add autoscaling for table images (#210)
Auto scale table images so that the text height is optimum for `tesseract` OCR inference. This functionality scales table images when the estimated mean text height falls outside the bounds given by the `inference_config` setup: table images with text height below `inference_config.TESSERACT_MIN_TEXT_HEIGHT` or above `inference_config.TESSERACT_MAX_TEXT_HEIGHT` are scaled so that the text height is at `inference_config.TESSERACT_OPTIMUM_TEXT_HEIGHT`. This PR resolves [CORE-1863](https://unstructured-ai.atlassian.net/browse/CORE-1863) ## test - this PR adds a unit test to confirm auto scale is triggered - test the tokens computed without zoom and with zoom with the attached image: with zoom the tokens should include the correct text "Japanese" in the table on the page. Without zoom (call get_tokens using main) we won't see this token and instead you might find a token that looks like "Inpanere". For this specific document it is best to set `TESSERACT_MIN_TEXT_HEIGHT` to 12. ![layout-parser-paper-with-table](https://github.com/Unstructured-IO/unstructured-inference/assets/647930/7963bba0-67cb-48ee-b338-52b1c2620fc0) [CORE-1863]: https://unstructured-ai.atlassian.net/browse/CORE-1863?atlOrigin=eyJpIjoiNWRkNTljNzYxNjVmNDY3MDlhMDU5Y2ZhYzA5YTRkZjUiLCJwIjoiZ2l0aHViLWNvbS1KU1cifQ
1 parent cb2aff2 commit c4d3e8b

File tree

10 files changed

+113
-11
lines changed

10 files changed

+113
-11
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.6.4
2+
3+
* add a function to automatically scale table crop images based on text height so the text height is optimum for `tesseract` OCR task
4+
* add the new image auto scaling parameters to `config.py`
5+
16
## 0.6.3
27

38
* fix a bug where padded table structure bounding boxes are not shifted back into the original image coordinates correctly

requirements/base.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ humanfriendly==10.0
4444
# via coloredlogs
4545
idna==3.4
4646
# via requests
47-
importlib-resources==6.0.1
47+
importlib-resources==6.1.0
4848
# via matplotlib
4949
iopath==0.1.10
5050
# via layoutparser

requirements/dev.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ importlib-metadata==6.8.0
101101
# jupyterlab
102102
# jupyterlab-server
103103
# nbconvert
104-
importlib-resources==6.0.1
104+
importlib-resources==6.1.0
105105
# via
106106
# -c requirements/base.txt
107107
# jsonschema
@@ -139,7 +139,7 @@ json5==0.9.14
139139
# via jupyterlab-server
140140
jsonpointer==2.4
141141
# via jsonschema
142-
jsonschema[format-nongpl]==4.19.0
142+
jsonschema[format-nongpl]==4.19.1
143143
# via
144144
# jupyter-events
145145
# jupyterlab-server

requirements/test.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ flake8
1212
flake8-docstrings
1313
mypy
1414
pytest-cov
15+
pytest-mock
1516
pdf2image>=1.16.2
1617
huggingface_hub>=0.11.1
1718
ruff

requirements/test.txt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,13 @@ pydocstyle==6.3.0
9797
pyflakes==3.1.0
9898
# via flake8
9999
pytest==7.4.2
100-
# via pytest-cov
100+
# via
101+
# pytest-cov
102+
# pytest-mock
101103
pytest-cov==4.1.0
102104
# via -r requirements/test.in
105+
pytest-mock==3.11.1
106+
# via -r requirements/test.in
103107
pyyaml==6.0.1
104108
# via
105109
# -c requirements/base.txt

test_unstructured_inference/models/test_tables.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import os
2+
from pathlib import Path
23

34
import numpy as np
45
import pytest
@@ -589,6 +590,21 @@ def test_cells_to_html():
589590
assert tables.cells_to_html(cells) == expected
590591

591592

593+
def test_auto_zoom(mocker):
    """Auto scaling should be triggered exactly once for the sample document."""
    zoom_spy = mocker.spy(tables, "zoom_image")
    model = tables.UnstructuredTableTransformerModel()
    model.initialize("microsoft/table-transformer-structure-recognition")
    sample_doc = (
        Path(__file__).resolve().parent.parent.parent
        / "sample-docs"
        / "layout-parser-paper-fast.jpg"
    )
    model.get_tokens(Image.open(sample_doc))
    assert zoom_spy.call_count == 1
606+
607+
592608
def test_padded_results_has_right_dimensions(table_transformer, example_image):
593609
str_class_name2idx = tables.get_class_map("structure")
594610
# a simpler mapping so we keep all structure in the returned objs below for test
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.3" # pragma: no cover
1+
__version__ = "0.6.4" # pragma: no cover

unstructured_inference/config.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,35 @@ def TABLE_IMAGE_BACKGROUND_PAD(self) -> int:
4444
The padding adds NO image data around an identified table bounding box; it simply adds white
4545
background around the image
4646
"""
47-
return self._get_int("TABLE_IMAGE_BACKGROUND_PAD", 0)
47+
return self._get_int("TABLE_IMAGE_BACKGROUND_PAD", 20)
48+
49+
@property
def TESSERACT_MIN_TEXT_HEIGHT(self) -> int:
    """Lower bound on acceptable text height (px) in tesseract OCR results.

    When the estimated text height falls below this value the image is
    scaled up and processed again.
    """
    return self._get_int("TESSERACT_MIN_TEXT_HEIGHT", 12)
57+
58+
@property
def TESSERACT_MAX_TEXT_HEIGHT(self) -> int:
    """Upper bound on acceptable text height (px) in tesseract OCR results.

    When the estimated text height rises above this value the image is
    scaled down and processed again.
    """
    return self._get_int("TESSERACT_MAX_TEXT_HEIGHT", 100)
66+
67+
@property
def TESSERACT_OPTIMUM_TEXT_HEIGHT(self) -> int:
    """Target text height (px) considered best for tesseract OCR."""
    return self._get_int("TESSERACT_OPTIMUM_TEXT_HEIGHT", 20)
71+
72+
@property
def TESSERACT_TEXT_HEIGHT_QUANTILE(self) -> float:
    """Quantile of the OCR height column used as the estimated text height."""
    return self._get_float("TESSERACT_TEXT_HEIGHT_QUANTILE", 0.5)
4876

4977
@property
5078
def TT_TABLE_CONF(self) -> float:

unstructured_inference/models/tables.py

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from pathlib import Path
88
from typing import List, Optional, Union
99

10+
import cv2
1011
import numpy as np
1112
import pandas as pd
1213
import pytesseract
@@ -17,6 +18,9 @@
1718
from unstructured_inference.config import inference_config
1819
from unstructured_inference.logger import logger
1920
from unstructured_inference.models.table_postprocess import Rect
21+
from unstructured_inference.models.tesseract import (
22+
TESSERACT_TEXT_HEIGHT,
23+
)
2024
from unstructured_inference.models.unstructuredmodel import UnstructuredModel
2125
from unstructured_inference.utils import pad_image_with_background_color
2226

@@ -79,23 +83,45 @@ def get_tokens(self, x: Image):
7983
ymax = max([i[1] for i in line[0]])
8084
tokens.append({"bbox": [xmin, ymin, xmax, ymax], "text": line[1][0]})
8185
else:
86+
zoom = 1
87+
8288
logger.info("Processing table OCR with tesseract...")
8389
ocr_df: pd.DataFrame = pytesseract.image_to_data(
8490
x,
8591
output_type="data.frame",
8692
)
87-
8893
ocr_df = ocr_df.dropna()
8994

95+
# tesseract performance degrades when the text height is out of the preferred zone so we
96+
# zoom the image (in or out depending on estimated text height) for optimum OCR results
97+
# but this needs to be evaluated based on actual use case as the optimum scaling also
98+
# depends on the type of characters (font, language, etc); be careful about this
99+
# functionality
100+
text_height = ocr_df[TESSERACT_TEXT_HEIGHT].quantile(
101+
inference_config.TESSERACT_TEXT_HEIGHT_QUANTILE,
102+
)
103+
if (
104+
text_height < inference_config.TESSERACT_MIN_TEXT_HEIGHT
105+
or text_height > inference_config.TESSERACT_MAX_TEXT_HEIGHT
106+
):
107+
# rounding avoids unnecessary precision and potential numerical issues associated
108+
# with numbers very close to 1 inside cv2 image processing
109+
zoom = np.round(inference_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1)
110+
ocr_df = pytesseract.image_to_data(
111+
zoom_image(x, zoom),
112+
output_type="data.frame",
113+
)
114+
ocr_df = ocr_df.dropna()
115+
90116
tokens = []
91117
for idtx in ocr_df.itertuples():
92118
tokens.append(
93119
{
94120
"bbox": [
95-
idtx.left,
96-
idtx.top,
97-
idtx.left + idtx.width,
98-
idtx.top + idtx.height,
121+
idtx.left / zoom,
122+
idtx.top / zoom,
123+
(idtx.left + idtx.width) / zoom,
124+
(idtx.top + idtx.height) / zoom,
99125
],
100126
"text": idtx.text,
101127
},
@@ -688,3 +714,21 @@ def cells_to_html(cells):
688714
tcell.text = cell["cell text"]
689715

690716
return str(ET.tostring(table, encoding="unicode", short_empty_elements=False))
717+
718+
719+
def zoom_image(image: Image, zoom: float) -> Image:
    """scale an image based on the zoom factor using cv2; the scaled image is post processed by
    dilation then erosion to improve edge sharpness for OCR tasks

    Parameters
    ----------
    image: source PIL image; assumed RGB (np.array on a PIL image yields RGB channel order)
    zoom: scale factor; non-positive values are invalid for cv2.resize and are
        treated as 1 (no scaling)

    Returns
    -------
    the scaled image as a PIL image in RGB channel order
    """
    if zoom <= 0:
        # a non-positive factor would make cv2.resize raise; fall back to no scaling
        zoom = 1
    new_image = cv2.resize(
        cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR),
        None,
        fx=zoom,
        fy=zoom,
        interpolation=cv2.INTER_CUBIC,
    )

    # NOTE(review): a (1, 1) kernel makes dilation followed by erosion an identity
    # transform; a larger kernel would be needed for any actual sharpening effect
    kernel = np.ones((1, 1), np.uint8)
    new_image = cv2.dilate(new_image, kernel, iterations=1)
    new_image = cv2.erode(new_image, kernel, iterations=1)

    # convert back from cv2's BGR order before building the PIL image; without this
    # the returned image would have its red and blue channels swapped
    return Image.fromarray(cv2.cvtColor(new_image, cv2.COLOR_BGR2RGB))

unstructured_inference/models/tesseract.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@
1616
os.environ["OMP_THREAD_LIMIT"] = "1"
1717

1818

19+
# column name of the text-height field in pytesseract/unstructured.pytesseract
# `image_to_data` output; this field name is defined by that library
TESSERACT_TEXT_HEIGHT = "height"
21+
22+
1923
def load_agent(languages: str = "eng"):
2024
"""Loads the Tesseract OCR agent as a global variable to ensure that we only load it once.
2125

0 commit comments

Comments
 (0)