|
7 | 7 | from pathlib import Path |
8 | 8 | from typing import List, Optional, Union |
9 | 9 |
|
| 10 | +import cv2 |
10 | 11 | import numpy as np |
11 | 12 | import pandas as pd |
12 | 13 | import pytesseract |
|
17 | 18 | from unstructured_inference.config import inference_config |
18 | 19 | from unstructured_inference.logger import logger |
19 | 20 | from unstructured_inference.models.table_postprocess import Rect |
| 21 | +from unstructured_inference.models.tesseract import ( |
| 22 | + TESSERACT_TEXT_HEIGHT, |
| 23 | +) |
20 | 24 | from unstructured_inference.models.unstructuredmodel import UnstructuredModel |
21 | 25 | from unstructured_inference.utils import pad_image_with_background_color |
22 | 26 |
|
@@ -79,23 +83,45 @@ def get_tokens(self, x: Image): |
79 | 83 | ymax = max([i[1] for i in line[0]]) |
80 | 84 | tokens.append({"bbox": [xmin, ymin, xmax, ymax], "text": line[1][0]}) |
81 | 85 | else: |
| 86 | + zoom = 1 |
| 87 | + |
82 | 88 | logger.info("Processing table OCR with tesseract...") |
83 | 89 | ocr_df: pd.DataFrame = pytesseract.image_to_data( |
84 | 90 | x, |
85 | 91 | output_type="data.frame", |
86 | 92 | ) |
87 | | - |
88 | 93 | ocr_df = ocr_df.dropna() |
89 | 94 |
|
| 95 | + # tesseract performance degrades when the text height is out of the preferred zone so we |
| 96 | + # zoom the image (in or out depending on estimated text height) for optimum OCR results |
| 97 | + # but this needs to be evaluated based on actual use case as the optimum scaling also |
| 98 | +                # depends on the type of characters (font, language, etc); be careful about this
| 99 | + # functionality |
| 100 | + text_height = ocr_df[TESSERACT_TEXT_HEIGHT].quantile( |
| 101 | + inference_config.TESSERACT_TEXT_HEIGHT_QUANTILE, |
| 102 | + ) |
| 103 | + if ( |
| 104 | + text_height < inference_config.TESSERACT_MIN_TEXT_HEIGHT |
| 105 | + or text_height > inference_config.TESSERACT_MAX_TEXT_HEIGHT |
| 106 | + ): |
| 107 | +                # rounding avoids unnecessary precision and potential numerical issues associated
| 108 | + # with numbers very close to 1 inside cv2 image processing |
| 109 | + zoom = np.round(inference_config.TESSERACT_OPTIMUM_TEXT_HEIGHT / text_height, 1) |
| 110 | + ocr_df = pytesseract.image_to_data( |
| 111 | + zoom_image(x, zoom), |
| 112 | + output_type="data.frame", |
| 113 | + ) |
| 114 | + ocr_df = ocr_df.dropna() |
| 115 | + |
90 | 116 | tokens = [] |
91 | 117 | for idtx in ocr_df.itertuples(): |
92 | 118 | tokens.append( |
93 | 119 | { |
94 | 120 | "bbox": [ |
95 | | - idtx.left, |
96 | | - idtx.top, |
97 | | - idtx.left + idtx.width, |
98 | | - idtx.top + idtx.height, |
| 121 | + idtx.left / zoom, |
| 122 | + idtx.top / zoom, |
| 123 | + (idtx.left + idtx.width) / zoom, |
| 124 | + (idtx.top + idtx.height) / zoom, |
99 | 125 | ], |
100 | 126 | "text": idtx.text, |
101 | 127 | }, |
@@ -688,3 +714,21 @@ def cells_to_html(cells): |
688 | 714 | tcell.text = cell["cell text"] |
689 | 715 |
|
690 | 716 | return str(ET.tostring(table, encoding="unicode", short_empty_elements=False)) |
| 717 | + |
| 718 | + |
def zoom_image(image: Image, zoom: float) -> Image:
    """Scale an image by the given zoom factor using cv2 for OCR preprocessing.

    The scaled image is post processed by dilation then erosion to improve edge
    sharpness for OCR tasks.

    Parameters
    ----------
    image : PIL.Image
        Input image, assumed to be in RGB channel order (PIL's convention).
    zoom : float
        Scale factor applied to both axes; > 1 enlarges, < 1 shrinks.

    Returns
    -------
    PIL.Image
        The rescaled image, in RGB channel order.
    """
    # cv2 works in BGR order, so convert on the way in; cubic interpolation keeps
    # glyph edges smoother than nearest/linear when enlarging text for OCR
    new_image = cv2.resize(
        cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR),
        None,
        fx=zoom,
        fy=zoom,
        interpolation=cv2.INTER_CUBIC,
    )

    # NOTE(review): a 1x1 kernel makes dilate/erode no-ops; a larger kernel
    # (e.g. (2, 2)) is required for the sharpening described in the docstring --
    # kept as-is pending evaluation on real OCR inputs
    kernel = np.ones((1, 1), np.uint8)
    new_image = cv2.dilate(new_image, kernel, iterations=1)
    new_image = cv2.erode(new_image, kernel, iterations=1)

    # bug fix: convert back from BGR to RGB before handing the array to PIL,
    # otherwise the returned image has its red and blue channels swapped
    return Image.fromarray(cv2.cvtColor(new_image, cv2.COLOR_BGR2RGB))
0 commit comments