Commit 5bc67b5
chore: support paddle with both cpu and gpu if it is installed (#207)
Addresses #200

### Summary

* Before this PR, we only used paddle if the platform was `x86_64`. Remove this platform check and assume that paddle and PaddleOCR are already installed on the user's platform.
* Introduce an environment variable `TABLE_OCR` that can be used to specify whether paddle or tesseract is used for table extraction; the default is tesseract.
* Add logic for the `use_gpu` parameter when setting up the PaddleOCR instance: if GPU devices are available, paddle will use the GPU.

### Test

* paddle on CPU: see the unit test `test_table_prediction_paddle`. The prediction quality is not good (you can use gdb to print the value of `prediction`), but the test proves that PaddleOCR runs in CI.
* paddle on GPU: on a GPU instance,
  * rerun the test with `PYTHONPATH=. pytest test_unstructured_inference/models/test_tables.py::test_table_prediction_paddle`, and
  * from another terminal run `nvidia-smi -l 1` to monitor GPU usage; you can see the python test consuming GPU memory.
1 parent 52c5bea commit 5bc67b5
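For reference, here is a minimal sketch of how the `TABLE_OCR` switch described above can be exercised end to end. The model name and sample image path are copied from the tests added in this commit; setting the variable via `os.environ` (instead of `monkeypatch.setenv`, as the tests do) is an assumption about how you run it, not part of the library API.

```python
import os

from PIL import Image

from unstructured_inference.models import tables

# Select the OCR backend used for table extraction; the default is "tesseract".
os.environ["TABLE_OCR"] = "paddle"

table_model = tables.UnstructuredTableTransformerModel()
table_model.initialize(model="microsoft/table-transformer-structure-recognition")

img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB")
prediction = table_model.predict(img)  # an HTML string describing the table structure
print(prediction)
```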

File tree

8 files changed (+127, -96 lines)

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,7 @@
+## 0.5.26
+
+* support paddle with both cpu and gpu and assumed it is pre-installed
+
 ## 0.5.25
 
 * fix a bug where `cells_to_html` doesn't handle cells spanning multiple rows properly

Makefile

Lines changed: 2 additions & 0 deletions
@@ -34,6 +34,8 @@ install-detectron2:
 
 .PHONY: install-paddleocr
 install-paddleocr:
+	pip install paddlepaddle
+	pip install paddlepaddle-gpu
 	pip install "unstructured.PaddleOCR"
 
 .PHONY: install-test

README.md

Lines changed: 8 additions & 4 deletions
@@ -36,17 +36,21 @@ tips on installing Detectron2 on Windows.
 
 ### PaddleOCR
 
-[PaddleOCR](https://github.com/Unstructured-IO/unstructured.PaddleOCR) is suggested for table processing for `x86_64` architectures.
-It **should not be installed under MacOS running Apple Silicon**.
+[PaddleOCR](https://github.com/Unstructured-IO/unstructured.PaddleOCR) is suggested for table processing. Please set
+the environment variable `TABLE_OCR`
+to `paddle` if you wish to use paddle for table processing instead of the default `tesseract`.
 
 PaddleOCR may be installed with:
 
 ```shell
-# x86_64 only!
+pip install paddlepaddle
 pip install "unstructured.PaddleOCR"
 ```
 
-If paddle is not available, OCR is handled by tesseract instead.
+We suggest that you install paddlepaddle-gpu with `pip install paddlepaddle-gpu` if you have GPU devices available, for better OCR performance.
+
+Please note that **paddlepaddle does not work on MacOS with Apple Silicon**. If you want it running on an Apple M1/M2 chip, we provide a custom wheel of paddlepaddle for the aarch64 architecture; you can install it with `pip install unstructured.paddlepaddle` and run it inside a Docker container.
 
 ### Repository
 
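As a quick check after installation (not part of this commit), the snippet below reports whether the GPU path in the new `load_agent()` would be taken. `paddle.device.cuda.device_count()` is the same call the commit adds; `paddle.utils.run_check()` is paddlepaddle's standard install self-test and is assumed to be available in your paddle version.

```python
import paddle

# Optional self-test shipped with paddlepaddle; compiles and runs a tiny program.
paddle.utils.run_check()

# The same check the new load_agent() uses to decide between CPU and GPU inference.
gpu_available = paddle.device.cuda.device_count() > 0
print(f"paddle {paddle.__version__}, GPU available: {gpu_available}")
```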

setup.py

Lines changed: 0 additions & 17 deletions
@@ -74,21 +74,4 @@ def load_text_from_file(filename: str):
     version=__version__,
     entry_points={},
     install_requires=load_requirements(),
-    extras_require={
-        "tables": [
-            'unstructured.PaddleOCR ; platform_machine=="x86_64"',
-            # NOTE(crag): workaround issue for error output below
-            # ERROR test_unstructured/partition/test_common.py - TypeError: Descriptors cannot not
-            # be created directly.
-            # If this call came from a _pb2.py file, your generated code is out of date and must be
-            # regenerated with protoc >= 3.19.0.
-            # If you cannot immediately regenerate your protos, some other possible workarounds are:
-            # 1. Downgrade the protobuf package to 3.20.x or lower.
-            # 2. Set PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python (but this will use pure-Python
-            #    parsing and will be much slower).
-            'protobuf<3.21 ; platform_machine=="x86_64"',
-            # NOTE(alan): Pin to get around error: undefined symbol: _dl_sym, version GLIBC_PRIVATE
-            'paddlepaddle>=2.4 ; platform_machine=="x86_64"',
-        ],
-    },
 )

test_unstructured_inference/models/test_tables.py

Lines changed: 40 additions & 27 deletions
@@ -1,5 +1,3 @@
-from unittest.mock import patch
-
 import pytest
 from transformers.models.table_transformer.modeling_table_transformer import (
     TableTransformerDecoder,
@@ -326,35 +324,50 @@ def test_align_rows(rows, bbox, output):
     assert postprocess.align_rows(rows, bbox) == output
 
 
-# TODO: break this test down so it doesn't account for nearly 8% of test coverage
-@pytest.mark.parametrize(
-    ("model_path", "platform_type"),
-    [
-        ("microsoft/table-transformer-structure-recognition", "arm64"),
-        ("microsoft/table-transformer-structure-recognition", "x86_64"),
-    ],
-)
-def test_table_prediction(model_path, platform_type):
-    with patch("platform.machine", return_value=platform_type):
+def test_table_prediction_tesseract():
+    table_model = tables.UnstructuredTableTransformerModel()
+    from PIL import Image
+
+    table_model.initialize(model="microsoft/table-transformer-structure-recognition")
+    img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB")
+    prediction = table_model.predict(img)
+    # assert rows spans two rows are detected
+    assert '<table><thead><th rowspan="2">' in prediction
+    # one of the safest rows to detect should be present
+    assert (
+        "<tr>"
+        "<td>Blind</td>"
+        "<td>5</td>"
+        "<td>1</td>"
+        "<td>4</td>"
+        "<td>34.5%, n=1</td>"
+        "<td>1199 sec, n=1</td>"
+        "</tr>"
+    ) in prediction
+
+
+def test_table_prediction_paddle(monkeypatch):
+    monkeypatch.setenv("TABLE_OCR", "paddle")
+    table_model = tables.UnstructuredTableTransformerModel()
+    from PIL import Image
+
+    table_model.initialize(model="microsoft/table-transformer-structure-recognition")
+    img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB")
+    prediction = table_model.predict(img)
+    # Note(yuming): lossen paddle table prediction output test since performance issue
+    # assert rows spans two rows are detected
+    assert '<table><thead><th rowspan="2">' in prediction
+
+
+def test_table_prediction_invalid_table_ocr(monkeypatch):
+    monkeypatch.setenv("TABLE_OCR", "invalid_table_ocr")
+    with pytest.raises(ValueError):
         table_model = tables.UnstructuredTableTransformerModel()
         from PIL import Image
 
-        table_model.initialize(model=model_path)
+        table_model.initialize(model="microsoft/table-transformer-structure-recognition")
         img = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB")
-        prediction = table_model.predict(img)
-        # assert rows spans two rows are detected
-        assert '<table><thead><th rowspan="2">' in prediction
-        # one of the safest rows to detect should be present
-        assert (
-            "<tr>"
-            "<td>Blind</td>"
-            "<td>5</td>"
-            "<td>1</td>"
-            "<td>4</td>"
-            "<td>34.5%, n=1</td>"
-            "<td>1199 sec, n=1</td>"
-            "</tr>"
-        ) in prediction
+        _ = table_model.predict(img)
 
 
 def test_intersect():
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.5.25" # pragma: no cover
+__version__ = "0.5.26" # pragma: no cover
Lines changed: 30 additions & 4 deletions
@@ -1,12 +1,38 @@
+import paddle
+from unstructured_paddleocr import PaddleOCR
+
 paddle_ocr = None # type: ignore
 
 
-def load_agent():
+def load_agent(language: str = "en"):
     """Loads the PaddleOCR agent as a global variable to ensure that we only load it once."""
 
-    from unstructured_paddleocr import PaddleOCR
+    # Disable signal handlers at C++ level upon failing
+    # ref: https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/
+    # disable_signal_handler_en.html#disable-signal-handler
+    paddle.disable_signal_handler()
+    # Use paddlepaddle-gpu if there is gpu device available
+    gpu_available = paddle.device.cuda.device_count() > 0
 
     global paddle_ocr
-    paddle_ocr = PaddleOCR(use_angle_cls=True, lang="en", mkl_dnn=True, show_log=False)
-
+    if paddle_ocr is None:
+        try:
+            # Enable MKL-DNN for paddle to speed up OCR if OS supports it
+            # ref: https://paddle-inference.readthedocs.io/en/master/
+            # api_reference/cxx_api_doc/Config/CPUConfig.html
+            paddle_ocr = PaddleOCR(
+                use_angle_cls=True,
+                use_gpu=gpu_available,
+                lang=language,
+                enable_mkldnn=True,
+                show_log=False,
+            )
+        except AttributeError:
+            paddle_ocr = PaddleOCR(
+                use_angle_cls=True,
+                use_gpu=gpu_available,
+                lang=language,
+                enable_mkldnn=False,
+                show_log=False,
+            )
     return paddle_ocr
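To show how the cached agent above is consumed, here is a small sketch that mirrors the paddle branch added to `tables.py` in this same commit; the image path is illustrative only.

```python
import numpy as np
from PIL import Image

from unstructured_inference.models import paddle_ocr

image = Image.open("./sample-docs/table-multi-row-column-cells.png").convert("RGB")

# load_agent() stores the PaddleOCR instance in a module-level global,
# so repeated calls reuse the same (possibly GPU-backed) agent.
result = paddle_ocr.load_agent().ocr(np.array(image), cls=True)

tokens = []
for page in result:
    for line in page:
        # line[0] is the detected quadrilateral; line[1] is (text, confidence)
        xs = [point[0] for point in line[0]]
        ys = [point[1] for point in line[0]]
        tokens.append({"bbox": [min(xs), min(ys), max(xs), max(ys)], "text": line[1][0]})

print(tokens[:3])
```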

unstructured_inference/models/tables.py

Lines changed: 42 additions & 43 deletions
@@ -1,7 +1,7 @@
 # https://github.com/microsoft/table-transformer/blob/main/src/inference.py
 # https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Table%20Transformer/Using_Table_Transformer_for_table_detection_and_table_structure_recognition.ipynb
 import logging
-import platform
+import os
 import xml.etree.ElementTree as ET
 from collections import defaultdict
 from pathlib import Path
@@ -56,49 +56,48 @@ def initialize(
 
     def get_tokens(self, x: Image):
         """Get OCR tokens from either paddleocr or tesseract"""
-        if platform.machine() == "x86_64":
-            try:
-                from unstructured_inference.models import paddle_ocr
-
-                paddle_result = paddle_ocr.load_agent().ocr(np.array(x), cls=True)
-
-                tokens = []
-                for idx in range(len(paddle_result)):
-                    res = paddle_result[idx]
-                    for line in res:
-                        xmin = min([i[0] for i in line[0]])
-                        ymin = min([i[1] for i in line[0]])
-                        xmax = max([i[0] for i in line[0]])
-                        ymax = max([i[1] for i in line[0]])
-                        tokens.append({"bbox": [xmin, ymin, xmax, ymax], "text": line[1][0]})
-                return tokens
-            except ModuleNotFoundError:
-                logging.warning(
-                    "No module named 'unstructured_paddleocr', falling back to tesseract",
-                )
-                pass
-
-        ocr_df: pd.DataFrame = pytesseract.image_to_data(
-            x,
-            output_type="data.frame",
-        )
-
-        ocr_df = ocr_df.dropna()
-
-        tokens = []
-        for idtx in ocr_df.itertuples():
-            tokens.append(
-                {
-                    "bbox": [
-                        idtx.left,
-                        idtx.top,
-                        idtx.left + idtx.width,
-                        idtx.top + idtx.height,
-                    ],
-                    "text": idtx.text,
-                },
+        table_ocr = os.getenv("TABLE_OCR", "tesseract").lower()
+        if table_ocr not in ["paddle", "tesseract"]:
+            raise ValueError(
+                "Environment variable TABLE_OCR must be set to 'tesseract' or 'paddle'.",
+            )
+        if table_ocr == "paddle":
+            from unstructured_inference.models import paddle_ocr
+
+            paddle_result = paddle_ocr.load_agent().ocr(np.array(x), cls=True)
+
+            tokens = []
+            for idx in range(len(paddle_result)):
+                res = paddle_result[idx]
+                for line in res:
+                    xmin = min([i[0] for i in line[0]])
+                    ymin = min([i[1] for i in line[0]])
+                    xmax = max([i[0] for i in line[0]])
+                    ymax = max([i[1] for i in line[0]])
+                    tokens.append({"bbox": [xmin, ymin, xmax, ymax], "text": line[1][0]})
+            return tokens
+        else:
+            ocr_df: pd.DataFrame = pytesseract.image_to_data(
+                x,
+                output_type="data.frame",
             )
-        return tokens
+
+            ocr_df = ocr_df.dropna()
+
+            tokens = []
+            for idtx in ocr_df.itertuples():
+                tokens.append(
+                    {
+                        "bbox": [
+                            idtx.left,
+                            idtx.top,
+                            idtx.left + idtx.width,
+                            idtx.top + idtx.height,
+                        ],
+                        "text": idtx.text,
+                    },
+                )
+            return tokens
 
     def run_prediction(self, x: Image):
         """Predict table structure"""

0 commit comments