Fall back to pytesseract if paddleocr fails (#127)

rbiseck3 · cragwolfe · web-flow · commit 47c116acacbc · 2023-06-21T19:45:29.000-04:00
* Fall back to pytesseract if paddleocr fails

* Add docstring and bump version

* change new label in changelog

* update changelog to fold current change into existing version

* Set the version to 0.5.2

* add generated pycharm files to gitignore

---------

Co-authored-by: cragwolfe <crag@unstructured.io>
diff --git a/.gitignore b/.gitignore
@@ -118,6 +118,9 @@ venv.bak/
 # Rope project settings
 .ropeproject
 
+# Pycharm
+.idea/
+
 # mkdocs documentation
 /site
 
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 * Combine inferred elements with extracted elements
 * Add ruff to keep code consistent with unstructured
+* Configure OCR token extraction to fall back to tesseract if paddleocr doesn't work
 
 ## 0.5.1
 
diff --git a/unstructured_inference/models/tables.py b/unstructured_inference/models/tables.py
@@ -55,60 +55,71 @@ def initialize(
             )
         self.model.to(device)
 
+    def get_tokens(self, x: Image):
+        """Return OCR tokens as "bbox"/"text" dicts; paddleocr on x86_64, else tesseract."""
+        if platform.machine() == "x86_64":
+            try:
+                from unstructured_inference.models import paddle_ocr
+
+                paddle_result = paddle_ocr.load_agent().ocr(np.array(x), cls=True)
+
+                tokens = []
+                for idx in range(len(paddle_result)):
+                    res = paddle_result[idx]
+                    for line in res:  # line[0]: box corner points, line[1][0]: text
+                        xmin = min([i[0] for i in line[0]])
+                        ymin = min([i[1] for i in line[0]])
+                        xmax = max([i[0] for i in line[0]])
+                        ymax = max([i[1] for i in line[0]])
+                        tokens.append({"bbox": [xmin, ymin, xmax, ymax], "text": line[1][0]})
+                return tokens
+            except ModuleNotFoundError:  # paddle path unavailable; tesseract code below runs
+                logging.warning(
+                    "No module named 'unstructured_paddleocr', falling back to tesseract",
+                )
+                pass
+        zoom = 6  # upscale factor for the tesseract input; bboxes are divided by it below
+        img = cv2.resize(
+            cv2.cvtColor(np.array(x), cv2.COLOR_RGB2BGR),
+            None,
+            fx=zoom,
+            fy=zoom,
+            interpolation=cv2.INTER_CUBIC,
+        )
+
+        kernel = np.ones((1, 1), np.uint8)  # NOTE(review): 1x1 kernel, dilate/erode likely no-ops
+        img = cv2.dilate(img, kernel, iterations=1)
+        img = cv2.erode(img, kernel, iterations=1)
+
+        ocr_df: pd.DataFrame = pytesseract.image_to_data(
+            Image.fromarray(img),
+            output_type="data.frame",
+        )
+
+        ocr_df = ocr_df.dropna()  # drop rows that contain any missing value
+
+        tokens = []
+        for idtx in ocr_df.itertuples():
+            tokens.append(
+                {
+                    "bbox": [  # [xmin, ymin, xmax, ymax], scaled back down by zoom
+                        idtx.left / zoom,
+                        idtx.top / zoom,
+                        (idtx.left + idtx.width) / zoom,
+                        (idtx.top + idtx.height) / zoom,
+                    ],
+                    "text": idtx.text,
+                },
+            )
+        return tokens
+
     def run_prediction(self, x: Image):
         """Predict table structure"""
         with torch.no_grad():
             encoding = self.feature_extractor(x, return_tensors="pt").to(self.device)
             outputs_structure = self.model(**encoding)
 
-        if platform.machine() == "x86_64":
-            from unstructured_inference.models import paddle_ocr
-
-            paddle_result = paddle_ocr.load_agent().ocr(np.array(x), cls=True)
-
-            tokens = []
-            for idx in range(len(paddle_result)):
-                res = paddle_result[idx]
-                for line in res:
-                    xmin = min([i[0] for i in line[0]])
-                    ymin = min([i[1] for i in line[0]])
-                    xmax = max([i[0] for i in line[0]])
-                    ymax = max([i[1] for i in line[0]])
-                    tokens.append({"bbox": [xmin, ymin, xmax, ymax], "text": line[1][0]})
-        else:
-            zoom = 6
-            img = cv2.resize(
-                cv2.cvtColor(np.array(x), cv2.COLOR_RGB2BGR),
-                None,
-                fx=zoom,
-                fy=zoom,
-                interpolation=cv2.INTER_CUBIC,
-            )
-
-            kernel = np.ones((1, 1), np.uint8)
-            img = cv2.dilate(img, kernel, iterations=1)
-            img = cv2.erode(img, kernel, iterations=1)
-
-            ocr_df: pd.DataFrame = pytesseract.image_to_data(
-                Image.fromarray(img),
-                output_type="data.frame",
-            )
-
-            ocr_df = ocr_df.dropna()
-
-            tokens = []
-            for idtx in ocr_df.itertuples():
-                tokens.append(
-                    {
-                        "bbox": [
-                            idtx.left / zoom,
-                            idtx.top / zoom,
-                            (idtx.left + idtx.width) / zoom,
-                            (idtx.top + idtx.height) / zoom,
-                        ],
-                        "text": idtx.text,
-                    },
-                )
+        tokens = self.get_tokens(x=x)
 
         sorted(tokens, key=lambda x: x["bbox"][1] * 10000 + x["bbox"][0])