chore: skip paddle unittests local for mac (#214)

yuming-long · web-flow · commit bfb90e32df86 · 2023-09-19T18:07:36.000-04:00
## Summary

Paddle still hangs on Mac, so the unit tests that depend on it fail or hang.
Skip those tests when running `make test` locally.
* Added a `@pytest.mark.skipif(skip_outside_ci)` check to every test that
uses paddle

## Test

Run `make test` on an M1 chip and the tests will pass (though coverage will
drop from 95% to 93%)
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -91,7 +91,7 @@ jobs:
     - name: Test
       run: |
         source .venv/bin/activate
-        make test
+        CI=true make test
         make check-coverage
 
   test_ingest:
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.5.29-dev0
+
+* fix paddle unit tests where `make test` fails since paddle doesn't work on M1/M2 chip locally
+
 ## 0.5.28
 
 * add env variable `ENTIRE_PAGE_OCR` to specify using paddle or tesseract on entire page OCR
diff --git a/Makefile b/Makefile
@@ -34,9 +34,9 @@ install-detectron2:
 
 .PHONY: install-paddleocr
 install-paddleocr:
-	pip install paddlepaddle
-	pip install paddlepaddle-gpu
-	pip install "unstructured.PaddleOCR"
+	pip install --no-cache-dir paddlepaddle
+	pip install --no-cache-dir paddlepaddle-gpu
+	pip install --no-cache-dir "unstructured.PaddleOCR"
 
 .PHONY: install-test
 install-test: install-base
@@ -62,14 +62,16 @@ pip-compile:
 # Test and Lint #
 #################
 
+export CI ?= false
+
 ## test:                    runs all unittests
 .PHONY: test
 test:
-	PYTHONPATH=. pytest -m "not slow" test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing
+	PYTHONPATH=. CI=$(CI) pytest -m "not slow" test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing
 
 .PHONY: test-slow
 test-slow:
-	PYTHONPATH=. pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing
+	PYTHONPATH=. CI=$(CI) pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing
 
 ## check:                   runs linters (includes tests)
 .PHONY: check
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -1,3 +1,4 @@
+import os
 import os.path
 import tempfile
 from functools import partial
@@ -17,6 +18,8 @@
     UnstructuredObjectDetectionModel,
 )
 
+skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"}
+
 
 @pytest.fixture()
 def mock_image():
@@ -158,9 +161,9 @@ def join(self):
         pass
 
 
-@pytest.mark.parametrize("entire_page_ocr", ["paddle", "tesseract"])
-def test_get_page_elements_with_ocr(monkeypatch, entire_page_ocr):
-    monkeypatch.setenv("ENTIRE_PAGE_OCR", entire_page_ocr)
+@pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI")
+def test_get_page_elements_with_paddle_ocr(monkeypatch):
+    monkeypatch.setenv("ENTIRE_PAGE_OCR", "paddle")
     text_block = layout.TextRegion(2, 4, 6, 8, text=None)
     image_block = layout.ImageTextRegion(8, 14, 16, 18)
     doc_initial_layout = [text_block, image_block]
@@ -186,7 +189,38 @@ def test_get_page_elements_with_ocr(monkeypatch, entire_page_ocr):
         detection_model=MockLayoutModel(doc_final_layout),
         # Note(yuming): there are differnt language codes for same language
         # between paddle and tesseract
-        ocr_languages="en" if entire_page_ocr == "paddle" else "eng",
+        ocr_languages="en",
+    )
+    page.get_elements_with_detection_model()
+
+    assert str(page) == "\n\nAn Even Catchier Title"
+
+
+def test_get_page_elements_with_tesseract_ocr(monkeypatch):
+    monkeypatch.setenv("ENTIRE_PAGE_OCR", "tesseract")
+    text_block = layout.TextRegion(2, 4, 6, 8, text=None)
+    image_block = layout.ImageTextRegion(8, 14, 16, 18)
+    doc_initial_layout = [text_block, image_block]
+    text_layoutelement = layoutelement.LayoutElement(
+        2,
+        4,
+        6,
+        8,
+        text=None,
+        type="UncategorizedText",
+    )
+    image_layoutelement = layoutelement.LayoutElement(8, 14, 16, 18, text=None, type="Image")
+    doc_final_layout = [text_layoutelement, image_layoutelement]
+
+    monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
+    monkeypatch.setattr(elements, "ocr", lambda *args, **kwargs: "An Even Catchier Title")
+
+    image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB")
+    page = layout.PageLayout(
+        number=0,
+        image=image,
+        layout=doc_initial_layout,
+        detection_model=MockLayoutModel(doc_final_layout),
     )
     page.get_elements_with_detection_model()
 
diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py
@@ -1,3 +1,5 @@
+import os
+
 import pytest
 from transformers.models.table_transformer.modeling_table_transformer import (
     TableTransformerDecoder,
@@ -6,6 +8,8 @@
 import unstructured_inference.models.table_postprocess as postprocess
 from unstructured_inference.models import tables
 
+skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"}
+
 
 @pytest.mark.parametrize(
     "model_path",
@@ -346,6 +350,7 @@ def test_table_prediction_tesseract():
     ) in prediction
 
 
+@pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI")
 def test_table_prediction_paddle(monkeypatch):
     monkeypatch.setenv("TABLE_OCR", "paddle")
     table_model = tables.UnstructuredTableTransformerModel()
diff --git a/test_unstructured_inference/test_elements.py b/test_unstructured_inference/test_elements.py
@@ -1,4 +1,5 @@
 import logging
+import os
 from random import randint
 from unittest.mock import PropertyMock, patch
 
@@ -7,6 +8,8 @@
 
 from unstructured_inference.inference import elements
 
+skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"}
+
 
 def intersect_brute(rect1, rect2):
     return any(
@@ -188,6 +191,7 @@ def test_intersection_over_min(
     )
 
 
+@pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI")
 def test_ocr_paddle(monkeypatch, caplog):
     monkeypatch.setenv("ENTIRE_PAGE_OCR", "paddle")
     image = Image.new("RGB", (100, 100), (255, 255, 255))
diff --git a/unstructured_inference/__version__.py b/unstructured_inference/__version__.py
@@ -1 +1 @@
-__version__ = "0.5.28"  # pragma: no cover
+__version__ = "0.5.29-dev0"  # pragma: no cover

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.5.28" # pragma: no cover`
	`1`	`+__version__ = "0.5.29-dev0" # pragma: no cover`