Skip to content

Commit bfb90e3

Browse files
authored
chore: skip paddle unittests local for mac (#214)
## Summary Paddle still hangs on Mac, so the unit tests that depend on it fail or hang; skip those tests when running `make test` locally. * Added a `@pytest.mark.skipif(skip_outside_ci)` check to every test that uses paddle. ## Test Run `make test` on an M1 chip; the tests pass (though coverage drops from 95% to 93%).
1 parent 173f633 commit bfb90e3

File tree

7 files changed

+60
-11
lines changed

7 files changed

+60
-11
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ jobs:
9191
- name: Test
9292
run: |
9393
source .venv/bin/activate
94-
make test
94+
CI=true make test
9595
make check-coverage
9696
9797
test_ingest:

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.5.29-dev0
2+
3+
* fix paddle unit tests: skip them outside CI, since `make test` fails locally because paddle doesn't work on M1/M2 chips
4+
15
## 0.5.28
26

37
* add env variable `ENTIRE_PAGE_OCR` to specify using paddle or tesseract on entire page OCR

Makefile

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,9 @@ install-detectron2:
3434

3535
.PHONY: install-paddleocr
3636
install-paddleocr:
37-
pip install paddlepaddle
38-
pip install paddlepaddle-gpu
39-
pip install "unstructured.PaddleOCR"
37+
pip install --no-cache-dir paddlepaddle
38+
pip install --no-cache-dir paddlepaddle-gpu
39+
pip install --no-cache-dir "unstructured.PaddleOCR"
4040

4141
.PHONY: install-test
4242
install-test: install-base
@@ -62,14 +62,16 @@ pip-compile:
6262
# Test and Lint #
6363
#################
6464

65+
export CI ?= false
66+
6567
## test: runs all unittests
6668
.PHONY: test
6769
test:
68-
PYTHONPATH=. pytest -m "not slow" test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing
70+
PYTHONPATH=. CI=$(CI) pytest -m "not slow" test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing
6971

7072
.PHONY: test-slow
7173
test-slow:
72-
PYTHONPATH=. pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing
74+
PYTHONPATH=. CI=$(CI) pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing
7375

7476
## check: runs linters (includes tests)
7577
.PHONY: check

test_unstructured_inference/inference/test_layout.py

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
import os.path
23
import tempfile
34
from functools import partial
@@ -17,6 +18,8 @@
1718
UnstructuredObjectDetectionModel,
1819
)
1920

21+
skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"}
22+
2023

2124
@pytest.fixture()
2225
def mock_image():
@@ -158,9 +161,9 @@ def join(self):
158161
pass
159162

160163

161-
@pytest.mark.parametrize("entire_page_ocr", ["paddle", "tesseract"])
162-
def test_get_page_elements_with_ocr(monkeypatch, entire_page_ocr):
163-
monkeypatch.setenv("ENTIRE_PAGE_OCR", entire_page_ocr)
164+
@pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI")
165+
def test_get_page_elements_with_paddle_ocr(monkeypatch):
166+
monkeypatch.setenv("ENTIRE_PAGE_OCR", "paddle")
164167
text_block = layout.TextRegion(2, 4, 6, 8, text=None)
165168
image_block = layout.ImageTextRegion(8, 14, 16, 18)
166169
doc_initial_layout = [text_block, image_block]
@@ -186,7 +189,38 @@ def test_get_page_elements_with_ocr(monkeypatch, entire_page_ocr):
186189
detection_model=MockLayoutModel(doc_final_layout),
187190
# Note(yuming): there are different language codes for the same language
188191
# between paddle and tesseract
189-
ocr_languages="en" if entire_page_ocr == "paddle" else "eng",
192+
ocr_languages="en",
193+
)
194+
page.get_elements_with_detection_model()
195+
196+
assert str(page) == "\n\nAn Even Catchier Title"
197+
198+
199+
def test_get_page_elements_with_tesseract_ocr(monkeypatch):
200+
monkeypatch.setenv("ENTIRE_PAGE_OCR", "tesseract")
201+
text_block = layout.TextRegion(2, 4, 6, 8, text=None)
202+
image_block = layout.ImageTextRegion(8, 14, 16, 18)
203+
doc_initial_layout = [text_block, image_block]
204+
text_layoutelement = layoutelement.LayoutElement(
205+
2,
206+
4,
207+
6,
208+
8,
209+
text=None,
210+
type="UncategorizedText",
211+
)
212+
image_layoutelement = layoutelement.LayoutElement(8, 14, 16, 18, text=None, type="Image")
213+
doc_final_layout = [text_layoutelement, image_layoutelement]
214+
215+
monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
216+
monkeypatch.setattr(elements, "ocr", lambda *args, **kwargs: "An Even Catchier Title")
217+
218+
image = Image.fromarray(np.random.randint(12, 14, size=(40, 10, 3)), mode="RGB")
219+
page = layout.PageLayout(
220+
number=0,
221+
image=image,
222+
layout=doc_initial_layout,
223+
detection_model=MockLayoutModel(doc_final_layout),
190224
)
191225
page.get_elements_with_detection_model()
192226

test_unstructured_inference/models/test_tables.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import os
2+
13
import pytest
24
from transformers.models.table_transformer.modeling_table_transformer import (
35
TableTransformerDecoder,
@@ -6,6 +8,8 @@
68
import unstructured_inference.models.table_postprocess as postprocess
79
from unstructured_inference.models import tables
810

11+
skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"}
12+
913

1014
@pytest.mark.parametrize(
1115
"model_path",
@@ -346,6 +350,7 @@ def test_table_prediction_tesseract():
346350
) in prediction
347351

348352

353+
@pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI")
349354
def test_table_prediction_paddle(monkeypatch):
350355
monkeypatch.setenv("TABLE_OCR", "paddle")
351356
table_model = tables.UnstructuredTableTransformerModel()

test_unstructured_inference/test_elements.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import os
23
from random import randint
34
from unittest.mock import PropertyMock, patch
45

@@ -7,6 +8,8 @@
78

89
from unstructured_inference.inference import elements
910

11+
skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"}
12+
1013

1114
def intersect_brute(rect1, rect2):
1215
return any(
@@ -188,6 +191,7 @@ def test_intersection_over_min(
188191
)
189192

190193

194+
@pytest.mark.skipif(skip_outside_ci, reason="Skipping paddle test run outside of CI")
191195
def test_ocr_paddle(monkeypatch, caplog):
192196
monkeypatch.setenv("ENTIRE_PAGE_OCR", "paddle")
193197
image = Image.new("RGB", (100, 100), (255, 255, 255))
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.28" # pragma: no cover
1+
__version__ = "0.5.29-dev0" # pragma: no cover

0 commit comments

Comments
 (0)