1+ import os
12import os .path
23import tempfile
34from functools import partial
1718 UnstructuredObjectDetectionModel ,
1819)
1920
21+ skip_outside_ci = os .getenv ("CI" , "" ).lower () in {"" , "false" , "f" , "0" }
22+
2023
2124@pytest .fixture ()
2225def mock_image ():
@@ -158,9 +161,9 @@ def join(self):
158161 pass
159162
160163
161- @pytest .mark .parametrize ( "entire_page_ocr" , [ " paddle" , "tesseract" ] )
162- def test_get_page_elements_with_ocr (monkeypatch , entire_page_ocr ):
163- monkeypatch .setenv ("ENTIRE_PAGE_OCR" , entire_page_ocr )
164+ @pytest .mark .skipif ( skip_outside_ci , reason = "Skipping paddle test run outside of CI" )
165+ def test_get_page_elements_with_paddle_ocr (monkeypatch ):
166+ monkeypatch .setenv ("ENTIRE_PAGE_OCR" , "paddle" )
164167 text_block = layout .TextRegion (2 , 4 , 6 , 8 , text = None )
165168 image_block = layout .ImageTextRegion (8 , 14 , 16 , 18 )
166169 doc_initial_layout = [text_block , image_block ]
@@ -186,7 +189,38 @@ def test_get_page_elements_with_ocr(monkeypatch, entire_page_ocr):
186189 detection_model = MockLayoutModel (doc_final_layout ),
187190 # Note(yuming): there are different language codes for the same language
188191 # between paddle and tesseract
189- ocr_languages = "en" if entire_page_ocr == "paddle" else "eng" ,
192+ ocr_languages = "en" ,
193+ )
194+ page .get_elements_with_detection_model ()
195+
196+ assert str (page ) == "\n \n An Even Catchier Title"
197+
198+
199+ def test_get_page_elements_with_tesseract_ocr (monkeypatch ):
200+ monkeypatch .setenv ("ENTIRE_PAGE_OCR" , "tesseract" )
201+ text_block = layout .TextRegion (2 , 4 , 6 , 8 , text = None )
202+ image_block = layout .ImageTextRegion (8 , 14 , 16 , 18 )
203+ doc_initial_layout = [text_block , image_block ]
204+ text_layoutelement = layoutelement .LayoutElement (
205+ 2 ,
206+ 4 ,
207+ 6 ,
208+ 8 ,
209+ text = None ,
210+ type = "UncategorizedText" ,
211+ )
212+ image_layoutelement = layoutelement .LayoutElement (8 , 14 , 16 , 18 , text = None , type = "Image" )
213+ doc_final_layout = [text_layoutelement , image_layoutelement ]
214+
215+ monkeypatch .setattr (detectron2 , "is_detectron2_available" , lambda * args : True )
216+ monkeypatch .setattr (elements , "ocr" , lambda * args , ** kwargs : "An Even Catchier Title" )
217+
218+ image = Image .fromarray (np .random .randint (12 , 14 , size = (40 , 10 , 3 )), mode = "RGB" )
219+ page = layout .PageLayout (
220+ number = 0 ,
221+ image = image ,
222+ layout = doc_initial_layout ,
223+ detection_model = MockLayoutModel (doc_final_layout ),
190224 )
191225 page .get_elements_with_detection_model ()
192226
0 commit comments