88from PIL import Image
99
1010import unstructured_inference .models .base as models
11- from unstructured_inference .inference import elements , layout
11+ from unstructured_inference .inference import elements , layout , layoutelement
1212from unstructured_inference .models import detectron2 , tesseract
1313
1414
@@ -18,14 +18,30 @@ def mock_image():
1818
1919
@pytest.fixture()
def mock_initial_layout():
    """Return a two-item layout of embedded-text regions: narrative first, title second."""
    narrative = layout.EmbeddedTextRegion(2, 4, 6, 8, text="A very repetitive narrative. " * 10)
    heading = layout.EmbeddedTextRegion(1, 2, 3, 4, text="A Catchy Title")
    return [narrative, heading]
2727
2828
@pytest.fixture()
def mock_final_layout():
    """Return a two-item layout of typed ``LayoutElement`` objects.

    The coordinates and text match the embedded-text fixture in this module;
    each element additionally carries an element ``type``
    ("NarrativeText" and "Title" respectively).
    """
    narrative = layoutelement.LayoutElement(
        2,
        4,
        6,
        8,
        text="A very repetitive narrative. " * 10,
        type="NarrativeText",
    )
    heading = layoutelement.LayoutElement(1, 2, 3, 4, text="A Catchy Title", type="Title")
    return [narrative, heading]
43+
44+
2945def test_pdf_page_converts_images_to_array (mock_image ):
3046 page = layout .PageLayout (number = 0 , image = mock_image , layout = [])
3147 assert page .image_array is None
@@ -62,13 +78,13 @@ def initialize(self, *args, **kwargs):
6278 pass
6379
6480
65- def test_get_page_elements (monkeypatch , mock_page_layout ):
81+ def test_get_page_elements (monkeypatch , mock_final_layout ):
6682 image = np .random .randint (12 , 24 , (40 , 40 ))
6783 page = layout .PageLayout (
6884 number = 0 ,
6985 image = image ,
70- layout = mock_page_layout ,
71- model = MockLayoutModel (mock_page_layout ),
86+ layout = mock_final_layout ,
87+ model = MockLayoutModel (mock_final_layout ),
7288 )
7389
7490 elements = page .get_elements_with_model (inplace = False )
@@ -94,7 +110,17 @@ def join(self):
94110def test_get_page_elements_with_ocr (monkeypatch ):
95111 text_block = layout .TextRegion (2 , 4 , 6 , 8 , text = None )
96112 image_block = layout .ImageTextRegion (8 , 14 , 16 , 18 )
97- doc_layout = [text_block , image_block ]
113+ doc_initial_layout = [text_block , image_block ]
114+ text_layoutelement = layoutelement .LayoutElement (
115+ 2 ,
116+ 4 ,
117+ 6 ,
118+ 8 ,
119+ text = None ,
120+ type = "UncategorizedText" ,
121+ )
122+ image_layoutelement = layoutelement .LayoutElement (8 , 14 , 16 , 18 , text = None , type = "Image" )
123+ doc_final_layout = [text_layoutelement , image_layoutelement ]
98124
99125 monkeypatch .setattr (detectron2 , "is_detectron2_available" , lambda * args : True )
100126 monkeypatch .setattr (elements , "ocr" , lambda * args , ** kwargs : "An Even Catchier Title" )
@@ -103,24 +129,24 @@ def test_get_page_elements_with_ocr(monkeypatch):
103129 page = layout .PageLayout (
104130 number = 0 ,
105131 image = image ,
106- layout = doc_layout ,
107- model = MockLayoutModel (doc_layout ),
132+ layout = doc_initial_layout ,
133+ model = MockLayoutModel (doc_final_layout ),
108134 )
109135 page .get_elements_with_model ()
110136
111137 assert str (page ) == "\n \n An Even Catchier Title"
112138
113139
114- def test_read_pdf (monkeypatch , mock_page_layout ):
140+ def test_read_pdf (monkeypatch , mock_initial_layout , mock_final_layout ):
115141 image = np .random .randint (12 , 24 , (40 , 40 ))
116142 images = [image , image ]
117143
118- layouts = [mock_page_layout , mock_page_layout ]
144+ layouts = [mock_initial_layout , mock_initial_layout ]
119145
120146 monkeypatch .setattr (
121147 models ,
122148 "UnstructuredDetectronModel" ,
123- partial (MockLayoutModel , layout = mock_page_layout ),
149+ partial (MockLayoutModel , layout = mock_final_layout ),
124150 )
125151 monkeypatch .setattr (detectron2 , "is_detectron2_available" , lambda * args : True )
126152
@@ -139,8 +165,8 @@ def test_read_pdf(monkeypatch, mock_page_layout):
139165
140166
141167@pytest .mark .parametrize ("model_name" , [None , "checkbox" , "fake" ])
142- def test_process_data_with_model (monkeypatch , mock_page_layout , model_name ):
143- monkeypatch .setattr (layout , "get_model" , lambda x : MockLayoutModel (mock_page_layout ))
168+ def test_process_data_with_model (monkeypatch , mock_final_layout , model_name ):
169+ monkeypatch .setattr (layout , "get_model" , lambda x : MockLayoutModel (mock_final_layout ))
144170 monkeypatch .setattr (
145171 layout .DocumentLayout ,
146172 "from_file" ,
@@ -158,11 +184,10 @@ def test_process_data_with_model_raises_on_invalid_model_name():
158184
159185
160186@pytest .mark .parametrize ("model_name" , [None , "checkbox" ])
161- def test_process_file_with_model (monkeypatch , mock_page_layout , model_name ):
187+ def test_process_file_with_model (monkeypatch , mock_final_layout , model_name ):
162188 def mock_initialize (self , * args , ** kwargs ):
163- self .model = MockLayoutModel (mock_page_layout )
189+ self .model = MockLayoutModel (mock_final_layout )
164190
165- monkeypatch .setattr (models , "get_model" , lambda x : MockLayoutModel (mock_page_layout ))
166191 monkeypatch .setattr (
167192 layout .DocumentLayout ,
168193 "from_file" ,
@@ -276,17 +301,17 @@ def test_get_elements_from_block_raises():
276301
277302
@pytest.mark.parametrize("filetype", ["png", "jpg"])
def test_from_image_file(monkeypatch, mock_final_layout, filetype):
    """``DocumentLayout.from_image_file`` exposes the elements set by the page model step.

    ``PageLayout.get_elements_with_model`` is replaced with a stub so no real
    model runs; the stub stores the fixture layout (wrapped in a list) on the
    page, and the test checks that it is what the first page reports.
    """

    def mock_get_elements(self, *args, **kwargs):
        # Wrap the fixture in a list so that elements[0] is the whole fixture layout.
        self.elements = [mock_final_layout]

    monkeypatch.setattr(layout.PageLayout, "get_elements_with_model", mock_get_elements)

    # The path must not contain stray whitespace, otherwise the sample file is not found.
    doc = layout.DocumentLayout.from_image_file(f"sample-docs/loremipsum.{filetype}")
    # Named page_elements (not elements) to avoid shadowing the imported ``elements`` module.
    page_elements = doc.pages[0].elements
    assert page_elements[0] == mock_final_layout
290315
291316
292317def test_from_image_file_raises_with_empty_fn ():
@@ -307,9 +332,9 @@ def test_from_file_raises_on_length_mismatch(monkeypatch):
307332
308333
309334@pytest .mark .parametrize ("idx" , range (2 ))
310- def test_get_elements_from_layout (mock_page_layout , idx ):
311- page = MockPageLayout (layout = mock_page_layout )
312- block = mock_page_layout [idx ].pad (3 )
335+ def test_get_elements_from_layout (mock_initial_layout , idx ):
336+ page = MockPageLayout (layout = mock_initial_layout )
337+ block = mock_initial_layout [idx ].pad (3 )
313338 fixed_layout = [block ]
314339 elements = page .get_elements_from_layout (fixed_layout )
315340 assert elements [0 ].text == block .text
@@ -483,6 +508,7 @@ def test_load_pdf_image_placement():
483508 assert image_region .y2 < images [5 ].height / 2
484509
485510
511+ @pytest .mark .skip ("Temporarily removed multicolumn to fix ordering" )
486512def test_load_pdf_with_multicolumn_layout_and_ocr (filename = "sample-docs/design-thinking.pdf" ):
487513 layouts , images = layout .load_pdf (filename )
488514 doc = layout .process_file_with_model (filename = filename , model_name = None )
@@ -522,6 +548,34 @@ def test_annotate():
522548 assert ((annotated_array [:, :, 2 ] == 1 ).mean ()) > 0.992
523549
524550
def test_textregion_returns_empty_ocr_never(mock_image):
    """A ``TextRegion`` extracts an empty string when ``ocr_strategy`` is "never"."""
    region = elements.TextRegion(0, 0, 24, 24)
    extracted = region.extract_text(objects=None, image=mock_image, ocr_strategy="never")
    assert extracted == ""
554+
555+
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        ("asdf", "asdf"),
        (None, ""),
    ],
)
def test_embedded_text_region(text, expected):
    """An ``EmbeddedTextRegion`` returns its own text, or "" when it has none."""
    region = elements.EmbeddedTextRegion(0, 0, 24, 24, text=text)
    extracted = region.extract_text(objects=None)
    assert extracted == expected
560+
561+
@pytest.mark.parametrize(
    ("text", "ocr_strategy", "expected"),
    [
        (None, "never", ""),
        (None, "always", "asdf"),
        ("i have text", "never", "i have text"),
        ("i have text", "always", "i have text"),
    ],
)
def test_image_text_region(text, ocr_strategy, expected, mock_image):
    """Check ``ImageTextRegion.extract_text`` across text/strategy combinations.

    ``elements.ocr`` is patched to return "asdf". The cases expect existing
    text to be returned under either strategy, the patched OCR output when
    there is no text and the strategy is "always", and "" when there is no
    text and the strategy is "never".
    """
    region = elements.ImageTextRegion(0, 0, 24, 24, text=text)
    with patch.object(elements, "ocr", return_value="asdf"):
        extracted = region.extract_text(
            objects=None,
            image=mock_image,
            ocr_strategy=ocr_strategy,
        )
    assert extracted == expected
577+
578+
525579@pytest .fixture ()
526580def ordering_layout ():
527581 elements = [
@@ -537,7 +591,11 @@ def ordering_layout():
537591
538592
539593def test_layout_order (ordering_layout ):
540- with patch .object (layout , "get_model" , lambda : lambda x : ordering_layout ):
594+ with patch .object (layout , "get_model" , lambda : lambda x : ordering_layout ), patch .object (
595+ layout ,
596+ "load_pdf" ,
597+ lambda * args , ** kwargs : ([[]], [mock_image ]),
598+ ):
541599 doc = layout .DocumentLayout .from_file ("sample-docs/layout-parser-paper.pdf" )
542600 page = doc .pages [0 ]
543601 for n , element in enumerate (page .elements ):
0 commit comments