1+ import os .path
12import tempfile
23from functools import partial
34from itertools import product
910
1011import unstructured_inference .models .base as models
1112from unstructured_inference .inference import elements , layout , layoutelement
13+ from unstructured_inference .inference .layout import create_image_output_dir
1214from unstructured_inference .models import detectron2 , tesseract
1315from unstructured_inference .models .unstructuredmodel import (
1416 UnstructuredElementExtractionModel ,
@@ -47,12 +49,22 @@ def mock_final_layout():
4749
4850
def test_pdf_page_converts_images_to_array(mock_image):
    """Check that ``PageLayout._get_image_array`` returns and stores a NumPy array.

    Two scenarios are exercised: the page holds an in-memory image, and the page
    only holds an ``image_path`` (with ``Image.open`` patched to return the mock).
    """

    def verify_image_array():
        # The array must not be populated before it is requested.
        assert page.image_array is None
        image_array = page._get_image_array()
        assert isinstance(image_array, np.ndarray)
        # Compare contents element-wise; comparing the two ``.all()`` results
        # would only compare a pair of booleans.
        assert np.array_equal(page.image_array, image_array)

    # Scenario 1: where self.image exists
    page = layout.PageLayout(number=0, image=mock_image, layout=[])
    verify_image_array()

    # Scenario 2: where self.image is None, but self.image_path exists
    page.image_array = None
    page.image = None
    page.image_path = "mock_path_to_image"
    with patch.object(Image, "open", return_value=mock_image):
        verify_image_array()
5668
5769
5870def test_ocr (monkeypatch ):
@@ -141,31 +153,35 @@ def test_get_page_elements_with_ocr(monkeypatch):
141153 assert str (page ) == "\n \n An Even Catchier Title"
142154
143155
def test_read_pdf(monkeypatch, mock_initial_layout, mock_final_layout, mock_image):
    """Check ``DocumentLayout.from_file`` when ``load_pdf`` yields image paths.

    ``load_pdf`` is patched to return two layouts and the paths of two saved
    copies of ``mock_image``; the detectron model class is replaced with
    ``MockLayoutModel`` bound to ``mock_final_layout``.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        # One image file on disk per page.
        image_paths = [os.path.join(tmpdir, name) for name in ("mock1.jpg", "mock2.jpg")]
        for image_path in image_paths:
            mock_image.save(image_path)

        layouts = [mock_initial_layout, mock_initial_layout]

        monkeypatch.setattr(
            models,
            "UnstructuredDetectronModel",
            partial(MockLayoutModel, layout=mock_final_layout),
        )
        monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)

        with patch.object(layout, "load_pdf", return_value=(layouts, image_paths)):
            model = layout.get_model("detectron2_lp")
            doc = layout.DocumentLayout.from_file("fake-file.pdf", detection_model=model)

        doc_text = str(doc)
        assert doc_text.startswith("A Catchy Title")
        assert doc_text.count("A Catchy Title") == 2  # Once for each page
        assert doc_text.endswith("A very repetitive narrative. ")

        assert doc.pages[0].elements[0].to_dict()["text"] == "A Catchy Title"

        # The document text is the page texts joined by a blank line.
        pages = doc.pages
        assert doc_text == "\n\n".join([str(page) for page in pages])
169185
170186
171187@pytest .mark .parametrize ("model_name" , [None , "checkbox" , "fake" ])
@@ -320,12 +336,53 @@ def mock_get_elements(self, *args, **kwargs):
320336 self .elements = [mock_final_layout ]
321337
322338 monkeypatch .setattr (layout .PageLayout , "get_elements_with_detection_model" , mock_get_elements )
323- elements = (
324- layout .DocumentLayout .from_image_file (f"sample-docs/loremipsum.{ filetype } " )
325- .pages [0 ]
326- .elements
327- )
328- assert elements [0 ] == mock_final_layout
339+ filename = f"sample-docs/loremipsum.{ filetype } "
340+ image = Image .open (filename )
341+ image_metadata = {
342+ "format" : image .format ,
343+ "width" : image .width ,
344+ "height" : image .height ,
345+ }
346+
347+ doc = layout .DocumentLayout .from_image_file (filename )
348+ page = doc .pages [0 ]
349+ assert page .elements [0 ] == mock_final_layout
350+ assert page .image is None
351+ assert page .image_path == os .path .abspath (filename )
352+ assert page .image_metadata == image_metadata
353+
354+
def test_from_file(monkeypatch, mock_final_layout):
    """Check the page attributes produced by ``DocumentLayout.from_file``.

    ``load_pdf`` is patched to return a single PPM image path and
    ``create_image_output_dir`` to return the temporary directory. The resulting
    page is expected to carry the image path and metadata but no in-memory image.
    """

    def mock_get_elements(self, *args, **kwargs):
        self.elements = [mock_final_layout]

    monkeypatch.setattr(layout.PageLayout, "get_elements_with_detection_model", mock_get_elements)

    with tempfile.TemporaryDirectory() as tmpdir:
        image_path = os.path.join(tmpdir, "loremipsum.ppm")
        # Close the source image once the PPM copy and its dimensions are captured.
        with Image.open("sample-docs/loremipsum.jpg") as image:
            image.save(image_path)
            image_metadata = {
                "format": "PPM",
                "width": image.width,
                "height": image.height,
            }

        with patch.object(
            layout,
            "create_image_output_dir",
            return_value=tmpdir,
        ), patch.object(
            layout,
            "load_pdf",
            lambda *args, **kwargs: ([[]], [image_path]),
        ):
            doc = layout.DocumentLayout.from_file("fake-file.pdf")
            page = doc.pages[0]
            assert page.elements[0] == mock_final_layout
            assert page.image_metadata == image_metadata
            assert page.image_path == image_path
            assert page.image is None
329386
330387
331388def test_from_image_file_raises_with_empty_fn ():
@@ -526,6 +583,14 @@ def test_load_pdf_image_placement():
526583 assert image_region .y2 < images [5 ].height / 2
527584
528585
def test_load_pdf_raises_with_path_only_no_output_folder():
    """Expect ``load_pdf`` to raise ``ValueError`` for ``path_only=True`` without an output folder."""
    pdf_filename = "sample-docs/loremipsum-flat.pdf"
    with pytest.raises(ValueError):
        layout.load_pdf(pdf_filename, path_only=True)
592+
593+
529594@pytest .mark .skip ("Temporarily removed multicolumn to fix ordering" )
530595def test_load_pdf_with_multicolumn_layout_and_ocr (filename = "sample-docs/design-thinking.pdf" ):
531596 layouts , images = layout .load_pdf (filename )
@@ -544,6 +609,21 @@ def test_load_pdf_with_multicolumn_layout_and_ocr(filename="sample-docs/design-t
544609
545610@pytest .mark .parametrize ("colors" , ["red" , None ])
546611def test_annotate (colors ):
612+ def check_annotated_image ():
613+ annotated_array = np .array (annotated_image )
614+ for coords in [coords1 , coords2 ]:
615+ x1 , y1 , x2 , y2 = coords
616+ # Make sure the pixels on the edge of the box are red
617+ for i , expected in zip (range (3 ), [255 , 0 , 0 ]):
618+ assert all (annotated_array [y1 , x1 :x2 , i ] == expected )
619+ assert all (annotated_array [y2 , x1 :x2 , i ] == expected )
620+ assert all (annotated_array [y1 :y2 , x1 , i ] == expected )
621+ assert all (annotated_array [y1 :y2 , x2 , i ] == expected )
622+ # Make sure almost all the pixels are not changed
623+ assert ((annotated_array [:, :, 0 ] == 1 ).mean ()) > 0.992
624+ assert ((annotated_array [:, :, 1 ] == 1 ).mean ()) > 0.992
625+ assert ((annotated_array [:, :, 2 ] == 1 ).mean ()) > 0.992
626+
547627 test_image_arr = np .ones ((100 , 100 , 3 ), dtype = "uint8" )
548628 image = Image .fromarray (test_image_arr )
549629 page = layout .PageLayout (number = 1 , image = image , layout = None )
@@ -552,19 +632,17 @@ def test_annotate(colors):
552632 coords2 = (1 , 10 , 7 , 11 )
553633 rect2 = elements .Rectangle (* coords2 )
554634 page .elements = [rect1 , rect2 ]
635+
636+ # Scenario 1: where self.image exists
555637 annotated_image = page .annotate (colors = colors )
556- annotated_array = np .array (annotated_image )
557- for x1 , y1 , x2 , y2 in [coords1 , coords2 ]:
558- # Make sure the pixels on the edge of the box are red
559- for i , expected in zip (range (3 ), [255 , 0 , 0 ]):
560- assert all (annotated_array [y1 , x1 :x2 , i ] == expected )
561- assert all (annotated_array [y2 , x1 :x2 , i ] == expected )
562- assert all (annotated_array [y1 :y2 , x1 , i ] == expected )
563- assert all (annotated_array [y1 :y2 , x2 , i ] == expected )
564- # Make sure almost all the pixels are not changed
565- assert ((annotated_array [:, :, 0 ] == 1 ).mean ()) > 0.992
566- assert ((annotated_array [:, :, 1 ] == 1 ).mean ()) > 0.992
567- assert ((annotated_array [:, :, 2 ] == 1 ).mean ()) > 0.992
638+ check_annotated_image ()
639+
640+ # Scenario 2: where self.image is None, but self.image_path exists
641+ with patch .object (Image , "open" , return_value = image ):
642+ page .image = None
643+ page .image_path = "mock_path_to_image"
644+ annotated_image = page .annotate (colors = colors )
645+ check_annotated_image ()
568646
569647
570648def test_textregion_returns_empty_ocr_never (mock_image ):
@@ -609,18 +687,21 @@ def ordering_layout():
609687 return elements
610688
611689
def test_layout_order(mock_image, ordering_layout):
    """Check the order of page elements returned by ``DocumentLayout.from_file``.

    Each element's text is expected to equal its index in ``page.elements``.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        mock_image_path = os.path.join(tmpdir, "mock.jpg")
        mock_image.save(mock_image_path)

        def fake_get_model():
            # Stand-in model factory: the "model" just returns the fixture layout.
            return lambda x: ordering_layout

        def fake_load_pdf(*args, **kwargs):
            # One page with an empty extracted layout, backed by the saved mock image.
            return [[]], [mock_image_path]

        with patch.object(layout, "get_model", fake_get_model):
            with patch.object(layout, "load_pdf", fake_load_pdf):
                with patch.object(layout, "UnstructuredObjectDetectionModel", object):
                    doc = layout.DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf")
                    page = doc.pages[0]
    for position, element in enumerate(page.elements):
        assert element.text == str(position)
626707
@@ -690,6 +771,7 @@ def test_from_image(
690771 ) as mock_detection :
691772 layout .PageLayout .from_image (
692773 mock_image ,
774+ image_path = None ,
693775 detection_model = detection_model ,
694776 element_extraction_model = element_extraction_model ,
695777 )
@@ -748,3 +830,13 @@ def test_exposed_pdf_image_dpi(pdf_image_dpi, expected, monkeypatch):
748830 with patch .object (layout .PageLayout , "from_image" ) as mock_from_image :
749831 layout .DocumentLayout .from_file ("sample-docs/loremipsum.pdf" , pdf_image_dpi = pdf_image_dpi )
750832 assert mock_from_image .call_args [0 ][0 ].height == expected
833+
834+
def test_create_image_output_dir():
    """Check that ``create_image_output_dir`` makes an absolute directory named after the file stem.

    For ``<tmpdir>/loremipsum.pdf`` the expected directory is ``<abs tmpdir>/loremipsum``.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        pdf_path = os.path.join(tmpdir, "loremipsum.pdf")
        expected_dir = os.path.join(os.path.abspath(tmpdir), "loremipsum")

        created_dir = create_image_output_dir(pdf_path)

        assert os.path.isdir(created_dir)
        assert os.path.isabs(created_dir)
        assert created_dir == expected_dir
0 commit comments