1818
1919DIRECTORY = pathlib .Path (__file__ ).parent .resolve ()
2020
21+ # NOTE(crag): point to freemium API for now
22+ API_URL = "https://api.unstructured.io/general/v0/general"
23+
2124is_in_ci = os .getenv ("CI" , "" ).lower () not in {"" , "false" , "f" , "0" }
2225skip_not_on_main = os .getenv ("GITHUB_REF_NAME" , "" ).lower () != "main"
2326
@@ -105,70 +108,68 @@ def test_partition_via_api_raises_with_bad_response(request: FixtureRequest):
105108@pytest .mark .skipif (not is_in_ci , reason = "Skipping test run outside of CI" )
106109@pytest .mark .skipif (skip_not_on_main , reason = "Skipping test run outside of main branch" )
107110def test_partition_via_api_with_no_strategy ():
111+ test_file = example_doc_path ("pdf/loremipsum-flat.pdf" )
108112 elements_no_strategy = partition_via_api (
109- filename = example_doc_path ( "layout-parser-paper-fast.pdf" ) ,
113+ filename = test_file ,
110114 strategy = "auto" ,
111115 api_key = get_api_key (),
112116 # The url has changed since the 06/24 API release while the sdk defaults to the old url
113- api_url = "https://api.unstructuredapp.io/general/v0/general" ,
117+ api_url = API_URL ,
114118 skip_infer_table_types = ["pdf" ],
115119 )
116120 elements_hi_res = partition_via_api (
117- filename = example_doc_path ( "layout-parser-paper-fast.pdf" ) ,
121+ filename = test_file ,
118122 strategy = "hi_res" ,
119123 api_key = get_api_key (),
120124 # The url has changed since the 06/24 API release while the sdk defaults to the old url
121- api_url = "https://api.unstructuredapp.io/general/v0/general" ,
125+ api_url = API_URL ,
126+ skip_infer_table_types = ["pdf" ],
127+ )
128+ elements_fast_res = partition_via_api (
129+ filename = test_file ,
130+ strategy = "fast" ,
131+ api_key = get_api_key (),
132+ # The url has changed since the 06/24 API release while the sdk defaults to the old url
133+ api_url = API_URL ,
122134 skip_infer_table_types = ["pdf" ],
123135 )
124136
125137 # confirm that hi_res strategy was not passed as default to partition by comparing outputs
126138 # elements_hi_res[3].text =
127139 # 'LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis'
128140 # while elements_no_strategy[3].text = ']' (as of this writing)
129- assert elements_no_strategy [3 ].text != elements_hi_res [3 ].text
141+ assert len (elements_no_strategy ) == len (elements_hi_res )
142+ assert len (elements_hi_res ) != len (elements_fast_res )
143+
144+ # NOTE(crag): slightly out-of-scope assertion, but it avoids an extra API call
145+ assert elements_hi_res [0 ].metadata .coordinates is None
130146
131147
132148@pytest .mark .skipif (not is_in_ci , reason = "Skipping test run outside of CI" )
133149@pytest .mark .skipif (skip_not_on_main , reason = "Skipping test run outside of main branch" )
134150def test_partition_via_api_with_image_hi_res_strategy_includes_coordinates ():
135151 # coordinates not included by default to limit payload size
136152 elements = partition_via_api (
137- filename = example_doc_path ("layout-parser-paper-fast .pdf" ),
153+ filename = example_doc_path ("pdf/fake-memo .pdf" ),
138154 strategy = "hi_res" ,
139155 coordinates = "true" ,
140156 api_key = get_api_key (),
141- # The url has changed since the 06/24 API release while the sdk defaults to the old url
142- api_url = "https://api.unstructuredapp.io/general/v0/general" ,
157+ api_url = API_URL ,
143158 )
144159
145160 assert elements [0 ].metadata .coordinates is not None
146161
147162
148- @pytest .mark .skipif (not is_in_ci , reason = "Skipping test run outside of CI" )
149- @pytest .mark .skipif (skip_not_on_main , reason = "Skipping test run outside of main branch" )
150- def test_partition_via_api_valid_request_data_kwargs ():
151- elements = partition_via_api (
152- filename = example_doc_path ("layout-parser-paper-fast.pdf" ),
153- strategy = "fast" ,
154- api_key = get_api_key (),
155- # The url has changed since the 06/24 API release while the sdk defaults to the old url
156- api_url = "https://api.unstructuredapp.io/general/v0/general" ,
157- )
158-
159- assert isinstance (elements , list )
160-
161-
162163@pytest .mark .skipif (not is_in_ci , reason = "Skipping test run outside of CI" )
163164@pytest .mark .skipif (skip_not_on_main , reason = "Skipping test run outside of main branch" )
164165def test_partition_via_api_image_block_extraction ():
165166 elements = partition_via_api (
166- filename = example_doc_path ("embedded-images-tables.pdf" ),
167+ filename = example_doc_path ("pdf/ embedded-images-tables.pdf" ),
167168 strategy = "hi_res" ,
168169 extract_image_block_types = ["image" , "table" ],
169170 api_key = get_api_key (),
170171 # The url has changed since the 06/24 API release while the sdk defaults to the old url
171- api_url = "https://api.unstructuredapp.io/general/v0/general" ,
172+ api_url = API_URL ,
172173 )
173174 image_elements = [el for el in elements if el .category == ElementType .IMAGE ]
174175 for el in image_elements :
@@ -357,18 +358,20 @@ def get_api_key():
357358@pytest .mark .skipif (skip_not_on_main , reason = "Skipping test run outside of main branch" )
358359def test_partition_multiple_via_api_valid_request_data_kwargs ():
359360 filenames = [
360- example_doc_path ("layout-parser-paper-fast.pdf " ),
361- example_doc_path ("layout-parser-paper-fast.jpg " ),
361+ example_doc_path ("fake-text.txt " ),
362+ example_doc_path ("fake-email.txt " ),
362363 ]
363364
364- elements = partition_multiple_via_api (
365+ list_of_lists_of_elements = partition_multiple_via_api (
365366 filenames = filenames ,
366- strategy = "auto " ,
367+ strategy = "fast " ,
367368 api_key = get_api_key (),
368- # The url has changed since the 06/24 API release while the sdk defaults to the old url
369- api_url = "https://api.unstructuredapp.io/general/v0/general" ,
369+ api_url = API_URL ,
370370 )
371- assert isinstance (elements , list )
371+ # assert there is a list of elements for each file
372+ assert len (list_of_lists_of_elements ) == 2
373+ assert isinstance (list_of_lists_of_elements [0 ], list )
374+ assert isinstance (list_of_lists_of_elements [1 ], list )
372375
373376
374377@pytest .mark .skipif (not is_in_ci , reason = "Skipping test run outside of CI" )
@@ -383,7 +386,7 @@ def test_partition_multiple_via_api_invalid_request_data_kwargs():
383386 strategy = "not_a_strategy" ,
384387 api_key = get_api_key (),
385388 # The url has changed since the 06/24 API release while the sdk defaults to the old url
386- api_url = "https://api.unstructuredapp.io/general/v0/general" ,
389+ api_url = API_URL ,
387390 )
388391
389392
0 commit comments