Skip to content

Commit 3bb0ee1

Browse files
authored
chore: fix tests breaking on main (#3603)
Fix API tests (really more like integration tests) that run only on main. Also use less compute-intensive files to speed up test time, and remove a useless test. Tests in `test_unstructured/partition/test_api.py` pass when temporarily run outside of main, per the screenshot: ![image](https://github.com/user-attachments/assets/f15d440a-2574-40f2-98b4-adf57fbae704) https://github.com/Unstructured-IO/unstructured/actions/runs/10754098974/job/29824415513
1 parent c060467 commit 3bb0ee1

File tree

1 file changed

+35
-32
lines changed

1 file changed

+35
-32
lines changed

test_unstructured/partition/test_api.py

Lines changed: 35 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@
1818

1919
DIRECTORY = pathlib.Path(__file__).parent.resolve()
2020

21+
# NOTE(crag): point to freemium API for now
22+
API_URL = "https://api.unstructured.io/general/v0/general"
23+
2124
is_in_ci = os.getenv("CI", "").lower() not in {"", "false", "f", "0"}
2225
skip_not_on_main = os.getenv("GITHUB_REF_NAME", "").lower() != "main"
2326

@@ -105,70 +108,68 @@ def test_partition_via_api_raises_with_bad_response(request: FixtureRequest):
105108
@pytest.mark.skipif(not is_in_ci, reason="Skipping test run outside of CI")
106109
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
107110
def test_partition_via_api_with_no_strategy():
111+
test_file = example_doc_path("pdf/loremipsum-flat.pdf")
108112
elements_no_strategy = partition_via_api(
109-
filename=example_doc_path("layout-parser-paper-fast.pdf"),
113+
filename=test_file,
110114
strategy="auto",
111115
api_key=get_api_key(),
112116
# The url has changed since the 06/24 API release while the sdk defaults to the old url
113-
api_url="https://api.unstructuredapp.io/general/v0/general",
117+
api_url=API_URL,
114118
skip_infer_table_types=["pdf"],
115119
)
116120
elements_hi_res = partition_via_api(
117-
filename=example_doc_path("layout-parser-paper-fast.pdf"),
121+
filename=test_file,
118122
strategy="hi_res",
119123
api_key=get_api_key(),
120124
# The url has changed since the 06/24 API release while the sdk defaults to the old url
121-
api_url="https://api.unstructuredapp.io/general/v0/general",
125+
api_url=API_URL,
126+
skip_infer_table_types=["pdf"],
127+
)
128+
elements_fast_res = partition_via_api(
129+
filename=test_file,
130+
strategy="fast",
131+
api_key=get_api_key(),
132+
# The url has changed since the 06/24 API release while the sdk defaults to the old url
133+
api_url=API_URL,
122134
skip_infer_table_types=["pdf"],
123135
)
124136

125137
# confirm that hi_res strategy was not passed as default to partition by comparing outputs
126138
# elements_hi_res[3].text =
127139
# 'LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis'
128140
# while elements_no_strategy[3].text = ']' (as of this writing)
129-
assert elements_no_strategy[3].text != elements_hi_res[3].text
141+
assert len(elements_no_strategy) == len(elements_hi_res)
142+
assert len(elements_hi_res) != len(elements_fast_res)
143+
144+
# NOTE(crag): slightly out-of-scope assertion, but it avoids an extra API call
145+
assert elements_hi_res[0].metadata.coordinates is None
130146

131147

132148
@pytest.mark.skipif(not is_in_ci, reason="Skipping test run outside of CI")
133149
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
134150
def test_partition_via_api_with_image_hi_res_strategy_includes_coordinates():
135151
# coordinates not included by default to limit payload size
136152
elements = partition_via_api(
137-
filename=example_doc_path("layout-parser-paper-fast.pdf"),
153+
filename=example_doc_path("pdf/fake-memo.pdf"),
138154
strategy="hi_res",
139155
coordinates="true",
140156
api_key=get_api_key(),
141-
# The url has changed since the 06/24 API release while the sdk defaults to the old url
142-
api_url="https://api.unstructuredapp.io/general/v0/general",
157+
api_url=API_URL,
143158
)
144159

145160
assert elements[0].metadata.coordinates is not None
146161

147162

148-
@pytest.mark.skipif(not is_in_ci, reason="Skipping test run outside of CI")
149-
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
150-
def test_partition_via_api_valid_request_data_kwargs():
151-
elements = partition_via_api(
152-
filename=example_doc_path("layout-parser-paper-fast.pdf"),
153-
strategy="fast",
154-
api_key=get_api_key(),
155-
# The url has changed since the 06/24 API release while the sdk defaults to the old url
156-
api_url="https://api.unstructuredapp.io/general/v0/general",
157-
)
158-
159-
assert isinstance(elements, list)
160-
161-
162163
@pytest.mark.skipif(not is_in_ci, reason="Skipping test run outside of CI")
163164
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
164165
def test_partition_via_api_image_block_extraction():
165166
elements = partition_via_api(
166-
filename=example_doc_path("embedded-images-tables.pdf"),
167+
filename=example_doc_path("pdf/embedded-images-tables.pdf"),
167168
strategy="hi_res",
168169
extract_image_block_types=["image", "table"],
169170
api_key=get_api_key(),
170171
# The url has changed since the 06/24 API release while the sdk defaults to the old url
171-
api_url="https://api.unstructuredapp.io/general/v0/general",
172+
api_url=API_URL,
172173
)
173174
image_elements = [el for el in elements if el.category == ElementType.IMAGE]
174175
for el in image_elements:
@@ -357,18 +358,20 @@ def get_api_key():
357358
@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch")
358359
def test_partition_multiple_via_api_valid_request_data_kwargs():
359360
filenames = [
360-
example_doc_path("layout-parser-paper-fast.pdf"),
361-
example_doc_path("layout-parser-paper-fast.jpg"),
361+
example_doc_path("fake-text.txt"),
362+
example_doc_path("fake-email.txt"),
362363
]
363364

364-
elements = partition_multiple_via_api(
365+
list_of_lists_of_elements = partition_multiple_via_api(
365366
filenames=filenames,
366-
strategy="auto",
367+
strategy="fast",
367368
api_key=get_api_key(),
368-
# The url has changed since the 06/24 API release while the sdk defaults to the old url
369-
api_url="https://api.unstructuredapp.io/general/v0/general",
369+
api_url=API_URL,
370370
)
371-
assert isinstance(elements, list)
371+
# assert there is a list of elements for each file
372+
assert len(list_of_lists_of_elements) == 2
373+
assert isinstance(list_of_lists_of_elements[0], list)
374+
assert isinstance(list_of_lists_of_elements[1], list)
372375

373376

374377
@pytest.mark.skipif(not is_in_ci, reason="Skipping test run outside of CI")
@@ -383,7 +386,7 @@ def test_partition_multiple_via_api_invalid_request_data_kwargs():
383386
strategy="not_a_strategy",
384387
api_key=get_api_key(),
385388
# The url has changed since the 06/24 API release while the sdk defaults to the old url
386-
api_url="https://api.unstructuredapp.io/general/v0/general",
389+
api_url=API_URL,
387390
)
388391

389392

0 commit comments

Comments
 (0)