Skip to content

Commit d9332ac

Browse files
committed
test: added partition_via_api tests removed from unstructured recently
1 parent 529fe87 commit d9332ac

File tree

1 file changed

+133
-0
lines changed

1 file changed

+133
-0
lines changed
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import os
2+
from pathlib import Path
3+
4+
import httpx
5+
import pytest
6+
from unstructured.partition.api import partition_via_api
7+
8+
from unstructured_client import UnstructuredClient
9+
10+
11+
@pytest.fixture(scope="module")
def client() -> UnstructuredClient:
    """Yield a single ``UnstructuredClient`` shared by the tests in this module.

    The API key is read from the ``UNSTRUCTURED_API_KEY`` environment variable
    and the client is created with ``server="free-api"``.
    """
    api_key = os.getenv("UNSTRUCTURED_API_KEY")
    yield UnstructuredClient(api_key_auth=api_key, server="free-api")
15+
16+
17+
@pytest.fixture(scope="module")
def doc_path() -> Path:
    """Return the ``_sample_docs`` directory two levels above this file's folder."""
    this_file = Path(__file__).resolve()
    # parents[0] is the containing folder, so parents[2] is two levels above it.
    return this_file.parents[2].joinpath("_sample_docs")
20+
21+
22+
# Canned JSON body served by the mocked POST endpoint in the tests below
# (passed to httpx_mock.add_response as MOCK_TEXT.encode()): a one-element list
# describing a NarrativeText element with e-mail metadata.
# NOTE(review): the `NN+` lines and the "[email protected]" placeholders inside
# this literal look like artifacts of how the page was captured rather than part
# of the original constant -- confirm against the commit before relying on the
# exact text.
MOCK_TEXT = """[
23+
{
24+
"element_id": "f49fbd614ddf5b72e06f59e554e6ae2b",
25+
"text": "This is a test email to use for unit tests.",
26+
"type": "NarrativeText",
27+
"metadata": {
28+
"sent_from": [
29+
"Matthew Robinson <[email protected]>"
30+
],
31+
"sent_to": [
32+
"Matthew Robinson <[email protected]>"
33+
],
34+
"subject": "Test Email",
35+
"filename": "fake-email.eml",
36+
"filetype": "message/rfc822"
37+
}
38+
}
39+
]"""
40+
41+
42+
@pytest.mark.parametrize(
    ("url", "full_url"),
    [
        ("http://localhost:8000", "http://localhost:8000/general/v0/general"),
        ("http://localhost:8000/general/v0/general", "http://localhost:8000/general/v0/general"),
    ],
)
def test_partition_via_api_custom_url(httpx_mock, doc_path: Path, url: str, full_url: str):
    """Check that a caller-supplied ``api_url`` sends the request to ``full_url``.

    Both a bare server address and the complete endpoint address are expected
    to end up at the same ``/general/v0/general`` endpoint.
    """
    sample_name = "layout-parser-paper-fast.pdf"
    sample_file = doc_path / sample_name

    # The response is registered for ``full_url`` only; per the original note,
    # registering it also checks that a request to that URL was actually made.
    mocked_response = {
        "method": "POST",
        "url": full_url,
        "headers": {"Content-Type": "application/json"},
        "content": MOCK_TEXT.encode(),
    }
    httpx_mock.add_response(**mocked_response)

    partition_via_api(filename=str(sample_file), api_url=url, metadata_filename=sample_name)
63+
64+
65+
66+
def test_partition_via_api_pass_list_type_parameters(httpx_mock, doc_path: Path):
    """Check that scalar and list-valued options arrive as multipart form fields.

    The single request captured by ``httpx_mock`` is decoded with
    ``_parse_multipart_data`` and every option passed to ``partition_via_api``
    must be present with the same value; a ``coordinates`` field must also be
    present.
    """
    endpoint = "http://localhost:8000/general/v0/general"
    sample_name = "layout-parser-paper-fast.pdf"

    httpx_mock.add_response(
        url=endpoint,
        method="POST",
        headers={"Content-Type": "application/json"},
        content=MOCK_TEXT.encode(),
    )

    options = {
        "split_pdf_page": False,
        "strategy": "hi_res",
        "extract_image_block_types": ["image", "table"],
        "skip_infer_table_types": ["pdf", "docx"],
        "languages": ["eng"],
    }

    partition_via_api(
        filename=str(doc_path / sample_name),
        api_url=endpoint,
        metadata_filename=sample_name,
        **options,
    )

    sent_requests = httpx_mock.get_requests()
    assert len(sent_requests) == 1

    form_fields = _parse_multipart_data(sent_requests[0])
    assert "coordinates" in form_fields
    for option, expected in options.items():
        assert option in form_fields
        assert form_fields[option] == expected
101+
102+
103+
def _parse_multipart_data(request: "httpx.Request") -> dict:
    """Parse a raw multipart/form-data request body into a dictionary.

    The part named "files" is omitted. Field names containing "[]" are treated
    as list-like entries: the "[]" is removed and every value sent under that
    name is collected, in order, into a list. The literal values "true" and
    "false" become booleans; every other value is kept as a string.

    Args:
        request: Request whose ``content`` holds the encoded body and whose
            ``Content-Type`` header carries the multipart boundary.

    Returns:
        Mapping of form field name to ``str``, ``bool`` or ``list`` of ``str``.

    Raises:
        IndexError: If the ``Content-Type`` header has no ``boundary=`` parameter.
    """
    content_type = request.headers["Content-Type"]
    # The boundary may be quoted and may be followed by further header parameters.
    boundary = content_type.split("boundary=", 1)[1].split(";", 1)[0].strip().strip('"')
    delimiter = f"--{boundary}".encode()

    parsed_data = {}
    for segment in request.content.split(delimiter):
        # Split on the FIRST blank line only, so that empty values and values
        # that themselves contain blank lines are not lost.
        raw_headers, separator, payload = segment.partition(b"\r\n\r\n")
        if not separator:
            # Preamble or the closing "--" marker: not a form part.
            continue

        key = _multipart_field_name(raw_headers.decode(errors="replace"))
        if key is None or key == "files":
            continue

        # Remove only the line break that precedes the next delimiter.
        if payload.endswith(b"\r\n"):
            payload = payload[:-2]
        try:
            value = payload.decode()
        except UnicodeDecodeError:
            # Best effort, as before: a part that is not UTF-8 text is skipped.
            continue

        if "[]" in key:
            parsed_data.setdefault(key.replace("[]", ""), []).append(value)
        elif value in ("true", "false"):
            parsed_data[key] = value == "true"
        else:
            parsed_data[key] = value
    return parsed_data


def _multipart_field_name(header_text: str):
    """Return the ``name`` parameter of a form-data Content-Disposition header.

    Returns ``None`` when ``header_text`` has no such header or the header has
    no ``name`` parameter.
    """
    for line in header_text.split("\r\n"):
        disposition, _, parameters = line.partition(";")
        kind = disposition.lower()
        if not (kind.startswith("content-disposition:") and "form-data" in kind):
            continue
        for parameter in parameters.split(";"):
            attribute, _, attribute_value = parameter.strip().partition("=")
            # Exact match, so that "filename" is not mistaken for "name".
            if attribute == "name":
                return attribute_value.strip('"')
    return None

0 commit comments

Comments
 (0)