Skip to content

Commit b75cb2c

Browse files
test: add contract tests for partition via api (#195)
Adding "contract" tests that were previously removed from `unstructured` [here](https://github.com/Unstructured-IO/unstructured/pull/3298/files#r1655111293) and [here](https://github.com/Unstructured-IO/unstructured/pull/3298/files#r1657546163). These tests are supposed to fail if the changes in the client code break `partition_via_api` functionality in `unstructured`.
1 parent 529fe87 commit b75cb2c

File tree

3 files changed

+165
-0
lines changed

3 files changed

+165
-0
lines changed

.github/workflows/ci.yaml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,24 @@ jobs:
6363
env:
6464
UNSTRUCTURED_API_KEY: ${{ secrets.UNSTRUCTURED_API_KEY }}
6565

66+
# Contract tests: run against every supported Python version to check that the
# client still works with `partition_via_api` from the `unstructured` package.
test_contract:
  strategy:
    matrix:
      python-version: ["3.9", "3.10", "3.11", "3.12"]
  runs-on: ubuntu-latest
  env:
    # Keep the virtualenv at ./.venv so the install step below can activate it.
    POETRY_VIRTUALENVS_IN_PROJECT: "true"
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        make install
        source .venv/bin/activate && make install-test-contract
    - name: Run contract tests
      run: |
        poetry run make test-contract
86+

Makefile

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@ install:
1919
install-speakeasy-cli:
2020
curl -fsSL https://raw.githubusercontent.com/speakeasy-api/speakeasy/main/install.sh | sh
2121

22+
## install-test-contract: install contract-test requirements; they cannot be put into pyproject.toml due to a Python version requirements mismatch
23+
.PHONY: install-test-contract
24+
install-test-contract:
25+
pip install unstructured pytest-httpx
26+
2227
#################
2328
# Test and Lint #
2429
#################
@@ -30,6 +35,10 @@ test: test-unit test-integration-docker
3035
test-unit:
3136
PYTHONPATH=. pytest _test_unstructured_client -v -k "unit"
3237

38+
## test-contract: run the contract tests located in _test_contract
.PHONY: test-contract
test-contract:
	PYTHONPATH=. pytest _test_contract -v
41+
3342
# Assumes you have unstructured-api running on localhost:8000
3443
.PHONY: test-integration
3544
test-integration:
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
import os
2+
from pathlib import Path
3+
4+
import httpx
5+
import pytest
6+
from unstructured.partition.api import partition_via_api
7+
8+
from unstructured_client import UnstructuredClient
9+
10+
11+
@pytest.fixture(scope="module")
def client() -> UnstructuredClient:
    """Provide a module-scoped client configured for the "free-api" server.

    The API key is read from the ``UNSTRUCTURED_API_KEY`` environment variable
    (``None`` when the variable is unset).
    """
    # There is no teardown, so the client is returned directly. A `yield` here would
    # turn the fixture into a generator function and contradict the declared return type.
    return UnstructuredClient(api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"), server="free-api")
15+
16+
17+
@pytest.fixture(scope="module")
def doc_path() -> Path:
    """Return the ``_sample_docs`` directory that sits beside this test file's folder.

    The fixture asserts that the directory exists, so a missing sample folder is
    reported at fixture setup instead of inside an individual test.
    """
    base_dir = Path(__file__).resolve().parents[1]
    sample_dir = base_dir / "_sample_docs"
    assert sample_dir.exists()
    return sample_dir
22+
23+
24+
# Canned JSON body served by the mocked API: a list holding one NarrativeText element.
# NOTE(review): both addresses read "[email protected]", which looks like an
# e-mail obfuscation placeholder rather than the real fixture value - confirm against
# the repository before relying on it.
MOCK_TEXT = """[
    {
        "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b",
        "text": "This is a test email to use for unit tests.",
        "type": "NarrativeText",
        "metadata": {
            "sent_from": [
                "Matthew Robinson <[email protected]>"
            ],
            "sent_to": [
                "Matthew Robinson <[email protected]>"
            ],
            "subject": "Test Email",
            "filename": "fake-email.eml",
            "filetype": "message/rfc822"
        }
    }
]"""
42+
43+
44+
@pytest.mark.parametrize(
    ("url", "full_url"),
    [
        # Bare server address: the request is expected at the full endpoint path.
        ("http://localhost:8000", "http://localhost:8000/general/v0/general"),
        # Address that already names the endpoint: expected to be used unchanged.
        ("http://localhost:8000/general/v0/general", "http://localhost:8000/general/v0/general"),
    ],
)
def test_partition_via_api_custom_url(httpx_mock, doc_path: Path, url: str, full_url: str):
    """Check that a caller-supplied ``api_url`` sends the request to ``full_url``."""
    sample_name = "layout-parser-paper-fast.pdf"
    sample_file = doc_path / sample_name

    # Registering the response is the actual check: the mock automatically verifies
    # that a request to the given URL was found and answered.
    mocked_reply = {
        "method": "POST",
        "url": full_url,
        "headers": {"Content-Type": "application/json"},
        "content": MOCK_TEXT.encode(),
    }
    httpx_mock.add_response(**mocked_reply)

    partition_via_api(filename=str(sample_file), api_url=url, metadata_filename=sample_name)
66+
67+
68+
def test_partition_via_api_pass_list_type_parameters(httpx_mock, doc_path: Path):
    """Check that scalar and list-type parameters show up in the multipart form sent."""
    endpoint = "http://localhost:8000/general/v0/general"
    sample_name = "layout-parser-paper-fast.pdf"

    httpx_mock.add_response(
        method="POST",
        url=endpoint,
        headers={"Content-Type": "application/json"},
        content=MOCK_TEXT.encode(),
    )

    params = {
        "split_pdf_page": False,
        "strategy": "hi_res",
        "extract_image_block_types": ["image", "table"],
        "skip_infer_table_types": ["pdf", "docx"],
        "languages": ["eng"],
    }

    partition_via_api(
        filename=str(doc_path / sample_name),
        api_url=endpoint,
        metadata_filename=sample_name,
        **params,
    )

    # Exactly one request must have gone out for the single partition call.
    sent_requests = httpx_mock.get_requests()
    assert len(sent_requests) == 1

    form_fields = _parse_multipart_data(sent_requests[0])
    assert "coordinates" in form_fields
    for name, expected in params.items():
        assert name in form_fields
        assert form_fields[name] == expected
103+
104+
105+
def _parse_multipart_data(request: httpx.Request) -> dict:
    """Parse the raw multipart form body of ``request`` into a dictionary.

    The "files" field is omitted. List-style entries (field names containing "[]")
    are gathered into a list stored under the name with "[]" removed. The literal
    values "true" and "false" of other fields become booleans, and every remaining
    value is kept as a string.

    Args:
        request: Captured request. Its body and its "Content-Type" header, which
            must carry the multipart ``boundary=`` parameter, are read.

    Returns:
        Mapping of form field name to its parsed value.

    Raises:
        KeyError: If the request has no "Content-Type" header.
        IndexError: If that header carries no ``boundary=`` parameter.
    """
    boundary = request.headers["Content-Type"].split("boundary=")[1]
    delimiter = f"--{boundary}".encode()

    parsed_data = {}
    for raw_part in request.content.split(delimiter):
        part = raw_part.strip()
        if b"Content-Disposition: form-data" not in part:
            # Also skips the empty preamble and the closing "--" marker.
            continue

        # Drop everything up to and including the "; " that follows "form-data".
        contents = part[part.find(b";") + 2:]
        if b'name="files"' in contents:
            continue

        try:
            text = contents.decode()
        except UnicodeDecodeError:
            # A binary part other than "files" cannot be represented as a text field.
            continue

        # `partition` instead of a strict two-way split: a field whose value is empty
        # (its trailing blank line is stripped above) or contains a blank line is kept
        # rather than being lost to an unpacking error.
        header, _, value = text.partition("\r\n\r\n")
        key = header.replace("name=", "").strip('"')
        if "[]" in key:
            parsed_data.setdefault(key.replace("[]", ""), []).append(value)
        elif value in ("true", "false"):
            parsed_data[key] = value == "true"
        else:
            parsed_data[key] = value
    return parsed_data

0 commit comments

Comments
 (0)