Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,24 @@ jobs:
env:
UNSTRUCTURED_API_KEY: ${{ secrets.UNSTRUCTURED_API_KEY }}

# Contract tests: run the _test_contract suite (via `make test-contract`)
# on each Python version listed in the matrix.
test_contract:
  strategy:
    matrix:
      python-version: [ "3.9","3.10","3.11", "3.12" ]
  runs-on: ubuntu-latest
  env:
    # The install step below activates ./.venv by path, so Poetry has to
    # create its virtualenv inside the project directory.
    POETRY_VIRTUALENVS_IN_PROJECT: "true"
  steps:
  - uses: actions/checkout@v4
  - name: Set up Python ${{ matrix.python-version }}
    uses: actions/setup-python@v5
    with:
      python-version: ${{ matrix.python-version }}
  - name: Install dependencies
    run: |
      make install
      source .venv/bin/activate && make install-test-contract
  # Step name matches what is actually executed (it was labelled "Run unit tests").
  - name: Run contract tests
    run: |
      poetry run make test-contract

9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ install:
install-speakeasy-cli:
curl -fsSL https://raw.githubusercontent.com/speakeasy-api/speakeasy/main/install.sh | sh

## install-test-contract: install the contract test requirements with pip, as they cannot be put into pyproject.toml due to python version requirements mismatch
.PHONY: install-test-contract
install-test-contract:
pip install unstructured pytest-httpx

#################
# Test and Lint #
#################
Expand All @@ -30,6 +35,10 @@ test: test-unit test-integration-docker
test-unit:
PYTHONPATH=. pytest _test_unstructured_client -v -k "unit"

## test-contract: run the contract tests in _test_contract (they import the packages installed by install-test-contract)
.PHONY: test-contract
test-contract:
PYTHONPATH=. pytest _test_contract -v

# Assumes you have unstructured-api running on localhost:8000
.PHONY: test-integration
test-integration:
Expand Down
135 changes: 135 additions & 0 deletions _test_contract/test_partition_via_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
import os
from pathlib import Path

import httpx
import pytest
from unstructured.partition.api import partition_via_api

from unstructured_client import UnstructuredClient


@pytest.fixture(scope="module")
def client() -> UnstructuredClient:
    """Yield one ``UnstructuredClient`` per test module.

    The client targets the ``free-api`` server and takes its API key from the
    ``UNSTRUCTURED_API_KEY`` environment variable (``None`` when it is unset).
    """
    api_key = os.environ.get("UNSTRUCTURED_API_KEY")
    yield UnstructuredClient(api_key_auth=api_key, server="free-api")


@pytest.fixture(scope="module")
def doc_path() -> Path:
    """Return the ``_sample_docs`` directory located one level above this file's directory.

    Fails with an ``AssertionError`` when that path does not exist, so a
    missing sample set is reported before any test tries to open a document.
    """
    parent_of_test_dir = Path(__file__).resolve().parent.parent
    sample_docs_dir = parent_of_test_dir / "_sample_docs"
    assert sample_docs_dir.exists()
    return sample_docs_dir


# Canned JSON response body (a list holding a single "NarrativeText" element).
# The tests below register it, UTF-8 encoded, as the content of the mocked API response.
MOCK_TEXT = """[
{
"element_id": "f49fbd614ddf5b72e06f59e554e6ae2b",
"text": "This is a test email to use for unit tests.",
"type": "NarrativeText",
"metadata": {
"sent_from": [
"Matthew Robinson <[email protected]>"
],
"sent_to": [
"Matthew Robinson <[email protected]>"
],
"subject": "Test Email",
"filename": "fake-email.eml",
"filetype": "message/rfc822"
}
}
]"""


@pytest.mark.parametrize(
    ("url", "full_url"),
    [
        ("http://localhost:8000", "http://localhost:8000/general/v0/general"),
        ("http://localhost:8000/general/v0/general", "http://localhost:8000/general/v0/general"),
    ],
)
def test_partition_via_api_custom_url(httpx_mock, doc_path: Path, url: str, full_url: str):
    """Check that a custom ``api_url`` sends the request to the expected endpoint.

    ``url`` is what the caller passes (base URL or full endpoint URL) and
    ``full_url`` is the address the POST request is expected to hit.
    """
    pdf_name = "layout-parser-paper-fast.pdf"
    pdf_path = doc_path / pdf_name

    # The mocked response is registered for ``full_url`` only; registering it
    # is also what verifies that a request to that URL was actually made.
    mocked_response = {
        "method": "POST",
        "url": full_url,
        "headers": {"Content-Type": "application/json"},
        "content": MOCK_TEXT.encode(),
    }
    httpx_mock.add_response(**mocked_response)

    partition_via_api(filename=str(pdf_path), api_url=url, metadata_filename=pdf_name)


def test_partition_via_api_pass_list_type_parameters(httpx_mock, doc_path: Path):
    """Check that scalar and list-valued parameters arrive as multipart form fields.

    A single POST to the mocked endpoint is expected; its form body must hold
    a ``coordinates`` field and every parameter passed in, with equal values.
    """
    endpoint = "http://localhost:8000/general/v0/general"
    pdf_name = "layout-parser-paper-fast.pdf"

    httpx_mock.add_response(
        method="POST",
        url=endpoint,
        headers={"Content-Type": "application/json"},
        content=MOCK_TEXT.encode(),
    )

    params = {
        "split_pdf_page": False,
        "strategy": "hi_res",
        "extract_image_block_types": ["image", "table"],
        "skip_infer_table_types": ["pdf", "docx"],
        "languages": ["eng"],
    }

    partition_via_api(
        filename=str(doc_path / pdf_name),
        api_url=endpoint,
        metadata_filename=pdf_name,
        **params,
    )

    sent_requests = httpx_mock.get_requests()
    assert len(sent_requests) == 1

    form_fields = _parse_multipart_data(sent_requests[0])
    assert "coordinates" in form_fields
    for field_name, expected_value in params.items():
        assert field_name in form_fields
        assert form_fields[field_name] == expected_value


def _parse_multipart_data(request: "httpx.Request") -> dict:
    """Parse a raw multipart/form-data request body into a dictionary.

    The ``files`` field is omitted. A field whose name contains ``[]`` is a
    list entry: the ``[]`` is removed from the name and the values are
    collected, in order of appearance, into a list. For every other field the
    values ``"true"`` and ``"false"`` become booleans and anything else is
    kept as a string.

    Args:
        request: The captured request. Only ``request.content`` (the raw body
            bytes) and ``request.headers["Content-Type"]`` are read.

    Returns:
        A mapping of field name to ``str``, ``bool`` or ``list`` of ``str``.

    Raises:
        IndexError: If the Content-Type header has no ``boundary=`` parameter.
    """
    content_type = request.headers["Content-Type"]
    # The boundary may be quoted and may be followed by further parameters.
    boundary = content_type.split("boundary=", 1)[1].split(";", 1)[0].strip().strip('"')
    delimiter = f"--{boundary}".encode()

    parsed_data = {}
    for raw_part in request.content.split(delimiter):
        part = raw_part.strip()
        if b"Content-Disposition: form-data" not in part:
            # Empty preamble, or the "--" left over from the closing delimiter.
            continue

        # Split at the FIRST blank line only, so that a value which itself
        # contains a blank line is kept whole instead of being dropped.
        header, _, body = part.partition(b"\r\n\r\n")

        # Inspect the header only: a value that merely mentions name="files"
        # must not cause its field to be skipped.
        if b'name="files"' in header or b"name=" not in header:
            continue

        try:
            # Skip past "Content-Disposition: form-data; " to reach name="...".
            key = header[header.find(b";") + 2:].decode()
            value = body.decode()
        except UnicodeDecodeError as ex:
            # Best effort: a non-text part cannot be compared as a form value.
            print(f"Skipping multipart part that is not valid UTF-8: {ex}")
            continue

        key = key.replace("name=", "").strip('"')
        if "[]" in key:
            parsed_data.setdefault(key.replace("[]", ""), []).append(value)
        elif value in ("true", "false"):
            parsed_data[key] = value == "true"
        else:
            parsed_data[key] = value
    return parsed_data