diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 9d96e9f8..2000c2bb 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -63,3 +63,24 @@ jobs:
         env:
           UNSTRUCTURED_API_KEY: ${{ secrets.UNSTRUCTURED_API_KEY }}
 
+  test_contract:
+    strategy:
+      matrix:
+        python-version: [ "3.9","3.10","3.11", "3.12" ]
+    runs-on: ubuntu-latest
+    env:
+      POETRY_VIRTUALENVS_IN_PROJECT: "true"
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          make install
+          source .venv/bin/activate && make install-test-contract
+      - name: Run contract tests
+        run: |
+          poetry run make test-contract
+
diff --git a/Makefile b/Makefile
index 8f97a22b..4233335c 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,11 @@ install:
 install-speakeasy-cli:
 	curl -fsSL https://raw.githubusercontent.com/speakeasy-api/speakeasy/main/install.sh | sh
 
+## install-test-contract: install test requirements as they cannot be put into pyproject.toml due to python version requirements mismatch
+.PHONY: install-test-contract
+install-test-contract:
+	pip install unstructured pytest-httpx
+
 #################
 # Test and Lint #
 #################
@@ -30,6 +35,10 @@ test: test-unit test-integration-docker
 test-unit:
 	PYTHONPATH=. pytest _test_unstructured_client -v -k "unit"
 
+.PHONY: test-contract
+test-contract:
+	PYTHONPATH=. pytest _test_contract -v
+
 # Assumes you have unstructured-api running on localhost:8000
 .PHONY: test-integration
 test-integration:
diff --git a/_test_contract/test_partition_via_api.py b/_test_contract/test_partition_via_api.py
new file mode 100644
index 00000000..1e367d9d
--- /dev/null
+++ b/_test_contract/test_partition_via_api.py
@@ -0,0 +1,133 @@
+import os
+from pathlib import Path
+
+import httpx
+import pytest
+from unstructured.partition.api import partition_via_api
+
+from unstructured_client import UnstructuredClient
+
+
+@pytest.fixture(scope="module")
+def client() -> UnstructuredClient:
+    _client = UnstructuredClient(api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"), server='free-api')
+    yield _client
+
+
+@pytest.fixture(scope="module")
+def doc_path() -> Path:
+    samples_path = Path(__file__).resolve().parents[1] / "_sample_docs"
+    assert samples_path.exists()
+    return samples_path
+
+
+MOCK_TEXT = """[
+  {
+    "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b",
+    "text": "This is a test email to use for unit tests.",
+    "type": "NarrativeText",
+    "metadata": {
+      "sent_from": [
+        "Matthew Robinson "
+      ],
+      "sent_to": [
+        "Matthew Robinson "
+      ],
+      "subject": "Test Email",
+      "filename": "fake-email.eml",
+      "filetype": "message/rfc822"
+    }
+  }
+]"""
+
+
+@pytest.mark.parametrize(
+    ("url", "full_url"), [
+        ("http://localhost:8000", "http://localhost:8000/general/v0/general"),
+        ("http://localhost:8000/general/v0/general", "http://localhost:8000/general/v0/general"),
+    ]
+)
+def test_partition_via_api_custom_url(httpx_mock, doc_path: Path, url: str, full_url: str):
+    """
+    Assert that we can specify api_url and requests are sent to the right place
+    """
+
+    filename = "layout-parser-paper-fast.pdf"
+
+    # adding response automatically checks whether a response to a request to given URL was found
+    httpx_mock.add_response(
+        method="POST",
+        url=full_url,
+        headers={"Content-Type": "application/json"},
+        content=MOCK_TEXT.encode(),
+    )
+
+    partition_via_api(filename=str(doc_path / filename), api_url=url, metadata_filename=filename)
+
+
+def test_partition_via_api_pass_list_type_parameters(httpx_mock, doc_path: Path):
+    url = "http://localhost:8000/general/v0/general"
+    filename = "layout-parser-paper-fast.pdf"
+
+    httpx_mock.add_response(
+        method="POST",
+        headers={"Content-Type": "application/json"},
+        content=MOCK_TEXT.encode(),
+        url=url,
+    )
+
+    params = dict(
+        split_pdf_page=False,
+        strategy="hi_res",
+        extract_image_block_types=["image", "table"],
+        skip_infer_table_types=["pdf", "docx"],
+        languages=["eng"],
+    )
+
+    partition_via_api(filename=str(doc_path / filename),
+                      api_url=url,
+                      metadata_filename=filename,
+                      **params)
+
+    requests = httpx_mock.get_requests()
+
+    assert len(requests) == 1
+
+    request = requests[0]
+
+    parsed_multipart_form = _parse_multipart_data(request)
+    assert "coordinates" in parsed_multipart_form
+    for key, value in params.items():
+        assert key in parsed_multipart_form
+        assert parsed_multipart_form[key] == value
+
+
+def _parse_multipart_data(request: httpx.Request) -> dict:
+    """Parse raw multipart form data into a dictionary, omitting the "files" field.
+
+    Fields named with a "[]" suffix are collected into lists under the bare name, and the
+    values "true"/"false" become booleans. A part without a value maps to an empty string.
+    Decoding errors propagate instead of being printed, so the calling test fails at the cause.
+    """
+    data = request.content
+    boundary = request.headers["Content-Type"].split("boundary=")[1]
+    parts = data.split(f"--{boundary}".encode())
+    parts = [part.strip() for part in parts if part.strip()]
+    parsed_data = {}
+    for part in parts:
+        if b"Content-Disposition: form-data" not in part:
+            continue
+        semicolon_pos = part.find(b";")
+        contents = part[semicolon_pos + 2:]
+        if b"name=\"files\"" in contents:
+            continue
+        # Split on the first blank line only: the value itself may contain blank lines
+        key, _, value = contents.decode().partition("\r\n\r\n")
+        key = key.replace("name=", "").strip('"')
+        if "[]" in key:
+            parsed_data.setdefault(key.replace("[]", ""), []).append(value)
+        elif value in ["true", "false"]:
+            parsed_data[key] = value == "true"
+        else:
+            parsed_data[key] = value
+    return parsed_data