Skip to content

Commit d9c3b4a

Browse files
test: pdf check tests before request
1 parent 4de9a67 commit d9c3b4a

File tree

6 files changed

+128
-0
lines changed

6 files changed

+128
-0
lines changed

_sample_docs/failing-encrypted.pdf

936 Bytes
Binary file not shown.

_sample_docs/failing-invalid.pdf

70 Bytes
Binary file not shown.
160 Bytes
Binary file not shown.
4.74 KB
Binary file not shown.

_test_unstructured_client/integration/test_integration.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import pytest
99
from deepdiff import DeepDiff
10+
from httpx import RequestError
1011
from unstructured_client import UnstructuredClient
1112
from unstructured_client.models import shared, operations
1213
from unstructured_client.models.errors import SDKError, ServerError, HTTPValidationError
@@ -348,3 +349,77 @@ def test_partition_strategy_vlm_anthropic(split_pdf, vlm_model, vlm_model_provid
348349
assert response.status_code == 200
349350
assert len(response.elements) > 0
350351
assert response.elements[0]["metadata"]["partitioner_type"] == "vlm_partition"
352+
353+
354+
@pytest.mark.parametrize(
    ("pdf_name", "expected_error_message"),
    [
        ("failing-encrypted.pdf", "File is encrypted. Please decrypt it with password."),
        (
            "failing-missing-root.pdf",
            "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf",
        ),
        (
            "failing-missing-pages.pdf",
            "File does not appear to be a valid PDF. Error: Invalid object in /Pages",
        ),
    ],
)
def test_returns_request_error_for_invalid_pdf(
    caplog: pytest.LogCaptureFixture,
    doc_path: Path,
    client: UnstructuredClient,
    pdf_name: str,
    expected_error_message: str,
):
    """Partitioning a broken PDF with page splitting enabled raises ``RequestError``.

    Each parametrized sample document is submitted with ``split_pdf_page=True``.
    The call must raise ``RequestError`` and the expected validation message
    must appear in the captured log output.
    """
    with open(doc_path / pdf_name, "rb") as pdf_file:
        raw_pdf = pdf_file.read()

    partition_request = operations.PartitionRequest(
        partition_parameters=shared.PartitionParameters(
            files=shared.Files(content=raw_pdf, file_name=pdf_name),
            strategy="fast",
            split_pdf_page=True,
        )
    )

    with pytest.raises(RequestError) as raised:
        client.general.partition(request=partition_request)

    # The raised error must carry a request object.
    assert raised.value.request is not None
    # The human-readable reason is only checked through the logs.
    assert expected_error_message in caplog.text
398+
399+
400+
def test_returns_422_for_invalid_pdf(
    caplog: pytest.LogCaptureFixture,
    doc_path: Path,
    client: UnstructuredClient,
):
    """Partitioning ``failing-invalid.pdf`` raises ``HTTPValidationError`` (422).

    The sample file is submitted with ``split_pdf_page=True``. The call must
    raise ``HTTPValidationError``, and the captured logs must mention both the
    "not a valid PDF" message and the 422 status code.
    """
    pdf_name = "failing-invalid.pdf"
    with open(doc_path / pdf_name, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=pdf_name,
        )

    req = operations.PartitionRequest(
        partition_parameters=shared.PartitionParameters(
            files=files,
            strategy="fast",
            split_pdf_page=True,
        )
    )

    with pytest.raises(HTTPValidationError):
        client.general.partition(request=req)

    # Both the validation message and the status code are checked via the logs.
    assert "File does not appear to be a valid PDF" in caplog.text
    assert "422" in caplog.text
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from __future__ import annotations
2+
3+
import io
4+
5+
import pytest
6+
from pypdf import PdfReader
7+
8+
from unstructured_client._hooks.custom.pdf_utils import check_pdf, PDFValidationError
9+
from _test_unstructured_client.unit_utils import sample_docs_path
10+
11+
12+
def _open_pdf(pdf_path: str) -> PdfReader:
    """Read the file at *pdf_path* into memory and wrap it in a ``PdfReader``.

    The reader is built on an in-memory ``BytesIO`` copy of the file, so it
    does not depend on the file handle, which is closed before returning.
    """
    with open(pdf_path, "rb") as handle:
        buffer = io.BytesIO(handle.read())
    return PdfReader(buffer)
16+
17+
18+
def test_check_pdf_with_valid_pdf():
    """``check_pdf`` returns a ``PdfReader`` for the valid sample document."""
    reader = _open_pdf(sample_docs_path("list-item-example-1.pdf"))

    checked = check_pdf(reader)

    assert isinstance(checked, PdfReader)
24+
25+
26+
@pytest.mark.parametrize(
    ("pdf_name", "expected_error_message"),
    [
        ("failing-encrypted.pdf", "File is encrypted. Please decrypt it with password."),
        (
            "failing-missing-root.pdf",
            "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf",
        ),
        (
            "failing-missing-pages.pdf",
            "File does not appear to be a valid PDF. Error: Invalid object in /Pages",
        ),
    ],
)
def test_check_pdf_raises_pdf_validation_error(
    pdf_name: str, expected_error_message: str
):
    """``check_pdf`` raises ``PDFValidationError`` with the exact expected message.

    Each parametrized sample document is opened with ``_open_pdf`` and passed
    to ``check_pdf``; the raised error's ``message`` must equal the expectation.
    """
    reader = _open_pdf(sample_docs_path(pdf_name))

    with pytest.raises(PDFValidationError) as raised:
        check_pdf(reader)

    assert raised.value.message == expected_error_message

0 commit comments

Comments
 (0)