Skip to content

Commit c1637cb

Browse files
test: unit and integration test pdf check thrown error code and message
1 parent 5b6d1c4 commit c1637cb

File tree

6 files changed

+97
-0
lines changed

6 files changed

+97
-0
lines changed

sample-docs/failing-encrypted.pdf

936 Bytes
Binary file not shown.

sample-docs/failing-invalid.pdf

70 Bytes
Binary file not shown.
160 Bytes
Binary file not shown.
4.74 KB
Binary file not shown.

test_general/api/test_app.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1155,3 +1155,45 @@ def test_include_slide_notes(monkeypatch, test_default, include_slide_notes, tes
11551155
assert "Here are important notes" == df["text"][0]
11561156
else:
11571157
assert "Here are important notes" != df["text"][0]
1158+
1159+
1160+
@pytest.mark.parametrize(
    ("pdf_name", "expected_error_message"),
    [
        ("failing-encrypted.pdf", "File is encrypted. Please decrypt it with password."),
        (
            "failing-invalid.pdf",
            "File does not appear to be a valid PDF. Error: Stream has ended unexpectedly",
        ),
        (
            "failing-missing-root.pdf",
            "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf",
        ),
        (
            "failing-missing-pages.pdf",
            "File does not appear to be a valid PDF. Error: Invalid object in /Pages",
        ),
    ],
)
@pytest.mark.parametrize("strategy", ["auto", "fast", "hi_res", "ocr_only"])
def test_failing_pdfs_return_422(pdf_name: str, expected_error_message: str, strategy: str):
    """Post each failing sample PDF to the main API route and check the error response.

    Every combination of sample file and ``strategy`` form value must produce
    HTTP status 422 whose JSON ``detail`` equals ``expected_error_message``.
    """
    sample_path = Path(__file__).parent.parent.parent / "sample-docs" / pdf_name
    api_client = TestClient(app)

    # Keep the file handle open for the whole request so the upload can be read.
    with open(sample_path, "rb") as pdf_stream:
        upload = ("files", (str(sample_path), pdf_stream))
        response = api_client.post(
            MAIN_API_ROUTE,
            files=[upload],
            data={"strategy": strategy},
        )

    assert response.status_code == 422
    returned_detail = str(response.json()["detail"])
    assert expected_error_message == returned_detail

test_general/api/test_general.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from __future__ import annotations
2+
3+
import io
4+
from pathlib import Path
5+
6+
import pytest
7+
from fastapi import HTTPException
8+
from pypdf import PdfReader
9+
10+
from prepline_general.api.general import _check_pdf
11+
12+
# "sample-docs" directory located two levels above the directory containing this
# test module; the tests below load their PDF fixtures from here.
TEST_ASSETS_DIR = Path(__file__).parent.parent.parent / "sample-docs"
13+
14+
15+
def _open_pdf(pdf_path: str) -> io.BytesIO:
    """Load the file at ``pdf_path`` fully into memory.

    Returns a fresh ``io.BytesIO`` holding the file's bytes, positioned at the
    start, so callers get a file-like object that does not depend on an open
    handle to the file on disk.
    """
    raw_bytes = Path(pdf_path).read_bytes()
    return io.BytesIO(raw_bytes)
19+
20+
21+
def test_check_pdf_with_valid_pdf():
    """``_check_pdf`` returns a ``PdfReader`` for the ``list-item-example.pdf`` sample."""
    sample_stream = _open_pdf(str(TEST_ASSETS_DIR / "list-item-example.pdf"))

    reader = _check_pdf(sample_stream)

    assert isinstance(reader, PdfReader)
27+
28+
29+
@pytest.mark.parametrize(
    ("pdf_name", "expected_error_message"),
    [
        ("failing-encrypted.pdf", "File is encrypted. Please decrypt it with password."),
        (
            "failing-invalid.pdf",
            "File does not appear to be a valid PDF. Error: Stream has ended unexpectedly",
        ),
        (
            "failing-missing-root.pdf",
            "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf",
        ),
        (
            "failing-missing-pages.pdf",
            "File does not appear to be a valid PDF. Error: Invalid object in /Pages",
        ),
    ],
)
def test_check_pdf_with_invalid_pdf(pdf_name: str, expected_error_message: str):
    """``_check_pdf`` raises ``HTTPException`` with status 422 for each failing sample PDF.

    The exception's ``detail`` must equal ``expected_error_message`` exactly.
    """
    failing_stream = _open_pdf(str(TEST_ASSETS_DIR / pdf_name))

    with pytest.raises(HTTPException) as raised:
        _check_pdf(failing_stream)

    http_error = raised.value
    assert http_error.status_code == 422
    assert expected_error_message == str(http_error.detail)

0 commit comments

Comments
 (0)