Skip to content

Commit 1d799b9

Browse files
fix: Return 422 for failing pdfs (#518)
1 parent 6468aa6 commit 1d799b9

File tree

10 files changed

+132
-10
lines changed

10 files changed

+132
-10
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
## 0.0.88
2+
* Return 422 HTTP code when PDF can't be processed
3+
14
## 0.0.87
25
* Patch various CVEs
36
* Enable pytest concurrency
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.87" # pragma: no cover
1+
__version__ = "0.0.88" # pragma: no cover

prepline_general/api/general.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -470,20 +470,38 @@ def _check_free_memory():
470470

471471

472472
def _check_pdf(file: IO[bytes]):
473-
"""Check if the PDF file is encrypted, otherwise assume it is not a valid PDF."""
473+
"""
474+
Check if PDF is:
475+
- Encrypted
476+
- Has corrupted pages
477+
- Has corrupted root object
478+
479+
Throws:
480+
- HTTPException 442 UNPROCESSABLE ENTITY if file is encrypted or corrupted
481+
"""
474482
try:
475483
pdf = PdfReader(file)
476484

477485
# This will raise if the file is encrypted
478486
pdf.metadata
487+
488+
# This will raise if the file's root object is corrupted
489+
pdf.root_object
490+
491+
# This will raise if the file's pages are corrupted
492+
list(pdf.pages)
493+
479494
return pdf
480495
except FileNotDecryptedError:
481496
raise HTTPException(
482-
status_code=400,
497+
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
483498
detail="File is encrypted. Please decrypt it with password.",
484499
)
485-
except PdfReadError:
486-
raise HTTPException(status_code=422, detail="File does not appear to be a valid PDF")
500+
except PdfReadError as e:
501+
raise HTTPException(
502+
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
503+
detail=f"File does not appear to be a valid PDF. Error: {e}",
504+
)
487505

488506

489507
def _validate_strategy(strategy: str) -> str:

preprocessing-pipeline-family.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
name: general
2-
version: 0.0.87
2+
version: 0.0.88

sample-docs/failing-encrypted.pdf

936 Bytes
Binary file not shown.

sample-docs/failing-invalid.pdf

70 Bytes
Binary file not shown.
160 Bytes
Binary file not shown.
4.74 KB
Binary file not shown.

test_general/api/test_app.py

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -495,7 +495,7 @@ def test_general_api_returns_422_bad_pdf():
495495
response = client.post(
496496
MAIN_API_ROUTE, files=[("files", (str(tmp.name), open(tmp.name, "rb"), "application/pdf"))]
497497
)
498-
assert response.json() == {"detail": "File does not appear to be a valid PDF"}
498+
assert "File does not appear to be a valid PDF" in response.json()["detail"]
499499
assert response.status_code == 422
500500
tmp.close()
501501

@@ -506,10 +506,56 @@ def test_general_api_returns_422_bad_pdf():
506506
files=[("files", (str(test_file), open(test_file, "rb"), "application/pdf"))],
507507
)
508508

509-
assert response.json() == {"detail": "File does not appear to be a valid PDF"}
509+
assert "File does not appear to be a valid PDF" in response.json()["detail"]
510510
assert response.status_code == 422
511511

512512

513+
@pytest.mark.parametrize(
514+
("pdf_name", "expected_error_message"),
515+
[
516+
(
517+
"failing-invalid.pdf",
518+
"File does not appear to be a valid PDF. Error: Stream has ended unexpectedly",
519+
),
520+
(
521+
"failing-missing-root.pdf",
522+
"File does not appear to be a valid PDF. Error: Cannot find Root object in pdf",
523+
),
524+
(
525+
"failing-missing-pages.pdf",
526+
"File does not appear to be a valid PDF. Error: Invalid object in /Pages",
527+
),
528+
],
529+
)
530+
@pytest.mark.parametrize(
531+
"strategy",
532+
[
533+
"auto",
534+
"fast",
535+
"hi_res",
536+
"ocr_only",
537+
],
538+
)
539+
def test_general_api_returns_422_invalid_pdf(
540+
pdf_name: str, expected_error_message: str, strategy: str
541+
):
542+
"""
543+
Verify that we get a 422 with the correct error message for invalid PDF files
544+
"""
545+
client = TestClient(app)
546+
test_file = Path(__file__).parent.parent.parent / "sample-docs" / pdf_name
547+
548+
with open(test_file, "rb") as f:
549+
response = client.post(
550+
MAIN_API_ROUTE,
551+
files=[("files", (str(test_file), f))],
552+
data={"strategy": strategy},
553+
)
554+
555+
assert response.status_code == 422
556+
assert expected_error_message == str(response.json()["detail"])
557+
558+
513559
def test_general_api_returns_503(monkeypatch):
514560
"""
515561
When available memory is below the minimum. return a 503, unless our origin ip is 10.{4,5}.x.x
@@ -939,13 +985,13 @@ def test_encrypted_pdf():
939985
writer.encrypt(user_password="password123")
940986
writer.write(temp_file.name)
941987

942-
# Response should be 400
988+
# Response should be 422
943989
response = client.post(
944990
MAIN_API_ROUTE,
945991
files=[("files", (str(temp_file.name), open(temp_file.name, "rb"), "application/pdf"))],
946992
)
947993
assert response.json() == {"detail": "File is encrypted. Please decrypt it with password."}
948-
assert response.status_code == 400
994+
assert response.status_code == 422
949995

950996
# This file is owner encrypted, i.e. readable with edit restrictions
951997
writer = PdfWriter()

test_general/api/test_general.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from __future__ import annotations
2+
3+
import io
4+
from pathlib import Path
5+
6+
import pytest
7+
from fastapi import HTTPException
8+
from pypdf import PdfReader
9+
10+
from prepline_general.api.general import _check_pdf
11+
12+
TEST_ASSETS_DIR = Path(__file__).parent.parent.parent / "sample-docs"
13+
14+
15+
def _open_pdf(pdf_path: str) -> io.BytesIO:
16+
with open(pdf_path, "rb") as f:
17+
pdf_content = f.read()
18+
return io.BytesIO(pdf_content)
19+
20+
21+
def test_check_pdf_with_valid_pdf():
22+
pdf_path = str(TEST_ASSETS_DIR / "list-item-example.pdf")
23+
pdf = _open_pdf(pdf_path)
24+
25+
result = _check_pdf(pdf)
26+
assert isinstance(result, PdfReader)
27+
28+
29+
@pytest.mark.parametrize(
30+
("pdf_name", "expected_error_message"),
31+
[
32+
("failing-encrypted.pdf", "File is encrypted. Please decrypt it with password."),
33+
(
34+
"failing-invalid.pdf",
35+
"File does not appear to be a valid PDF. Error: Stream has ended unexpectedly",
36+
),
37+
(
38+
"failing-missing-root.pdf",
39+
"File does not appear to be a valid PDF. Error: Cannot find Root object in pdf",
40+
),
41+
(
42+
"failing-missing-pages.pdf",
43+
"File does not appear to be a valid PDF. Error: Invalid object in /Pages",
44+
),
45+
],
46+
)
47+
def test_check_pdf_with_invalid_pdf(pdf_name: str, expected_error_message: str):
48+
pdf_path = str(TEST_ASSETS_DIR / pdf_name)
49+
pdf = _open_pdf(pdf_path)
50+
51+
with pytest.raises(HTTPException) as exc_info:
52+
_check_pdf(pdf)
53+
54+
assert exc_info.value.status_code == 422
55+
assert expected_error_message == str(exc_info.value.detail)

0 commit comments

Comments
 (0)