Skip to content

Commit 2ace6f5

Browse files
authored
fix: Do not return 400 error for encrypted but readable pdfs (#249)
Fixes #236. The `pdf.is_encrypted` check is true for files that only have edit protections, so we were returning a 400 "This file is encrypted" error for files that were perfectly readable. To verify, run this branch with `make run-web-app` and submit the PDF from the linked issue. Other changes: * Remove the filename from some error messages, since including it can make them far too verbose.
1 parent 8317021 commit 2ace6f5

File tree

4 files changed: 50 additions and 28 deletions.

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1-
## 0.0.47-dev1
1+
## 0.0.47
22

33
* **Adds `chunking_strategy` kwarg and associated params** These params allow users to "chunk" elements into larger or smaller `CompositeElement`s
44
* **Remove `parent_id` from the element metadata**. New metadata fields are causing errors with existing installs. We'll readd this once a fix is widely available.
5+
* **Fix some PDFs incorrectly returning a "file is encrypted" error**. The pypdf `PdfReader.is_encrypted` check caused us to return this error even when the file was readable.
6+
>>>>>>> main
57
68
## 0.0.46
79

prepline_general/api/general.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -273,14 +273,17 @@ def pipeline_api(
273273
if file_content_type == "application/pdf":
274274
try:
275275
pdf = PdfReader(file)
276+
277+
# This will raise if the file is encrypted
278+
pdf.metadata
276279
except pypdf.errors.EmptyFileError:
277280
raise HTTPException(
278-
status_code=400, detail=f"{filename} does not appear to be a valid PDF"
281+
status_code=400, detail=f"File does not appear to be a valid PDF"
279282
)
280-
if pdf.is_encrypted:
283+
except pypdf.errors.FileNotDecryptedError:
281284
raise HTTPException(
282285
status_code=400,
283-
detail=f"File: {filename} is encrypted. Please decrypt it with password.",
286+
detail=f"File is encrypted. Please decrypt it with password.",
284287
)
285288

286289
strategy = (m_strategy[0] if len(m_strategy) else "auto").lower()
@@ -475,7 +478,6 @@ def get_validated_mimetype(file):
475478
raise HTTPException(
476479
status_code=400,
477480
detail=(
478-
f"Unable to process {file.filename}: "
479481
f"File type {content_type} is not supported."
480482
),
481483
)
-4.45 MB
Binary file not shown.

test_general/api/test_app.py

Lines changed: 41 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import pandas as pd
77
from fastapi.testclient import TestClient
88
from fastapi import HTTPException
9+
from pypdf import PdfWriter, PdfReader
910
from unittest.mock import Mock, ANY
1011

1112
from prepline_general.api.app import app
@@ -355,9 +356,7 @@ def test_general_api_returns_400_unsupported_file(example_filename):
355356
response = client.post(
356357
MAIN_API_ROUTE, files=[("files", (str(test_file), open(test_file, "rb"), filetype))]
357358
)
358-
assert response.json() == {
359-
"detail": f"Unable to process {str(test_file)}: " f"File type {filetype} is not supported."
360-
}
359+
assert response.json() == {"detail": f"File type {filetype} is not supported."}
361360
assert response.status_code == 400
362361

363362

@@ -371,7 +370,7 @@ def test_general_api_returns_400_bad_pdf():
371370
response = client.post(
372371
MAIN_API_ROUTE, files=[("files", (str(tmp.name), open(tmp.name, "rb"), "application/pdf"))]
373372
)
374-
assert response.json() == {"detail": f"{tmp.name} does not appear to be a valid PDF"}
373+
assert response.json() == {"detail": "File does not appear to be a valid PDF"}
375374
assert response.status_code == 400
376375
tmp.close()
377376

@@ -576,25 +575,6 @@ def test_partition_file_via_api_not_retryable_error_code(monkeypatch, mocker):
576575
assert remote_partition.call_count < 4
577576

578577

579-
def test_password_protected_pdf():
580-
"""
581-
Verify we get a 400 error if the PDF is password protected
582-
"""
583-
client = TestClient(app)
584-
# a password protected pdf file, password is "password"
585-
test_file = Path("sample-docs") / "layout-parser-paper-password-protected.pdf"
586-
587-
response = client.post(
588-
MAIN_API_ROUTE,
589-
files=[("files", (str(test_file), open(test_file, "rb")))],
590-
data={"strategy": "fast"},
591-
)
592-
assert response.status_code == 400
593-
assert response.json() == {
594-
"detail": f"File: {str(test_file)} is encrypted. Please decrypt it with password."
595-
}
596-
597-
598578
def test_chunking_strategy_param():
599579
"""
600580
Verify that responses do not chunk elements unless requested
@@ -666,3 +646,41 @@ def test_chunking_strategy_additional_params():
666646
response_multipage_true_combine_chars_5000.json()
667647
!= response_from_multipage_false_combine_chars_0.json()
668648
)
649+
650+
651+
def test_encrypted_pdf():
652+
"""
653+
Test that we throw an error if a pdf is password protected.
654+
A pdf can be encrypted but still readable - don't throw an error here.
655+
"""
656+
client = TestClient(app)
657+
test_file = Path("sample-docs") / "layout-parser-paper-fast.pdf"
658+
original_pdf = PdfReader(test_file)
659+
660+
with tempfile.NamedTemporaryFile() as temp_file:
661+
# This file is user encrypted and cannot be read
662+
writer = PdfWriter()
663+
writer.append_pages_from_reader(original_pdf)
664+
writer.encrypt(user_password="password123")
665+
writer.write(temp_file.name)
666+
667+
# Response should be 400
668+
response = client.post(
669+
MAIN_API_ROUTE,
670+
files=[("files", (str(temp_file.name), open(temp_file.name, "rb"), "application/pdf"))],
671+
)
672+
assert response.json() == {"detail": "File is encrypted. Please decrypt it with password."}
673+
assert response.status_code == 400
674+
675+
# This file is owner encrypted, i.e. readable with edit restrictions
676+
writer = PdfWriter()
677+
writer.append_pages_from_reader(original_pdf)
678+
writer.encrypt(user_password="", owner_password="password123", permissions_flag=0b1100)
679+
writer.write(temp_file.name)
680+
681+
# Response should be 200
682+
response = client.post(
683+
MAIN_API_ROUTE,
684+
files=[("files", (str(temp_file.name), open(temp_file.name, "rb"), "application/pdf"))],
685+
)
686+
assert response.status_code == 200

0 commit comments

Comments
 (0)