Commit 73cbbf9

Chore: throw 400 error if a PDF is password protected (#147)
* pip compile expect don't bump to 0.8.0
* Revert "pip compile expect don't bump to 0.8.0". This reverts commit 7a62df5.
* only do pycryptodome install
* disable test
* note
* stage for 400
* password protect 400 error
* add test file
* lint
* lint..
* put back coordinate test
* name nit: pdf_page_splits -> pdf_pages
* new pip-compile
* changelog
1 parent b23908d commit 73cbbf9

8 files changed, +162 -104 lines changed


CHANGELOG.md

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,6 @@
-## 0.0.32-dev0
+## 0.0.32-dev1
 
+* Throw 400 error if a PDF is password protected
 * Improve logging of params to single line json
 * Add support for `include_page_breaks` parameter
 
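
With this change, uploading a password-protected PDF fails fast with a 400 instead of erroring later inside the partitioning pipeline. A minimal client-side sketch of the new behavior, assuming a locally running instance on port 8000, the usual /general/v0/general route, and a hypothetical protected.pdf fixture (all three are assumptions, not part of this commit):

import requests

# Hypothetical encrypted file posted to a locally running instance.
url = "http://localhost:8000/general/v0/general"  # assumed route
with open("protected.pdf", "rb") as f:
    resp = requests.post(
        url,
        files={"files": ("protected.pdf", f, "application/pdf")},
    )

print(resp.status_code)  # expected: 400
print(resp.json())       # e.g. {"detail": "File: protected.pdf is encrypted. Please decrypt it with password."}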

pipeline-notebooks/pipeline-general.ipynb

Lines changed: 22 additions & 11 deletions
@@ -549,11 +549,10 @@
 "\n",
 "from concurrent.futures import ThreadPoolExecutor\n",
 "from functools import partial\n",
+"import pypdf\n",
 "from pypdf import PdfReader, PdfWriter\n",
 "from unstructured.partition.auto import partition\n",
 "from unstructured.staging.base import convert_to_isd, convert_to_dataframe, elements_from_json\n",
-"import tempfile\n",
-"import pdfminer\n",
 "import requests\n",
 "import time"
 ]
@@ -598,7 +597,7 @@
 "source": [
 "# pipeline-api\n",
 "\n",
-"def get_pdf_splits(pdf, split_size=1):\n",
+"def get_pdf_splits(pdf_pages, split_size=1):\n",
 " '''\n",
 " Given a pdf (PdfReader) with n pages, split it into pdfs each with split_size # of pages\n",
 " Return the files with their page offset in the form [( BytesIO, int)]\n",
@@ -607,12 +606,12 @@
 "\n",
 " offset = 0\n",
 "\n",
-" while offset < len(pdf.pages):\n",
+" while offset < len(pdf_pages):\n",
 " new_pdf = PdfWriter()\n",
 " pdf_buffer = io.BytesIO()\n",
 "\n",
 " end = offset+split_size\n",
-" for page in pdf.pages[offset : end]:\n",
+" for page in pdf_pages[offset : end]:\n",
 " new_pdf.add_page(page)\n",
 "\n",
 " new_pdf.write(pdf_buffer)\n",
@@ -678,7 +677,7 @@
 "\n",
 " return elements\n",
 "\n",
-"def partition_pdf_splits(request, file, file_filename, content_type, coordinates, **partition_kwargs):\n",
+"def partition_pdf_splits(request, pdf_pages, file, file_filename, content_type, coordinates, **partition_kwargs):\n",
 " '''\n",
 " Split a pdf into chunks and process in parallel with more api calls, or partition\n",
 " locally if the chunk is small enough. As soon as any remote call fails, bubble up\n",
@@ -691,10 +690,9 @@
 " partition_kwargs holds any others parameters that will be forwarded, or passed to partition\n",
 " ''' \n",
 " pages_per_pdf = int(os.environ.get(\"UNSTRUCTURED_PARALLEL_MODE_SPLIT_SIZE\", 1))\n",
-" pdf = PdfReader(file)\n",
 "\n",
 " # If it's small enough, just process locally\n",
-" if len(pdf.pages) <= pages_per_pdf:\n",
+" if len(pdf_pages) <= pages_per_pdf:\n",
 " return partition(\n",
 " file=file,\n",
 " file_filename=file_filename,\n",
@@ -703,7 +701,7 @@
 " )\n",
 "\n",
 " results = []\n",
-" page_tuples = get_pdf_splits(pdf, split_size=pages_per_pdf)\n",
+" page_tuples = get_pdf_splits(pdf_pages, split_size=pages_per_pdf)\n",
 " \n",
 " partition_func = partial(\n",
 " partition_file_via_api,\n",
@@ -771,6 +769,20 @@
 " # Note(yuming): convert file type for msg files\n",
 " # since fast api might sent the wrong one.\n",
 " file_content_type = \"application/x-ole-storage\"\n",
+" \n",
+" if filename.endswith(\".pdf\"):\n",
+" try: \n",
+" pdf = PdfReader(file)\n",
+" except pypdf.errors.EmptyFileError:\n",
+" raise HTTPException(\n",
+" status_code=400,\n",
+" detail=f\"{filename} does not appear to be a valid PDF\"\n",
+" )\n",
+" if pdf.is_encrypted:\n",
+" raise HTTPException(\n",
+" status_code=400,\n",
+" detail=f\"File: {filename} is encrypted. Please decrypt it with password.\"\n",
+" )\n",
 " \n",
 " strategy = (m_strategy[0] if len(m_strategy) else 'auto').lower()\n",
 " strategies = ['fast', 'hi_res', 'auto', 'ocr_only']\n",
@@ -828,6 +840,7 @@
 " if file_content_type == \"application/pdf\" and pdf_parallel_mode_enabled:\n",
 " elements = partition_pdf_splits(\n",
 " request,\n",
+" pdf_pages = pdf.pages,\n",
 " file=file,\n",
 " file_filename=filename,\n",
 " content_type=file_content_type,\n",
@@ -856,8 +869,6 @@
 " if 'Invalid file' in e.args[0]:\n",
 " raise HTTPException(status_code=400, detail=f\"{file_content_type} not currently supported\")\n",
 " raise e\n",
-" except pdfminer.pdfparser.PDFSyntaxError:\n",
-" raise HTTPException(status_code=400, detail=f\"{filename} does not appear to be a valid PDF\")\n",
 "\n",
 " if response_type == \"text/csv\":\n",
 " df = convert_to_dataframe(elements)\n",

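Aside from the new validation block, the notebook change is a refactor: get_pdf_splits and partition_pdf_splits now receive the already-parsed pdf_pages (a PdfReader.pages sequence) instead of re-opening the file with PdfReader, since pipeline_api now parses the PDF up front for the encryption check. A rough standalone sketch of that splitting loop, assuming a hypothetical local sample.pdf:

import io

from pypdf import PdfReader, PdfWriter

def split_pages(pdf_pages, split_size=1):
    # Mirrors get_pdf_splits: yield (BytesIO, page_offset) chunks of split_size pages.
    offset = 0
    while offset < len(pdf_pages):
        new_pdf = PdfWriter()
        pdf_buffer = io.BytesIO()
        for page in pdf_pages[offset : offset + split_size]:
            new_pdf.add_page(page)
        new_pdf.write(pdf_buffer)
        pdf_buffer.seek(0)
        yield pdf_buffer, offset
        offset += split_size

reader = PdfReader("sample.pdf")  # hypothetical input file
for chunk, page_offset in split_pages(reader.pages, split_size=2):
    print(f"chunk at page offset {page_offset}: {len(PdfReader(chunk).pages)} page(s)")
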
prepline_general/api/general.py

Lines changed: 21 additions & 10 deletions
@@ -20,10 +20,10 @@
 import pandas as pd
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
+import pypdf
 from pypdf import PdfReader, PdfWriter
 from unstructured.partition.auto import partition
 from unstructured.staging.base import convert_to_isd, convert_to_dataframe, elements_from_json
-import pdfminer
 import requests
 import time
 from unstructured_inference.models.chipper import MODEL_TYPES as CHIPPER_MODEL_TYPES
@@ -70,7 +70,7 @@ def is_expected_response_type(media_type, response_type):
 os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES
 
 
-def get_pdf_splits(pdf, split_size=1):
+def get_pdf_splits(pdf_pages, split_size=1):
 """
 Given a pdf (PdfReader) with n pages, split it into pdfs each with split_size # of pages
 Return the files with their page offset in the form [( BytesIO, int)]
@@ -79,12 +79,12 @@ def get_pdf_splits(pdf, split_size=1):
 
 offset = 0
 
-while offset < len(pdf.pages):
+while offset < len(pdf_pages):
 new_pdf = PdfWriter()
 pdf_buffer = io.BytesIO()
 
 end = offset + split_size
-for page in pdf.pages[offset:end]:
+for page in pdf_pages[offset:end]:
 new_pdf.add_page(page)
 
 new_pdf.write(pdf_buffer)
@@ -150,7 +150,7 @@ def partition_file_via_api(file_tuple, request, filename, content_type, **partit
 
 
 def partition_pdf_splits(
-request, file, file_filename, content_type, coordinates, **partition_kwargs
+request, pdf_pages, file, file_filename, content_type, coordinates, **partition_kwargs
 ):
 """
 Split a pdf into chunks and process in parallel with more api calls, or partition
@@ -164,16 +164,15 @@ def partition_pdf_splits(
 partition_kwargs holds any others parameters that will be forwarded, or passed to partition
 """
 pages_per_pdf = int(os.environ.get("UNSTRUCTURED_PARALLEL_MODE_SPLIT_SIZE", 1))
-pdf = PdfReader(file)
 
 # If it's small enough, just process locally
-if len(pdf.pages) <= pages_per_pdf:
+if len(pdf_pages) <= pages_per_pdf:
 return partition(
 file=file, file_filename=file_filename, content_type=content_type, **partition_kwargs
 )
 
 results = []
-page_tuples = get_pdf_splits(pdf, split_size=pages_per_pdf)
+page_tuples = get_pdf_splits(pdf_pages, split_size=pages_per_pdf)
 
 partition_func = partial(
 partition_file_via_api,
@@ -236,6 +235,19 @@ def pipeline_api(
 # since fast api might sent the wrong one.
 file_content_type = "application/x-ole-storage"
 
+if filename.endswith(".pdf"):
+try:
+pdf = PdfReader(file)
+except pypdf.errors.EmptyFileError:
+raise HTTPException(
+status_code=400, detail=f"{filename} does not appear to be a valid PDF"
+)
+if pdf.is_encrypted:
+raise HTTPException(
+status_code=400,
+detail=f"File: {filename} is encrypted. Please decrypt it with password.",
+)
+
 strategy = (m_strategy[0] if len(m_strategy) else "auto").lower()
 strategies = ["fast", "hi_res", "auto", "ocr_only"]
 if strategy not in strategies:
@@ -301,6 +313,7 @@ def pipeline_api(
 if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
 elements = partition_pdf_splits(
 request,
+pdf_pages=pdf.pages,
 file=file,
 file_filename=filename,
 content_type=file_content_type,
@@ -331,8 +344,6 @@ def pipeline_api(
 status_code=400, detail=f"{file_content_type} not currently supported"
 )
 raise e
-except pdfminer.pdfparser.PDFSyntaxError:
-raise HTTPException(status_code=400, detail=f"{filename} does not appear to be a valid PDF")
 
 if response_type == "text/csv":
 df = convert_to_dataframe(elements)
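
The squashed commit message mentions an added test file for the 400 path. A rough pytest-style sketch of how that behavior could be exercised end to end, with the encrypted fixture built in memory via pypdf; the app import path, route, and files field name are assumptions about how this API is mounted, not something this diff shows:

import io

from fastapi.testclient import TestClient
from pypdf import PdfWriter

from prepline_general.api.app import app  # assumed import path for the FastAPI app

def test_encrypted_pdf_returns_400():
    # Build a one-page, password-protected PDF entirely in memory.
    writer = PdfWriter()
    writer.add_blank_page(width=72, height=72)
    writer.encrypt("secret")  # default RC4 encryption; AES variants need a crypto backend
    buf = io.BytesIO()
    writer.write(buf)
    buf.seek(0)

    client = TestClient(app)
    resp = client.post(
        "/general/v0/general",  # assumed route for the general pipeline
        files=[("files", ("encrypted.pdf", buf, "application/pdf"))],
    )

    assert resp.status_code == 400
    assert "encrypted" in resp.json()["detail"]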

requirements/base.in

Lines changed: 1 addition & 1 deletion
@@ -8,5 +8,5 @@ click==8.1.3
 ratelimit
 requests
 pypdf
-
+pycryptodome
 
requirements/base.txt

Lines changed: 20 additions & 17 deletions
@@ -20,7 +20,7 @@ beautifulsoup4==4.12.2
 # via nbconvert
 bleach==6.0.0
 # via nbconvert
-certifi==2023.5.7
+certifi==2023.7.22
 # via requests
 cffi==1.15.1
 # via cryptography
@@ -54,7 +54,7 @@ exceptiongroup==1.1.2
 # via anyio
 fastapi==0.100.0
 # via unstructured-api-tools
-fastjsonschema==2.17.1
+fastjsonschema==2.18.0
 # via nbformat
 filelock==3.12.2
 # via
@@ -65,7 +65,7 @@ filetype==1.2.0
 # via unstructured
 flatbuffers==23.5.26
 # via onnxruntime
-fonttools==4.41.0
+fonttools==4.41.1
 # via matplotlib
 fsspec==2023.6.0
 # via huggingface-hub
@@ -103,9 +103,9 @@ jinja2==3.1.2
 # unstructured-api-tools
 joblib==1.3.1
 # via nltk
-jsonschema==4.18.3
+jsonschema==4.18.4
 # via nbformat
-jsonschema-specifications==2023.6.1
+jsonschema-specifications==2023.7.1
 # via jsonschema
 jupyter-client==8.3.0
 # via nbclient
@@ -146,7 +146,7 @@ mypy-extensions==1.0.0
 # via mypy
 nbclient==0.8.0
 # via nbconvert
-nbconvert==7.7.1
+nbconvert==7.7.2
 # via unstructured-api-tools
 nbformat==5.9.1
 # via
@@ -203,7 +203,7 @@ pdfminer-six==20221105
 # via
 # pdfplumber
 # unstructured
-pdfplumber==0.10.0
+pdfplumber==0.10.1
 # via layoutparser
 pillow==10.0.0
 # via
@@ -227,6 +227,8 @@ pycocotools==2.0.6
 # via effdet
 pycparser==2.21
 # via cffi
+pycryptodome==3.18.0
+# via -r requirements/base.in
 pydantic==1.10.11
 # via
 # -r requirements/base.in
@@ -239,7 +241,7 @@ pypandoc==1.11
 # via unstructured
 pyparsing==3.0.9
 # via matplotlib
-pypdf==3.12.2
+pypdf==3.13.0
 # via -r requirements/base.in
 pypdfium2==4.18.0
 # via pdfplumber
@@ -264,7 +266,7 @@ python-pptx==0.6.21
 # via unstructured
 pytz==2023.3
 # via pandas
-pyyaml==6.0
+pyyaml==6.0.1
 # via
 # huggingface-hub
 # layoutparser
@@ -276,7 +278,7 @@ pyzmq==25.1.0
 # via jupyter-client
 ratelimit==2.2.1
 # via -r requirements/base.in
-referencing==0.29.1
+referencing==0.30.0
 # via
 # jsonschema
 # jsonschema-specifications
@@ -291,7 +293,7 @@ requests==2.31.0
 # torchvision
 # transformers
 # unstructured
-rpds-py==0.8.11
+rpds-py==0.9.2
 # via
 # jsonschema
 # referencing
@@ -353,13 +355,13 @@ traitlets==5.9.0
 # nbclient
 # nbconvert
 # nbformat
-transformers==4.30.2
+transformers==4.31.0
 # via unstructured-inference
-types-requests==2.31.0.1
+types-requests==2.31.0.2
 # via unstructured-api-tools
-types-ujson==5.8.0.0
+types-ujson==5.8.0.1
 # via unstructured-api-tools
-types-urllib3==1.26.25.13
+types-urllib3==1.26.25.14
 # via types-requests
 typing-extensions==4.7.1
 # via
@@ -371,6 +373,7 @@ typing-extensions==4.7.1
 # pypdf
 # starlette
 # torch
+# uvicorn
 tzdata==2023.3
 # via pandas
 unstructured[local-inference]==0.8.1
@@ -379,9 +382,9 @@ unstructured-api-tools==0.10.10
 # via -r requirements/base.in
 unstructured-inference==0.5.5
 # via unstructured
-urllib3==2.0.3
+urllib3==2.0.4
 # via requests
-uvicorn[standard]==0.23.0
+uvicorn[standard]==0.23.1
 # via unstructured-api-tools
 uvloop==0.17.0
 # via uvicorn
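
pycryptodome lands in the lockfile presumably because pypdf delegates the actual cryptography to an optional backend: checking is_encrypted needs nothing extra, but decrypting AES-protected files does. A rough sketch of that distinction, assuming a hypothetical AES-encrypted aes-protected.pdf with password "secret"; the exact exception type is my reading of pypdf and should be treated as an assumption:

from pypdf import PdfReader
from pypdf.errors import DependencyError

reader = PdfReader("aes-protected.pdf")  # hypothetical AES-encrypted file

if reader.is_encrypted:  # works without any crypto backend installed
    try:
        reader.decrypt("secret")
    except DependencyError:
        # pypdf raises this when the file uses AES and neither
        # pycryptodome nor cryptography is installed.
        print("install pycryptodome to decrypt AES-protected PDFs")
    else:
        print(f"decrypted: {len(reader.pages)} page(s) readable")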
