Skip to content

Commit a21ba5e

Browse files
committed
Add password with PDF files
1 parent ecf0267 commit a21ba5e

File tree

8 files changed

+63
-9
lines changed

8 files changed

+63
-9
lines changed

unstructured/partition/image.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ def partition_image(
3232
starting_page_number: int = 1,
3333
extract_forms: bool = False,
3434
form_extraction_skip_tables: bool = True,
35+
password: Optional[str] = None,
3536
**kwargs: Any,
3637
) -> list[Element]:
3738
"""Parses an image into a list of interpreted elements.
@@ -91,6 +92,8 @@ def partition_image(
9192
(results in adding FormKeysValues elements to output).
9293
form_extraction_skip_tables
9394
Whether the form extraction logic should ignore regions designated as Tables.
95+
password
96+
The password to decrypt the PDF file.
9497
"""
9598
exactly_one(filename=filename, file=file)
9699

@@ -113,5 +116,6 @@ def partition_image(
113116
starting_page_number=starting_page_number,
114117
extract_forms=extract_forms,
115118
form_extraction_skip_tables=form_extraction_skip_tables,
119+
password=password,
116120
**kwargs,
117121
)

unstructured/partition/pdf.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ def partition_pdf(
133133
starting_page_number: int = 1,
134134
extract_forms: bool = False,
135135
form_extraction_skip_tables: bool = True,
136+
password: Optional[str] = None,
136137
**kwargs: Any,
137138
) -> list[Element]:
138139
"""Parses a pdf document into a list of interpreted elements.
@@ -213,6 +214,7 @@ def partition_pdf(
213214
starting_page_number=starting_page_number,
214215
extract_forms=extract_forms,
215216
form_extraction_skip_tables=form_extraction_skip_tables,
217+
password=password,
216218
**kwargs,
217219
)
218220

@@ -234,6 +236,7 @@ def partition_pdf_or_image(
234236
starting_page_number: int = 1,
235237
extract_forms: bool = False,
236238
form_extraction_skip_tables: bool = True,
239+
password: Optional[str] = None,
237240
**kwargs: Any,
238241
) -> list[Element]:
239242
"""Parses a pdf or image document into a list of interpreted elements."""
@@ -262,6 +265,7 @@ def partition_pdf_or_image(
262265
languages=languages,
263266
metadata_last_modified=metadata_last_modified or last_modified,
264267
starting_page_number=starting_page_number,
268+
password=password,
265269
**kwargs,
266270
)
267271
pdf_text_extractable = any(
@@ -311,6 +315,7 @@ def partition_pdf_or_image(
311315
starting_page_number=starting_page_number,
312316
extract_forms=extract_forms,
313317
form_extraction_skip_tables=form_extraction_skip_tables,
318+
password=password,
314319
**kwargs,
315320
)
316321
out_elements = _process_uncategorized_text_elements(elements)
@@ -336,6 +341,7 @@ def partition_pdf_or_image(
336341
is_image=is_image,
337342
metadata_last_modified=metadata_last_modified or last_modified,
338343
starting_page_number=starting_page_number,
344+
password=password,
339345
**kwargs,
340346
)
341347
out_elements = _process_uncategorized_text_elements(elements)
@@ -349,6 +355,7 @@ def extractable_elements(
349355
languages: Optional[list[str]] = None,
350356
metadata_last_modified: Optional[str] = None,
351357
starting_page_number: int = 1,
358+
password: Optional[str] = None,
352359
**kwargs: Any,
353360
) -> list[list[Element]]:
354361
if isinstance(file, bytes):
@@ -359,6 +366,7 @@ def extractable_elements(
359366
languages=languages,
360367
metadata_last_modified=metadata_last_modified,
361368
starting_page_number=starting_page_number,
369+
password=password,
362370
**kwargs,
363371
)
364372

@@ -369,6 +377,7 @@ def _partition_pdf_with_pdfminer(
369377
languages: list[str],
370378
metadata_last_modified: Optional[str],
371379
starting_page_number: int = 1,
380+
password: Optional[str] = None,
372381
**kwargs: Any,
373382
) -> list[list[Element]]:
374383
"""Partitions a PDF using PDFMiner instead of using a layoutmodel. Used for faster
@@ -402,6 +411,7 @@ def _partition_pdf_with_pdfminer(
402411
languages=languages,
403412
metadata_last_modified=metadata_last_modified,
404413
starting_page_number=starting_page_number,
414+
password=password,
405415
**kwargs,
406416
)
407417

@@ -416,14 +426,16 @@ def _process_pdfminer_pages(
416426
metadata_last_modified: Optional[str],
417427
annotation_threshold: Optional[float] = env_config.PDF_ANNOTATION_THRESHOLD,
418428
starting_page_number: int = 1,
429+
password: Optional[str] = None,
419430
**kwargs,
420431
) -> list[list[Element]]:
421432
"""Uses PDFMiner to split a document into pages and process them."""
422433

423434
elements = []
424435

425436
for page_number, (page, page_layout) in enumerate(
426-
open_pdfminer_pages_generator(fp), start=starting_page_number
437+
open_pdfminer_pages_generator(fp, password=password),
438+
start=starting_page_number,
427439
):
428440
width, height = page_layout.width, page_layout.height
429441

@@ -545,6 +557,7 @@ def _partition_pdf_or_image_local(
545557
extract_forms: bool = False,
546558
form_extraction_skip_tables: bool = True,
547559
pdf_hi_res_max_pages: Optional[int] = None,
560+
password: Optional[str] = None,
548561
**kwargs: Any,
549562
) -> list[Element]:
550563
"""Partition using package installed locally"""
@@ -586,6 +599,7 @@ def _partition_pdf_or_image_local(
586599
is_image=is_image,
587600
model_name=hi_res_model_name,
588601
pdf_image_dpi=pdf_image_dpi,
602+
password=password,
589603
)
590604

591605
if hi_res_model_name.startswith("chipper"):
@@ -594,7 +608,8 @@ def _partition_pdf_or_image_local(
594608
final_document_layout = inferred_document_layout
595609
else:
596610
extracted_layout = (
597-
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi)
611+
process_file_with_pdfminer(filename=filename, dpi=pdf_image_dpi,
612+
password=password)
598613
if pdf_text_extractable
599614
else []
600615
)
@@ -634,13 +649,15 @@ def _partition_pdf_or_image_local(
634649
ocr_mode=ocr_mode,
635650
pdf_image_dpi=pdf_image_dpi,
636651
ocr_layout_dumper=ocr_layout_dumper,
652+
password=password,
637653
)
638654
else:
639655
inferred_document_layout = process_data_with_model(
640656
file,
641657
is_image=is_image,
642658
model_name=hi_res_model_name,
643659
pdf_image_dpi=pdf_image_dpi,
660+
password=password,
644661
)
645662

646663
if hi_res_model_name.startswith("chipper"):
@@ -652,7 +669,8 @@ def _partition_pdf_or_image_local(
652669
file.seek(0)
653670

654671
extracted_layout = (
655-
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi)
672+
process_data_with_pdfminer(file=file, dpi=pdf_image_dpi,
673+
password=password)
656674
if pdf_text_extractable
657675
else []
658676
)
@@ -694,6 +712,7 @@ def _partition_pdf_or_image_local(
694712
ocr_mode=ocr_mode,
695713
pdf_image_dpi=pdf_image_dpi,
696714
ocr_layout_dumper=ocr_layout_dumper,
715+
password=password,
697716
)
698717

699718
# NOTE(alan): starting with v2, chipper sorts the elements itself.
@@ -734,6 +753,7 @@ def _partition_pdf_or_image_local(
734753
pdf_image_dpi=pdf_image_dpi,
735754
extract_image_block_to_payload=extract_image_block_to_payload,
736755
output_dir_path=extract_image_block_output_dir,
756+
password=password,
737757
)
738758

739759
for el_type in extract_image_block_types:
@@ -807,6 +827,7 @@ def _partition_pdf_or_image_local(
807827
draw_caption=env_config.ANALYSIS_BBOX_DRAW_CAPTION,
808828
resize=env_config.ANALYSIS_BBOX_RESIZE,
809829
format=env_config.ANALYSIS_BBOX_FORMAT,
830+
password=password,
810831
)
811832

812833
return out_elements
@@ -845,6 +866,7 @@ def _partition_pdf_or_image_with_ocr(
845866
is_image: bool = False,
846867
metadata_last_modified: Optional[str] = None,
847868
starting_page_number: int = 1,
869+
password: Optional[str] = None,
848870
**kwargs: Any,
849871
):
850872
"""Partitions an image or PDF using OCR. For PDFs, each page is converted
@@ -869,7 +891,8 @@ def _partition_pdf_or_image_with_ocr(
869891
elements.extend(page_elements)
870892
else:
871893
for page_number, image in enumerate(
872-
convert_pdf_to_images(filename, file), start=starting_page_number
894+
convert_pdf_to_images(filename, file, password=password),
895+
start=starting_page_number
873896
):
874897
page_elements = _partition_pdf_or_image_with_ocr_from_image(
875898
image=image,

unstructured/partition/pdf_image/analysis/bbox_visualisation.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,7 @@ def __init__(
546546
draw_grid: bool = False,
547547
resize: Optional[float] = None,
548548
format: str = "png",
549+
password: Optional[str] = None,
549550
):
550551
self.draw_caption = draw_caption
551552
self.draw_grid = draw_grid
@@ -554,6 +555,7 @@ def __init__(
554555
self.format = format
555556
self.drawers = []
556557
self.file = file
558+
self.password = password
557559

558560
super().__init__(filename, save_dir)
559561

@@ -678,6 +680,7 @@ def load_source_image(self) -> Generator[Image.Image, None, None]:
678680
file=self.file,
679681
output_folder=temp_dir,
680682
path_only=True,
683+
password=self.password,
681684
)
682685
except Exception as ex: # noqa: E722
683686
print(

unstructured/partition/pdf_image/analysis/tools.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ def save_analysis_artifiacts(
6666
draw_caption: bool = True,
6767
resize: Optional[float] = None,
6868
format: str = "png",
69+
password: Optional[str] = None,
6970
):
7071
"""Save the analysis artifacts for a given file. Loads some settings from
7172
the environment configuration.
@@ -109,6 +110,7 @@ def save_analysis_artifiacts(
109110
draw_caption=draw_caption,
110111
resize=resize,
111112
format=format,
113+
password=password,
112114
)
113115

114116
for layout_dumper in layout_dumpers:
@@ -125,6 +127,7 @@ def render_bboxes_for_file(
125127
draw_caption: bool = True,
126128
resize: Optional[float] = None,
127129
format: str = "png",
130+
password: Optional[str] = None,
128131
):
129132
"""Render the bounding boxes for a given layout dimp file.
130133
To be used for analysis after the partition is performed for
@@ -183,6 +186,7 @@ def render_bboxes_for_file(
183186
draw_caption=draw_caption,
184187
resize=resize,
185188
format=format,
189+
password=password,
186190
)
187191

188192
for drawer in layout_drawers:

unstructured/partition/pdf_image/ocr.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ def process_data_with_ocr(
3737
ocr_mode: str = OCRMode.FULL_PAGE.value,
3838
pdf_image_dpi: int = 200,
3939
ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
40+
password: Optional[str] = None,
4041
) -> "DocumentLayout":
4142
"""
4243
Process OCR data from a given data and supplement the output DocumentLayout
@@ -64,6 +65,8 @@ def process_data_with_ocr(
6465
6566
- ocr_layout_dumper (OCRLayoutDumper, optional): The OCR layout dumper to save the OCR layout.
6667
68+
- password (str, optional): The password to decrypt the PDF file. Defaults to None.
69+
6770
Returns:
6871
DocumentLayout: The merged layout information obtained after OCR processing.
6972
"""
@@ -84,6 +87,7 @@ def process_data_with_ocr(
8487
ocr_mode=ocr_mode,
8588
pdf_image_dpi=pdf_image_dpi,
8689
ocr_layout_dumper=ocr_layout_dumper,
90+
password=password,
8791
)
8892

8993
return merged_layouts
@@ -100,6 +104,7 @@ def process_file_with_ocr(
100104
ocr_mode: str = OCRMode.FULL_PAGE.value,
101105
pdf_image_dpi: int = 200,
102106
ocr_layout_dumper: Optional[OCRLayoutDumper] = None,
107+
password: Optional[str] = None,
103108
) -> "DocumentLayout":
104109
"""
105110
Process OCR data from a given file and supplement the output DocumentLayout
@@ -124,6 +129,8 @@ def process_file_with_ocr(
124129
125130
- pdf_image_dpi (int, optional): DPI (dots per inch) for processing PDF images. Defaults to 200.
126131
132+
- password (str, optional): The password to decrypt the PDF file. Defaults to None.
133+
127134
Returns:
128135
DocumentLayout: The merged layout information obtained after OCR processing.
129136
"""
@@ -157,6 +164,7 @@ def process_file_with_ocr(
157164
dpi=pdf_image_dpi,
158165
output_folder=temp_dir,
159166
paths_only=True,
167+
userpw=password,
160168
)
161169
image_paths = cast(List[str], _image_paths)
162170
for i, image_path in enumerate(image_paths):

unstructured/partition/pdf_image/pdf_image_utils.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ def convert_pdf_to_image(
5858
dpi: int = 200,
5959
output_folder: Optional[Union[str, PurePath]] = None,
6060
path_only: bool = False,
61+
password: Optional[str] = None,
6162
) -> Union[List[Image.Image], List[str]]:
6263
"""Get the image renderings of the pdf pages using pdf2image"""
6364

@@ -71,6 +72,7 @@ def convert_pdf_to_image(
7172
dpi=dpi,
7273
output_folder=output_folder,
7374
paths_only=path_only,
75+
userpw=password,
7476
)
7577
else:
7678
images = pdf2image.convert_from_path(
@@ -125,6 +127,7 @@ def save_elements(
125127
is_image: bool = False,
126128
extract_image_block_to_payload: bool = False,
127129
output_dir_path: str | None = None,
130+
password: Optional[str] = None,
128131
):
129132
"""
130133
Saves specific elements from a PDF as images either to a directory or embeds them in the
@@ -167,6 +170,7 @@ def save_elements(
167170
pdf_image_dpi,
168171
output_folder=temp_dir,
169172
path_only=True,
173+
password=password,
170174
)
171175
image_paths = cast(List[str], _image_paths)
172176

@@ -389,15 +393,16 @@ def convert_pdf_to_images(
389393
filename: str = "",
390394
file: Optional[bytes | IO[bytes]] = None,
391395
chunk_size: int = 10,
396+
password: Optional[str] = None,
392397
) -> Iterator[Image.Image]:
393398
# Convert a PDF in small chunks of pages at a time (e.g. 1-10, 11-20... and so on)
394399
exactly_one(filename=filename, file=file)
395400
if file is not None:
396401
f_bytes = convert_to_bytes(file)
397-
info = pdf2image.pdfinfo_from_bytes(f_bytes)
402+
info = pdf2image.pdfinfo_from_bytes(f_bytes, userpw=password)
398403
else:
399404
f_bytes = None
400-
info = pdf2image.pdfinfo_from_path(filename)
405+
info = pdf2image.pdfinfo_from_path(filename, userpw=password)
401406

402407
total_pages = info["Pages"]
403408
for start_page in range(1, total_pages + 1, chunk_size):
@@ -407,12 +412,14 @@ def convert_pdf_to_images(
407412
f_bytes,
408413
first_page=start_page,
409414
last_page=end_page,
415+
userpw=password,
410416
)
411417
else:
412418
chunk_images = pdf2image.convert_from_path(
413419
filename,
414420
first_page=start_page,
415421
last_page=end_page,
422+
userpw=password,
416423
)
417424

418425
for image in chunk_images:

0 commit comments

Comments
 (0)