Unstructured-IO · Coniferish · Jan 30, 2025 · Oct 15, 2024 · Oct 24, 2024 · Oct 25, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,7 @@
 
 ## 0.8.3
 
+* fix: add `password` parameter to support password-protected PDFs
 * fix: removed `layoutelement.from_lp_textblock()` and related tests as it's not used
 * fix: update requirements to drop `layoutparser` lib
 * fix: update `README.md` to remove layoutparser model zoo support note

diff --git a/sample-docs/password.pdf b/sample-docs/password.pdf
diff --git a/test_unstructured_inference/inference/test_layout.py b/test_unstructured_inference/inference/test_layout.py
@@ -302,6 +302,25 @@ def mock_get_elements(self, *args, **kwargs):
             assert page.image is None
 
 
+@pytest.mark.slow()
+def test_from_file_with_password(monkeypatch, mock_final_layout):
+    """A password-protected PDF loads via both the path and the stream entry points."""
+    pdf_path = "sample-docs/password.pdf"
+    pdf_password = "password"
+
+    # Path-based entry point.
+    from_path = layout.DocumentLayout.from_file(pdf_path, password=pdf_password)
+    assert from_path
+
+    # Stream-based entry point, with the model lookup and detection class mocked out.
+    monkeypatch.setattr(layout, "get_model", lambda x: MockLayoutModel(mock_final_layout))
+    detection_cls = "unstructured_inference.inference.layout.UnstructuredObjectDetectionModel"
+    with patch(detection_cls, MockLayoutModel), open(pdf_path, mode="rb") as stream:
+        from_stream = layout.process_data_with_model(stream, model_name="fake", password=pdf_password)
+        assert from_stream
+
+
+
 def test_from_image_file_raises_with_empty_fn():
     with pytest.raises(FileNotFoundError):
         layout.DocumentLayout.from_image_file("")
@@ -544,6 +563,7 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m
             detection_model=detection_model,
             element_extraction_model=element_extraction_model,
             fixed_layouts=None,
+            password=None,
             pdf_image_dpi=200,
         )
 

diff --git a/test_unstructured_inference/models/test_tables.py b/test_unstructured_inference/models/test_tables.py
@@ -1,4 +1,5 @@
 import os
+from copy import deepcopy
 
 import numpy as np
 import pytest
@@ -7,11 +8,13 @@
 from transformers.models.table_transformer.modeling_table_transformer import (
     TableTransformerDecoder,
 )
-from copy import deepcopy
 
 import unstructured_inference.models.table_postprocess as postprocess
 from unstructured_inference.models import tables
-from unstructured_inference.models.tables import apply_thresholds_on_objects, structure_to_cells
+from unstructured_inference.models.tables import (
+    apply_thresholds_on_objects,
+    structure_to_cells,
+)
 
 skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"}
 

diff --git a/unstructured_inference/inference/layout.py b/unstructured_inference/inference/layout.py
@@ -51,6 +51,7 @@ def from_file(
         filename: str,
         fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
         pdf_image_dpi: int = 200,
+        password: Optional[str] = None,
         **kwargs,
     ) -> DocumentLayout:
         """Creates a DocumentLayout from a pdf file."""
@@ -62,6 +63,7 @@ def from_file(
                 pdf_image_dpi,
                 output_folder=temp_dir,
                 path_only=True,
+                password=password,
             )
             image_paths = cast(List[str], _image_paths)
             number_of_pages = len(image_paths)
@@ -133,6 +135,7 @@ def __init__(
         document_filename: Optional[Union[str, PurePath]] = None,
         detection_model: Optional[UnstructuredObjectDetectionModel] = None,
         element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
+        password: Optional[str] = None,
     ):
         if detection_model is not None and element_extraction_model is not None:
             raise ValueError("Only one of detection_model and extraction_model should be passed.")
@@ -148,6 +151,7 @@ def __init__(
         self.element_extraction_model = element_extraction_model
         self.elements: Collection[LayoutElement] = []
         self.elements_array: LayoutElements | None = None
+        self.password = password
         # NOTE(alan): Dropped LocationlessLayoutElement that was created for chipper - chipper has
         # locations now and if we need to support LayoutElements without bounding boxes we can make
         # the bbox property optional
@@ -325,6 +329,7 @@ def from_image(
 def process_data_with_model(
     data: BinaryIO,
     model_name: Optional[str],
+    password: Optional[str] = None,
     **kwargs: Any,
 ) -> DocumentLayout:
     """Process PDF as file-like object `data` into a `DocumentLayout`.
@@ -339,6 +344,7 @@ def process_data_with_model(
         layout = process_file_with_model(
             file_path,
             model_name,
+            password=password,
             **kwargs,
         )
 
@@ -351,6 +357,7 @@ def process_file_with_model(
     is_image: bool = False,
     fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
     pdf_image_dpi: int = 200,
+    password: Optional[str] = None,
     **kwargs: Any,
 ) -> DocumentLayout:
     """Processes pdf file with name filename into a DocumentLayout by using a model identified by
@@ -379,6 +386,7 @@ def process_file_with_model(
             element_extraction_model=element_extraction_model,
             fixed_layouts=fixed_layouts,
             pdf_image_dpi=pdf_image_dpi,
+            password=password,
             **kwargs,
         )
     )
@@ -390,6 +398,7 @@ def convert_pdf_to_image(
     dpi: int = 200,
     output_folder: Optional[Union[str, PurePath]] = None,
     path_only: bool = False,
+    password: Optional[str] = None,
 ) -> Union[List[Image.Image], List[str]]:
     """Get the image renderings of the pdf pages using pdf2image"""
 
@@ -402,12 +411,14 @@ def convert_pdf_to_image(
             dpi=dpi,
             output_folder=output_folder,
             paths_only=path_only,
+            userpw=password,
         )
     else:
         images = pdf2image.convert_from_path(
             filename,
             dpi=dpi,
             paths_only=path_only,
+            userpw=password,
         )
 
     return images