Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.8.7

* fix: add a `password` parameter so that password-protected PDFs can be processed

## 0.8.6

* feat: add back `source` to `TextRegions` and `LayoutElements` for backward compatibility
Expand Down
Binary file added sample-docs/password.pdf
Binary file not shown.
16 changes: 16 additions & 0 deletions test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,21 @@ def mock_get_elements(self, *args, **kwargs):
assert page.image is None


@pytest.mark.slow()
def test_from_file_with_password(monkeypatch, mock_final_layout):
    """Load the password-protected sample PDF through both public entry points.

    First the path-based ``DocumentLayout.from_file`` is called; nothing has been
    mocked at that point (presumably the reason for the ``slow`` marker -- confirm).
    Then ``process_data_with_model`` is called on an open binary file object, with
    ``layout.get_model`` and ``UnstructuredObjectDetectionModel`` replaced by
    ``MockLayoutModel``. Both calls receive the password and must return a truthy
    result.
    """
    pdf_path = "sample-docs/password.pdf"
    pdf_password = "password"

    # Path-based entry point, executed before any mocking is installed.
    layout_from_path = layout.DocumentLayout.from_file(pdf_path, password=pdf_password)
    assert layout_from_path

    # File-object entry point: swap out the model lookup and the detection model class.
    monkeypatch.setattr(layout, "get_model", lambda x: MockLayoutModel(mock_final_layout))
    detection_model_patch = patch(
        "unstructured_inference.inference.layout.UnstructuredObjectDetectionModel",
        MockLayoutModel,
    )
    with detection_model_patch:
        with open(pdf_path, mode="rb") as pdf_stream:
            layout_from_stream = layout.process_data_with_model(
                pdf_stream,
                model_name="fake",
                password=pdf_password,
            )
            assert layout_from_stream


def test_from_image_file_raises_with_empty_fn():
with pytest.raises(FileNotFoundError):
layout.DocumentLayout.from_image_file("")
Expand Down Expand Up @@ -544,6 +559,7 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m
detection_model=detection_model,
element_extraction_model=element_extraction_model,
fixed_layouts=None,
password=None,
pdf_image_dpi=200,
)

Expand Down
5 changes: 4 additions & 1 deletion test_unstructured_inference/models/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@

import unstructured_inference.models.table_postprocess as postprocess
from unstructured_inference.models import tables
from unstructured_inference.models.tables import apply_thresholds_on_objects, structure_to_cells
from unstructured_inference.models.tables import (
apply_thresholds_on_objects,
structure_to_cells,
)

skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"}

Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.8.6" # pragma: no cover
__version__ = "0.8.7" # pragma: no cover
11 changes: 11 additions & 0 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def from_file(
filename: str,
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
pdf_image_dpi: int = 200,
password: Optional[str] = None,
**kwargs,
) -> DocumentLayout:
"""Creates a DocumentLayout from a pdf file."""
Expand All @@ -62,6 +63,7 @@ def from_file(
pdf_image_dpi,
output_folder=temp_dir,
path_only=True,
password=password,
)
image_paths = cast(List[str], _image_paths)
number_of_pages = len(image_paths)
Expand Down Expand Up @@ -133,6 +135,7 @@ def __init__(
document_filename: Optional[Union[str, PurePath]] = None,
detection_model: Optional[UnstructuredObjectDetectionModel] = None,
element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
password: Optional[str] = None,
):
if detection_model is not None and element_extraction_model is not None:
raise ValueError("Only one of detection_model and extraction_model should be passed.")
Expand All @@ -148,6 +151,7 @@ def __init__(
self.element_extraction_model = element_extraction_model
self.elements: Collection[LayoutElement] = []
self.elements_array: LayoutElements | None = None
self.password = password
# NOTE(alan): Dropped LocationlessLayoutElement that was created for chipper - chipper has
# locations now and if we need to support LayoutElements without bounding boxes we can make
# the bbox property optional
Expand Down Expand Up @@ -325,6 +329,7 @@ def from_image(
def process_data_with_model(
data: BinaryIO,
model_name: Optional[str],
password: Optional[str] = None,
**kwargs: Any,
) -> DocumentLayout:
"""Process PDF as file-like object `data` into a `DocumentLayout`.
Expand All @@ -339,6 +344,7 @@ def process_data_with_model(
layout = process_file_with_model(
file_path,
model_name,
password=password,
**kwargs,
)

Expand All @@ -351,6 +357,7 @@ def process_file_with_model(
is_image: bool = False,
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
pdf_image_dpi: int = 200,
password: Optional[str] = None,
**kwargs: Any,
) -> DocumentLayout:
"""Processes pdf file with name filename into a DocumentLayout by using a model identified by
Expand Down Expand Up @@ -379,6 +386,7 @@ def process_file_with_model(
element_extraction_model=element_extraction_model,
fixed_layouts=fixed_layouts,
pdf_image_dpi=pdf_image_dpi,
password=password,
**kwargs,
)
)
Expand All @@ -390,6 +398,7 @@ def convert_pdf_to_image(
dpi: int = 200,
output_folder: Optional[Union[str, PurePath]] = None,
path_only: bool = False,
password: Optional[str] = None,
) -> Union[List[Image.Image], List[str]]:
"""Get the image renderings of the pdf pages using pdf2image"""

Expand All @@ -402,12 +411,14 @@ def convert_pdf_to_image(
dpi=dpi,
output_folder=output_folder,
paths_only=path_only,
userpw=password or "",
)
else:
images = pdf2image.convert_from_path(
filename,
dpi=dpi,
paths_only=path_only,
userpw=password or "",
)

return images
Loading