Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

## 0.8.3

* fix: add an optional `password` parameter so password-protected PDFs can be processed
* fix: removed `layoutelement.from_lp_textblock()` and related tests as it's not used
* fix: update requirements to drop `layoutparser` lib
* fix: update `README.md` to remove layoutparser model zoo support note
Expand Down
Binary file added sample-docs/password.pdf
Binary file not shown.
20 changes: 20 additions & 0 deletions test_unstructured_inference/inference/test_layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,25 @@ def mock_get_elements(self, *args, **kwargs):
assert page.image is None


@pytest.mark.slow()
def test_from_file_with_password(monkeypatch, mock_final_layout):
    """Check that a `password` argument is accepted on both PDF entry points.

    The sample PDF is first loaded by path through `DocumentLayout.from_file`, then
    as a binary stream through `process_data_with_model`, with the `get_model` lookup
    and the detection-model class replaced by `MockLayoutModel`.
    """
    # Path-based entry point; runs before any patching is installed.
    from_path = layout.DocumentLayout.from_file(
        "sample-docs/password.pdf",
        password="password",
    )
    assert from_path

    def _fake_get_model(x):
        # Ignore the requested model name and always hand back the mock.
        return MockLayoutModel(mock_final_layout)

    monkeypatch.setattr(layout, "get_model", _fake_get_model)

    detection_model_patch = patch(
        "unstructured_inference.inference.layout.UnstructuredObjectDetectionModel",
        MockLayoutModel,
    )
    # Stream-based entry point, exercised with the mocked model in place.
    with detection_model_patch:
        with open("sample-docs/password.pdf", mode="rb") as pdf_stream:
            from_stream = layout.process_data_with_model(
                pdf_stream,
                model_name="fake",
                password="password",
            )
            assert from_stream



def test_from_image_file_raises_with_empty_fn():
    """An empty filename passed to `from_image_file` must raise FileNotFoundError."""
    empty_filename = ""
    with pytest.raises(FileNotFoundError):
        layout.DocumentLayout.from_image_file(empty_filename)
Expand Down Expand Up @@ -544,6 +563,7 @@ def test_process_file_with_model_routing(monkeypatch, model_type, is_detection_m
detection_model=detection_model,
element_extraction_model=element_extraction_model,
fixed_layouts=None,
password=None,
pdf_image_dpi=200,
)

Expand Down
7 changes: 5 additions & 2 deletions test_unstructured_inference/models/test_tables.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from copy import deepcopy

import numpy as np
import pytest
Expand All @@ -7,11 +8,13 @@
from transformers.models.table_transformer.modeling_table_transformer import (
TableTransformerDecoder,
)
from copy import deepcopy

import unstructured_inference.models.table_postprocess as postprocess
from unstructured_inference.models import tables
from unstructured_inference.models.tables import apply_thresholds_on_objects, structure_to_cells
from unstructured_inference.models.tables import (
apply_thresholds_on_objects,
structure_to_cells,
)

# True when the CI environment variable is unset, empty, or one of the false-like
# values below (compared case-insensitively).
# NOTE(review): presumably used to skip tests when not running in CI - confirm at use sites.
_ci_setting = os.environ.get("CI", "").lower()
skip_outside_ci = _ci_setting in ("", "false", "f", "0")

Expand Down
11 changes: 11 additions & 0 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def from_file(
filename: str,
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
pdf_image_dpi: int = 200,
password: Optional[str] = None,
**kwargs,
) -> DocumentLayout:
"""Creates a DocumentLayout from a pdf file."""
Expand All @@ -62,6 +63,7 @@ def from_file(
pdf_image_dpi,
output_folder=temp_dir,
path_only=True,
password=password,
)
image_paths = cast(List[str], _image_paths)
number_of_pages = len(image_paths)
Expand Down Expand Up @@ -133,6 +135,7 @@ def __init__(
document_filename: Optional[Union[str, PurePath]] = None,
detection_model: Optional[UnstructuredObjectDetectionModel] = None,
element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
password: Optional[str] = None,
):
if detection_model is not None and element_extraction_model is not None:
raise ValueError("Only one of detection_model and extraction_model should be passed.")
Expand All @@ -148,6 +151,7 @@ def __init__(
self.element_extraction_model = element_extraction_model
self.elements: Collection[LayoutElement] = []
self.elements_array: LayoutElements | None = None
self.password = password
# NOTE(alan): Dropped LocationlessLayoutElement that was created for chipper - chipper has
# locations now and if we need to support LayoutElements without bounding boxes we can make
# the bbox property optional
Expand Down Expand Up @@ -325,6 +329,7 @@ def from_image(
def process_data_with_model(
data: BinaryIO,
model_name: Optional[str],
password: Optional[str] = None,
**kwargs: Any,
) -> DocumentLayout:
"""Process PDF as file-like object `data` into a `DocumentLayout`.
Expand All @@ -339,6 +344,7 @@ def process_data_with_model(
layout = process_file_with_model(
file_path,
model_name,
password=password,
**kwargs,
)

Expand All @@ -351,6 +357,7 @@ def process_file_with_model(
is_image: bool = False,
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
pdf_image_dpi: int = 200,
password: Optional[str] = None,
**kwargs: Any,
) -> DocumentLayout:
"""Processes pdf file with name filename into a DocumentLayout by using a model identified by
Expand Down Expand Up @@ -379,6 +386,7 @@ def process_file_with_model(
element_extraction_model=element_extraction_model,
fixed_layouts=fixed_layouts,
pdf_image_dpi=pdf_image_dpi,
password=password,
**kwargs,
)
)
Expand All @@ -390,6 +398,7 @@ def convert_pdf_to_image(
dpi: int = 200,
output_folder: Optional[Union[str, PurePath]] = None,
path_only: bool = False,
password: Optional[str] = None,
) -> Union[List[Image.Image], List[str]]:
"""Get the image renderings of the pdf pages using pdf2image"""

Expand All @@ -402,12 +411,14 @@ def convert_pdf_to_image(
dpi=dpi,
output_folder=output_folder,
paths_only=path_only,
userpw=password,
)
else:
images = pdf2image.convert_from_path(
filename,
dpi=dpi,
paths_only=path_only,
userpw=password,
)

return images