Skip to content

Commit 9b6aa8e

Browse files
authored
fix: use temporary instead of fixed directories for storing images of pdfs being processed (#184)
**Summary**

- Replaces the created, fixed directory used for storing image outputs with a temporary directory
- Deprecates the `create_image_output_dir` function
- Adds hot-loading of images for annotation, because images are no longer stored long-term in a directory
- Adds a `document_filename` keyword argument to `PageLayout` to enable hot-loading

**Tests**

- Removes the tests associated with `create_image_output_dir`
1 parent ea0831f commit 9b6aa8e

File tree

4 files changed

+97
-88
lines changed

4 files changed

+97
-88
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
## 0.5.16
2+
3+
* Fix to no longer create a directory for storing processed images
4+
* Hot-load images for annotation
5+
16
## 0.5.15
27

38
* Handle an uncaught TesseractError

test_unstructured_inference/inference/test_layout.py

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
import unstructured_inference.models.base as models
1212
from unstructured_inference.inference import elements, layout, layoutelement
13-
from unstructured_inference.inference.layout import create_image_output_dir
1413
from unstructured_inference.models import chipper, detectron2, tesseract
1514
from unstructured_inference.models.unstructuredmodel import (
1615
UnstructuredElementExtractionModel,
@@ -404,10 +403,6 @@ def mock_get_elements(self, *args, **kwargs):
404403
}
405404

406405
with patch.object(
407-
layout,
408-
"create_image_output_dir",
409-
return_value=tmpdir,
410-
), patch.object(
411406
layout,
412407
"load_pdf",
413408
lambda *args, **kwargs: ([[]], [image_path]),
@@ -416,7 +411,6 @@ def mock_get_elements(self, *args, **kwargs):
416411
page = doc.pages[0]
417412
assert page.elements[0] == mock_final_layout
418413
assert page.image_metadata == image_metadata
419-
assert page.image_path == image_path
420414
assert page.image is None
421415

422416

@@ -868,26 +862,6 @@ def test_exposed_pdf_image_dpi(pdf_image_dpi, expected, monkeypatch):
868862
assert mock_from_image.call_args[0][0].height == expected
869863

870864

871-
def test_create_image_output_dir():
872-
with tempfile.TemporaryDirectory() as tmpdir:
873-
tmp_f_path = os.path.join(tmpdir, "loremipsum.pdf")
874-
output_dir = create_image_output_dir(tmp_f_path)
875-
expected_output_dir = os.path.join(os.path.abspath(tmpdir), "loremipsum_images")
876-
assert os.path.isdir(output_dir)
877-
assert os.path.isabs(output_dir)
878-
assert output_dir == expected_output_dir
879-
880-
881-
def test_create_image_output_dir_no_ext():
882-
with tempfile.TemporaryDirectory() as tmpdir:
883-
tmp_f_path = os.path.join(tmpdir, "loremipsum_no_ext")
884-
output_dir = create_image_output_dir(tmp_f_path)
885-
expected_output_dir = os.path.join(os.path.abspath(tmpdir), "loremipsum_no_ext_images")
886-
assert os.path.isdir(output_dir)
887-
assert os.path.isabs(output_dir)
888-
assert output_dir == expected_output_dir
889-
890-
891865
def test_warning_if_chipper_and_low_dpi(caplog):
892866
with patch.object(layout.DocumentLayout, "from_file") as mock_from_file, patch.object(
893867
chipper.UnstructuredChipperModel,
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.15" # pragma: no cover
1+
__version__ = "0.5.16" # pragma: no cover

unstructured_inference/inference/layout.py

Lines changed: 91 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -87,44 +87,44 @@ def from_file(
8787
"""Creates a DocumentLayout from a pdf file."""
8888
logger.info(f"Reading PDF for file: {filename} ...")
8989

90-
# Store pdf images for later use
91-
output_dir = create_image_output_dir(filename)
92-
layouts, _image_paths = load_pdf(
93-
filename,
94-
pdf_image_dpi,
95-
output_folder=output_dir,
96-
path_only=True,
97-
)
98-
image_paths = cast(List[str], _image_paths)
99-
if len(layouts) > len(image_paths):
100-
raise RuntimeError(
101-
"Some images were not loaded. "
102-
"Check that poppler is installed and in your $PATH.",
90+
with tempfile.TemporaryDirectory() as temp_dir:
91+
layouts, _image_paths = load_pdf(
92+
filename,
93+
pdf_image_dpi,
94+
output_folder=temp_dir,
95+
path_only=True,
10396
)
104-
pages: List[PageLayout] = []
105-
if fixed_layouts is None:
106-
fixed_layouts = [None for _ in layouts]
107-
for i, (image_path, layout, fixed_layout) in enumerate(
108-
zip(image_paths, layouts, fixed_layouts),
109-
):
110-
# NOTE(robinson) - In the future, maybe we detect the page number and default
111-
# to the index if it is not detected
112-
with Image.open(image_path) as image:
113-
page = PageLayout.from_image(
114-
image,
115-
image_path=image_path,
116-
number=i + 1,
117-
detection_model=detection_model,
118-
element_extraction_model=element_extraction_model,
119-
layout=layout,
120-
ocr_strategy=ocr_strategy,
121-
ocr_languages=ocr_languages,
122-
ocr_mode=ocr_mode,
123-
fixed_layout=fixed_layout,
124-
extract_tables=extract_tables,
97+
image_paths = cast(List[str], _image_paths)
98+
if len(layouts) > len(image_paths):
99+
raise RuntimeError(
100+
"Some images were not loaded. "
101+
"Check that poppler is installed and in your $PATH.",
125102
)
126-
pages.append(page)
127-
return cls.from_pages(pages)
103+
104+
pages: List[PageLayout] = []
105+
if fixed_layouts is None:
106+
fixed_layouts = [None for _ in layouts]
107+
for i, (image_path, layout, fixed_layout) in enumerate(
108+
zip(image_paths, layouts, fixed_layouts),
109+
):
110+
# NOTE(robinson) - In the future, maybe we detect the page number and default
111+
# to the index if it is not detected
112+
with Image.open(image_path) as image:
113+
page = PageLayout.from_image(
114+
image,
115+
number=i + 1,
116+
document_filename=filename,
117+
detection_model=detection_model,
118+
element_extraction_model=element_extraction_model,
119+
layout=layout,
120+
ocr_strategy=ocr_strategy,
121+
ocr_languages=ocr_languages,
122+
ocr_mode=ocr_mode,
123+
fixed_layout=fixed_layout,
124+
extract_tables=extract_tables,
125+
)
126+
pages.append(page)
127+
return cls.from_pages(pages)
128128

129129
@classmethod
130130
def from_image_file(
@@ -180,7 +180,8 @@ def __init__(
180180
image: Image.Image,
181181
layout: Optional[List[TextRegion]],
182182
image_metadata: Optional[dict] = None,
183-
image_path: Optional[Union[str, PurePath]] = None,
183+
image_path: Optional[Union[str, PurePath]] = None, # TODO: Deprecate
184+
document_filename: Optional[Union[str, PurePath]] = None,
184185
detection_model: Optional[UnstructuredObjectDetectionModel] = None,
185186
element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
186187
ocr_strategy: str = "auto",
@@ -196,6 +197,7 @@ def __init__(
196197
self.image_metadata = image_metadata
197198
self.image_path = image_path
198199
self.image_array: Union[np.ndarray, None] = None
200+
self.document_filename = document_filename
199201
self.layout = layout
200202
self.number = number
201203
self.detection_model = detection_model
@@ -305,7 +307,11 @@ def _get_image_array(self) -> Union[np.ndarray, None]:
305307
self.image_array = np.array(image)
306308
return self.image_array
307309

308-
def annotate(self, colors: Optional[Union[List[str], str]] = None) -> Image.Image:
310+
def annotate(
311+
self,
312+
colors: Optional[Union[List[str], str]] = None,
313+
image_dpi: int = 200,
314+
) -> Image.Image:
309315
"""Annotates the elements on the page image."""
310316
if colors is None:
311317
colors = ["red" for _ in self.elements]
@@ -315,18 +321,46 @@ def annotate(self, colors: Optional[Union[List[str], str]] = None) -> Image.Imag
315321
if len(colors) < len(self.elements):
316322
n_copies = (len(self.elements) // len(colors)) + 1
317323
colors = colors * n_copies
318-
img = self.image.copy() if self.image else Image.open(self.image_path)
324+
325+
# Hotload image if it hasn't been loaded yet
326+
if self.image:
327+
img = self.image.copy()
328+
elif self.image_path:
329+
img = Image.open(self.image_path)
330+
else:
331+
img = self._get_image(self.document_filename, self.number, image_dpi)
319332

320333
for el, color in zip(self.elements, colors):
321334
if isinstance(el, Rectangle):
322335
img = draw_bbox(img, el, color=color)
336+
323337
return img
324338

339+
def _get_image(self, filename, page_number, pdf_image_dpi: int = 200) -> Image.Image:
340+
"""Hotloads a page image from a pdf file."""
341+
342+
with tempfile.TemporaryDirectory() as temp_dir:
343+
_image_paths = pdf2image.convert_from_path(
344+
filename,
345+
dpi=pdf_image_dpi,
346+
output_folder=temp_dir,
347+
paths_only=True,
348+
)
349+
image_paths = cast(List[str], _image_paths)
350+
if page_number > len(image_paths):
351+
raise ValueError(
352+
f"Page number {page_number} is greater than the number of pages in the PDF.",
353+
)
354+
355+
with Image.open(image_paths[page_number - 1]) as image:
356+
return image.copy()
357+
325358
@classmethod
326359
def from_image(
327360
cls,
328361
image: Image.Image,
329-
image_path: Optional[Union[str, PurePath]],
362+
image_path: Optional[Union[str, PurePath]] = None,
363+
document_filename: Optional[Union[str, PurePath]] = None,
330364
number: int = 1,
331365
detection_model: Optional[UnstructuredObjectDetectionModel] = None,
332366
element_extraction_model: Optional[UnstructuredElementExtractionModel] = None,
@@ -363,6 +397,9 @@ def from_image(
363397
"height": page.image.height if page.image else None,
364398
}
365399
page.image_path = os.path.abspath(image_path) if image_path else None
400+
page.document_filename = os.path.abspath(document_filename) if document_filename else None
401+
402+
# Clear the image to save memory
366403
page.image = None
367404

368405
return page
@@ -480,7 +517,7 @@ def get_element_from_block(
480517
def load_pdf(
481518
filename: str,
482519
dpi: int = 200,
483-
output_folder: Union[str, PurePath] = None, # type: ignore
520+
output_folder: Optional[Union[str, PurePath]] = None,
484521
path_only: bool = False,
485522
) -> Tuple[List[List[TextRegion]], Union[List[Image.Image], List[str]]]:
486523
"""Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
@@ -509,30 +546,23 @@ def load_pdf(
509546
if path_only and not output_folder:
510547
raise ValueError("output_folder must be specified if path_only is true")
511548

512-
images = pdf2image.convert_from_path(
513-
filename,
514-
dpi=dpi,
515-
output_folder=output_folder,
516-
paths_only=path_only,
517-
)
549+
if output_folder is not None:
550+
images = pdf2image.convert_from_path(
551+
filename,
552+
dpi=dpi,
553+
output_folder=output_folder,
554+
paths_only=path_only,
555+
)
556+
else:
557+
images = pdf2image.convert_from_path(
558+
filename,
559+
dpi=dpi,
560+
paths_only=path_only,
561+
)
518562

519563
return layouts, images
520564

521565

522-
def create_image_output_dir(
523-
filename: Union[str, PurePath],
524-
) -> Union[str, PurePath]:
525-
"""Creates a directory to store the converted images from the pdf pages and returns the
526-
directory path"""
527-
parent_dir = os.path.abspath(os.path.dirname(filename))
528-
f_name_without_extension = os.path.splitext(os.path.basename(filename))[0]
529-
530-
# Add a suffix to avoid conflicts in case original file doesn't have an extension
531-
output_dir = os.path.join(parent_dir, f"{f_name_without_extension}_images")
532-
os.makedirs(output_dir, exist_ok=True)
533-
return output_dir
534-
535-
536566
def parse_ocr_data(ocr_data: dict) -> List[TextRegion]:
537567
"""
538568
Parse the OCR result data to extract a list of TextRegion objects.

0 commit comments

Comments
 (0)