Skip to content

Commit 4a66b6d

Browse files
fix: pdf2image too many open files (#152)
This PR fixes the "too many open files" error that occurs when loading a PDF, and adds functionality to set `page.image` to `None` to reduce memory usage.
* Update `load_pdf` to return either Image objects or image paths.
* Update `DocumentLayout.from_file` to open only one image.
* Add an `image_metadata` attribute to the `PageLayout` class and set `page.image` to `None` to reduce memory usage.
* Add functionality to store PDF images for later use.
* Update the `annotate` and `_get_image_array` methods of `PageLayout` to get the image from the `image_path` property if the `image` property is `None`.
1 parent 24d8673 commit 4a66b6d

File tree

4 files changed

+230
-89
lines changed

4 files changed

+230
-89
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
## 0.5.6-dev2
22

3+
* Update the `annotate` and `_get_image_array` methods of `PageLayout` to get the image from the `image_path` property if the `image` property is `None`.
4+
* Add functionality to store pdf images for later use.
5+
* Add `image_metadata` property to `PageLayout` and set `page.image` to `None` to reduce memory usage.
6+
* Update `DocumentLayout.from_file` to open only one image.
7+
* Update `load_pdf` to return either Image objects or Image paths.
38
* Warns users that Chipper is a beta model.
49
* Exposed control over dpi when converting PDF to an image.
510
* Updated detectron2 version to avoid errors related to deprecated PIL reference.
3.43 MB
Binary file not shown.

test_unstructured_inference/inference/test_layout.py

Lines changed: 145 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os.path
12
import tempfile
23
from functools import partial
34
from itertools import product
@@ -9,6 +10,7 @@
910

1011
import unstructured_inference.models.base as models
1112
from unstructured_inference.inference import elements, layout, layoutelement
13+
from unstructured_inference.inference.layout import create_image_output_dir
1214
from unstructured_inference.models import detectron2, tesseract
1315
from unstructured_inference.models.unstructuredmodel import (
1416
UnstructuredElementExtractionModel,
@@ -47,12 +49,22 @@ def mock_final_layout():
4749

4850

4951
def test_pdf_page_converts_images_to_array(mock_image):
52+
def verify_image_array():
53+
assert page.image_array is None
54+
image_array = page._get_image_array()
55+
assert isinstance(image_array, np.ndarray)
56+
assert page.image_array.all() == image_array.all()
57+
58+
# Scenario 1: where self.image exists
5059
page = layout.PageLayout(number=0, image=mock_image, layout=[])
51-
assert page.image_array is None
60+
verify_image_array()
5261

53-
image_array = page._get_image_array()
54-
assert isinstance(image_array, np.ndarray)
55-
assert page.image_array.all() == image_array.all()
62+
# Scenario 2: where self.image is None, but self.image_path exists
63+
page.image_array = None
64+
page.image = None
65+
page.image_path = "mock_path_to_image"
66+
with patch.object(Image, "open", return_value=mock_image):
67+
verify_image_array()
5668

5769

5870
def test_ocr(monkeypatch):
@@ -141,31 +153,35 @@ def test_get_page_elements_with_ocr(monkeypatch):
141153
assert str(page) == "\n\nAn Even Catchier Title"
142154

143155

144-
def test_read_pdf(monkeypatch, mock_initial_layout, mock_final_layout):
145-
image = np.random.randint(12, 24, (40, 40))
146-
images = [image, image]
156+
def test_read_pdf(monkeypatch, mock_initial_layout, mock_final_layout, mock_image):
157+
with tempfile.TemporaryDirectory() as tmpdir:
158+
image_path1 = os.path.join(tmpdir, "mock1.jpg")
159+
image_path2 = os.path.join(tmpdir, "mock2.jpg")
160+
mock_image.save(image_path1)
161+
mock_image.save(image_path2)
162+
image_paths = [image_path1, image_path2]
147163

148-
layouts = [mock_initial_layout, mock_initial_layout]
164+
layouts = [mock_initial_layout, mock_initial_layout]
149165

150-
monkeypatch.setattr(
151-
models,
152-
"UnstructuredDetectronModel",
153-
partial(MockLayoutModel, layout=mock_final_layout),
154-
)
155-
monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
166+
monkeypatch.setattr(
167+
models,
168+
"UnstructuredDetectronModel",
169+
partial(MockLayoutModel, layout=mock_final_layout),
170+
)
171+
monkeypatch.setattr(detectron2, "is_detectron2_available", lambda *args: True)
156172

157-
with patch.object(layout, "load_pdf", return_value=(layouts, images)):
158-
model = layout.get_model("detectron2_lp")
159-
doc = layout.DocumentLayout.from_file("fake-file.pdf", detection_model=model)
173+
with patch.object(layout, "load_pdf", return_value=(layouts, image_paths)):
174+
model = layout.get_model("detectron2_lp")
175+
doc = layout.DocumentLayout.from_file("fake-file.pdf", detection_model=model)
160176

161-
assert str(doc).startswith("A Catchy Title")
162-
assert str(doc).count("A Catchy Title") == 2 # Once for each page
163-
assert str(doc).endswith("A very repetitive narrative. ")
177+
assert str(doc).startswith("A Catchy Title")
178+
assert str(doc).count("A Catchy Title") == 2 # Once for each page
179+
assert str(doc).endswith("A very repetitive narrative. ")
164180

165-
assert doc.pages[0].elements[0].to_dict()["text"] == "A Catchy Title"
181+
assert doc.pages[0].elements[0].to_dict()["text"] == "A Catchy Title"
166182

167-
pages = doc.pages
168-
assert str(doc) == "\n\n".join([str(page) for page in pages])
183+
pages = doc.pages
184+
assert str(doc) == "\n\n".join([str(page) for page in pages])
169185

170186

171187
@pytest.mark.parametrize("model_name", [None, "checkbox", "fake"])
@@ -320,12 +336,53 @@ def mock_get_elements(self, *args, **kwargs):
320336
self.elements = [mock_final_layout]
321337

322338
monkeypatch.setattr(layout.PageLayout, "get_elements_with_detection_model", mock_get_elements)
323-
elements = (
324-
layout.DocumentLayout.from_image_file(f"sample-docs/loremipsum.{filetype}")
325-
.pages[0]
326-
.elements
327-
)
328-
assert elements[0] == mock_final_layout
339+
filename = f"sample-docs/loremipsum.{filetype}"
340+
image = Image.open(filename)
341+
image_metadata = {
342+
"format": image.format,
343+
"width": image.width,
344+
"height": image.height,
345+
}
346+
347+
doc = layout.DocumentLayout.from_image_file(filename)
348+
page = doc.pages[0]
349+
assert page.elements[0] == mock_final_layout
350+
assert page.image is None
351+
assert page.image_path == os.path.abspath(filename)
352+
assert page.image_metadata == image_metadata
353+
354+
355+
def test_from_file(monkeypatch, mock_final_layout):
356+
def mock_get_elements(self, *args, **kwargs):
357+
self.elements = [mock_final_layout]
358+
359+
monkeypatch.setattr(layout.PageLayout, "get_elements_with_detection_model", mock_get_elements)
360+
361+
with tempfile.TemporaryDirectory() as tmpdir:
362+
image_path = os.path.join(tmpdir, "loremipsum.ppm")
363+
image = Image.open("sample-docs/loremipsum.jpg")
364+
image.save(image_path)
365+
image_metadata = {
366+
"format": "PPM",
367+
"width": image.width,
368+
"height": image.height,
369+
}
370+
371+
with patch.object(
372+
layout,
373+
"create_image_output_dir",
374+
return_value=tmpdir,
375+
), patch.object(
376+
layout,
377+
"load_pdf",
378+
lambda *args, **kwargs: ([[]], [image_path]),
379+
):
380+
doc = layout.DocumentLayout.from_file("fake-file.pdf")
381+
page = doc.pages[0]
382+
assert page.elements[0] == mock_final_layout
383+
assert page.image_metadata == image_metadata
384+
assert page.image_path == image_path
385+
assert page.image is None
329386

330387

331388
def test_from_image_file_raises_with_empty_fn():
@@ -526,6 +583,14 @@ def test_load_pdf_image_placement():
526583
assert image_region.y2 < images[5].height / 2
527584

528585

586+
def test_load_pdf_raises_with_path_only_no_output_folder():
587+
with pytest.raises(ValueError):
588+
layout.load_pdf(
589+
"sample-docs/loremipsum-flat.pdf",
590+
path_only=True,
591+
)
592+
593+
529594
@pytest.mark.skip("Temporarily removed multicolumn to fix ordering")
530595
def test_load_pdf_with_multicolumn_layout_and_ocr(filename="sample-docs/design-thinking.pdf"):
531596
layouts, images = layout.load_pdf(filename)
@@ -544,6 +609,21 @@ def test_load_pdf_with_multicolumn_layout_and_ocr(filename="sample-docs/design-t
544609

545610
@pytest.mark.parametrize("colors", ["red", None])
546611
def test_annotate(colors):
612+
def check_annotated_image():
613+
annotated_array = np.array(annotated_image)
614+
for coords in [coords1, coords2]:
615+
x1, y1, x2, y2 = coords
616+
# Make sure the pixels on the edge of the box are red
617+
for i, expected in zip(range(3), [255, 0, 0]):
618+
assert all(annotated_array[y1, x1:x2, i] == expected)
619+
assert all(annotated_array[y2, x1:x2, i] == expected)
620+
assert all(annotated_array[y1:y2, x1, i] == expected)
621+
assert all(annotated_array[y1:y2, x2, i] == expected)
622+
# Make sure almost all the pixels are not changed
623+
assert ((annotated_array[:, :, 0] == 1).mean()) > 0.992
624+
assert ((annotated_array[:, :, 1] == 1).mean()) > 0.992
625+
assert ((annotated_array[:, :, 2] == 1).mean()) > 0.992
626+
547627
test_image_arr = np.ones((100, 100, 3), dtype="uint8")
548628
image = Image.fromarray(test_image_arr)
549629
page = layout.PageLayout(number=1, image=image, layout=None)
@@ -552,19 +632,17 @@ def test_annotate(colors):
552632
coords2 = (1, 10, 7, 11)
553633
rect2 = elements.Rectangle(*coords2)
554634
page.elements = [rect1, rect2]
635+
636+
# Scenario 1: where self.image exists
555637
annotated_image = page.annotate(colors=colors)
556-
annotated_array = np.array(annotated_image)
557-
for x1, y1, x2, y2 in [coords1, coords2]:
558-
# Make sure the pixels on the edge of the box are red
559-
for i, expected in zip(range(3), [255, 0, 0]):
560-
assert all(annotated_array[y1, x1:x2, i] == expected)
561-
assert all(annotated_array[y2, x1:x2, i] == expected)
562-
assert all(annotated_array[y1:y2, x1, i] == expected)
563-
assert all(annotated_array[y1:y2, x2, i] == expected)
564-
# Make sure almost all the pixels are not changed
565-
assert ((annotated_array[:, :, 0] == 1).mean()) > 0.992
566-
assert ((annotated_array[:, :, 1] == 1).mean()) > 0.992
567-
assert ((annotated_array[:, :, 2] == 1).mean()) > 0.992
638+
check_annotated_image()
639+
640+
# Scenario 2: where self.image is None, but self.image_path exists
641+
with patch.object(Image, "open", return_value=image):
642+
page.image = None
643+
page.image_path = "mock_path_to_image"
644+
annotated_image = page.annotate(colors=colors)
645+
check_annotated_image()
568646

569647

570648
def test_textregion_returns_empty_ocr_never(mock_image):
@@ -609,18 +687,21 @@ def ordering_layout():
609687
return elements
610688

611689

612-
def test_layout_order(ordering_layout):
613-
with patch.object(layout, "get_model", lambda: lambda x: ordering_layout), patch.object(
614-
layout,
615-
"load_pdf",
616-
lambda *args, **kwargs: ([[]], [mock_image]),
617-
), patch.object(
618-
layout,
619-
"UnstructuredObjectDetectionModel",
620-
object,
621-
):
622-
doc = layout.DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf")
623-
page = doc.pages[0]
690+
def test_layout_order(mock_image, ordering_layout):
691+
with tempfile.TemporaryDirectory() as tmpdir:
692+
mock_image_path = os.path.join(tmpdir, "mock.jpg")
693+
mock_image.save(mock_image_path)
694+
with patch.object(layout, "get_model", lambda: lambda x: ordering_layout), patch.object(
695+
layout,
696+
"load_pdf",
697+
lambda *args, **kwargs: ([[]], [mock_image_path]),
698+
), patch.object(
699+
layout,
700+
"UnstructuredObjectDetectionModel",
701+
object,
702+
):
703+
doc = layout.DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf")
704+
page = doc.pages[0]
624705
for n, element in enumerate(page.elements):
625706
assert element.text == str(n)
626707

@@ -690,6 +771,7 @@ def test_from_image(
690771
) as mock_detection:
691772
layout.PageLayout.from_image(
692773
mock_image,
774+
image_path=None,
693775
detection_model=detection_model,
694776
element_extraction_model=element_extraction_model,
695777
)
@@ -748,3 +830,13 @@ def test_exposed_pdf_image_dpi(pdf_image_dpi, expected, monkeypatch):
748830
with patch.object(layout.PageLayout, "from_image") as mock_from_image:
749831
layout.DocumentLayout.from_file("sample-docs/loremipsum.pdf", pdf_image_dpi=pdf_image_dpi)
750832
assert mock_from_image.call_args[0][0].height == expected
833+
834+
835+
def test_create_image_output_dir():
836+
with tempfile.TemporaryDirectory() as tmpdir:
837+
tmp_f_path = os.path.join(tmpdir, "loremipsum.pdf")
838+
output_dir = create_image_output_dir(tmp_f_path)
839+
expected_output_dir = os.path.join(os.path.abspath(tmpdir), "loremipsum")
840+
assert os.path.isdir(output_dir)
841+
assert os.path.isabs(output_dir)
842+
assert output_dir == expected_output_dir

0 commit comments

Comments
 (0)