Skip to content

Commit b9f032c

Browse files
Feat/save embedded images in pdf (#208)
Addresses unstructured issue [#1332](Unstructured-IO/unstructured#1332). This PR will work together with unstructured PR [#1371](Unstructured-IO/unstructured#1371). This PR also addresses `"true" embedded images` issue #215. ### Summary - Add functionality to extract and save images from the page - add the `extract_images` method to the `PageLayout` class - pass parameters related to extracting images from the page - add Python script to evaluate image extraction with various PDF processing libraries - Add functionality to get only "true" embedded images when extracting elements from PDF pages - add functionality to extract image objects (`LTImage`) from a `PDF layout element` parsed by `pdfminer.high_level.extract_pages` - update logic to determine `ImageTextRegion` in `load_pdf()` - Update the `layout visualization` script to be able to show only image elements if needed The following documents can be used for testing and evaluation. - [Captur-1317-5_ENG-p23.pdf](https://utic-dev-tech-fixtures.s3.us-east-2.amazonaws.com/pastebin/Captur-1317-5_ENG-p23.pdf) - [23-BERKSHIRE.pdf](https://utic-dev-tech-fixtures.s3.us-east-2.amazonaws.com/pastebin/23-BERKSHIRE.pdf) - [main.PMC6312790-p1.pdf](https://github.com/Unstructured-IO/unstructured-inference/files/12675967/main.PMC6312790_1-1.pdf) ### Testing ``` from unstructured_inference.inference.layout import DocumentLayout f_path = "sample-docs/embedded-images.pdf" # default image output directory doc = DocumentLayout.from_file( filename=f_path, extract_images_in_pdf=True, ) # specific image output directory doc = DocumentLayout.from_file( filename=f_path, extract_images_in_pdf=True, image_output_dir_path=<directory_path>, ) ``` ### Evaluation ``` // Extracting Images $ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py Captur-1317-5_ENG-p23.pdf unstructured // Layout Visualization $ PYTHONPATH=. 
python examples/layout_analysis/visualization.py Captur-1317-5_ENG-p23.pdf image_only ``` **NOTE:** To reproduce the original results for comparison, you need to replace [the lines](https://github.com/Unstructured-IO/unstructured-inference/blob/feat/save-embedded-images-in-pdf/unstructured_inference/inference/layout.py#L650-L659) with the following code snippet ``` _text, element_class = ( (element.get_text(), EmbeddedTextRegion) if hasattr(element, "get_text") else (None, ImageTextRegion) ) ```
1 parent 5c2acc4 commit b9f032c

File tree

11 files changed

+265
-20
lines changed

11 files changed

+265
-20
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
## 0.5.31
2+
3+
* Add functionality to extract and save images from the page
4+
* Add functionality to get only "true" embedded images when extracting elements from PDF pages
5+
* Update the layout visualization script to be able to show only image elements if needed
6+
17
## 0.5.30
28

39
* add an evaluation metric for table comparison based on token similarity
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Extracting Images
2+
3+
This directory contains examples of how to extract the images embedded in PDFs and save them as separate image files.
4+
5+
## How to run
6+
7+
Run `pip install -r requirements.txt` to install the Python dependencies.
8+
9+
### Extracting Embedded Images
10+
- Python script (embedded-image-extraction.py)
11+
```
12+
$ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py <file_path> <library>
13+
```
14+
The library can be `unstructured`, `pymupdf`, or `pypdf2`. For example,
15+
```
16+
$ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py embedded-images.pdf unstructured
17+
// or
18+
$ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py embedded-images.pdf pymupdf
19+
// or
20+
$ PYTHONPATH=. python examples/image-extraction/embedded-image-extraction.py embedded-images.pdf pypdf2
21+
```
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import io
2+
import os.path
3+
import pathlib
4+
import sys
5+
6+
import fitz # PyMuPDF
7+
from PIL import Image
8+
from PyPDF2 import PdfReader
9+
10+
from unstructured_inference.inference.layout import DocumentLayout
11+
12+
# Absolute directory containing this script; the "output" base directory is created beneath it.
CUR_DIR = pathlib.Path(__file__).parent.resolve()
13+
14+
15+
def print_result(images, page_index):
    """Print a one-line summary of how many images were found on a page.

    Args:
        images: Collection of images found on the page; an empty (falsy) collection
            is reported as "no images found".
        page_index: Page number used in the printed message.
    """
    # Guard clause for the empty case keeps the success path unindented.
    if not images:
        print(f"[!] No images found on page {page_index}")
        return

    print(f"[+] Found a total of {len(images)} images in page {page_index}")
20+
21+
22+
def run_with_unstructured(f_path, output_dir_path):
    """Extract images from a PDF via ``DocumentLayout`` and report the per-page count.

    Args:
        f_path: Path to the PDF file to process.
        output_dir_path: Directory passed to ``DocumentLayout.from_file`` as
            ``image_output_dir_path``.
    """
    document = DocumentLayout.from_file(
        filename=f_path,
        extract_images_in_pdf=True,
        image_output_dir_path=output_dir_path,
    )

    page_number = 0
    for current_page in document.pages:
        page_number += 1
        # Only elements typed "Image" count towards the reported total.
        found_images = list(filter(lambda el: el.type == "Image", current_page.elements))
        print_result(found_images, page_number)
32+
33+
34+
def run_with_pymupdf(f_path, output_dir_path):
    """Extract every embedded image from a PDF with PyMuPDF and save each one to disk.

    Files are written as ``image_<page>_<index>.<ext>`` where ``<ext>`` is the
    extension reported by PyMuPDF for the extracted image.

    Args:
        f_path: Path to the PDF file to process.
        output_dir_path: Directory that receives the extracted image files.
    """
    # Use the document as a context manager so it is closed even if extraction or
    # saving fails part-way through.
    with fitz.open(f_path) as doc:
        for page_index, page in enumerate(doc, start=1):
            image_list = page.get_images(full=True)
            print_result(image_list, page_index)

            for image_index, img in enumerate(image_list, start=1):
                # Get the XREF of the image
                xref = img[0]
                # Extract the image bytes and the image extension
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                output_f_path = os.path.join(
                    output_dir_path,
                    f"image_{page_index}_{image_index}.{image_ext}",
                )
                # Load it to PIL and save; the context manager releases the image
                # resources once the file has been written.
                with Image.open(io.BytesIO(image_bytes)) as image:
                    image.save(output_f_path)
52+
53+
54+
def run_with_pypdf2(f_path, output_dir_path):
    """Extract the images of each PDF page with PyPDF2 and write their raw data to disk.

    Files are written as ``figure_<page>_<image name>``.

    Args:
        f_path: Path to the PDF file to process.
        output_dir_path: Directory that receives the extracted image files.
    """
    pdf = PdfReader(f_path)
    for page_number, pdf_page in enumerate(pdf.pages, start=1):
        page_images = pdf_page.images
        print_result(page_images, page_number)

        for embedded in page_images:
            file_name = f"figure_{page_number}_{embedded.name}"
            target_path = os.path.join(output_dir_path, file_name)
            with open(target_path, "wb") as out_file:
                out_file.write(embedded.data)
64+
65+
66+
def run(f_path, library, output_basedir_path=None):
    """Run image extraction on ``f_path`` with the chosen library.

    Output goes to ``<output_basedir_path>/<library>/<file basename without extension>``,
    which is created if it does not exist.

    Args:
        f_path: Path to the PDF file to process.
        library: One of ``"unstructured"``, ``"pymupdf"`` or ``"pypdf2"``.
        output_basedir_path: Base directory for extraction output. Defaults to the
            ``output`` directory next to this script, so the function no longer relies
            on a global that is only assigned when the module runs as a script.

    Raises:
        ValueError: If ``library`` is not one of the supported names.
    """
    # Validate before touching the filesystem so a bad name does not leave an empty
    # output directory behind.
    if library not in ("unstructured", "pymupdf", "pypdf2"):
        raise ValueError(f"Invalid pdf library: {library!r}")

    if output_basedir_path is None:
        output_basedir_path = os.path.join(CUR_DIR, "output")

    f_basename = os.path.splitext(os.path.basename(f_path))[0]
    output_dir_path = os.path.join(output_basedir_path, library, f_basename)
    os.makedirs(output_dir_path, exist_ok=True)

    if library == "unstructured":
        run_with_unstructured(f_path, output_dir_path)
    elif library == "pymupdf":
        run_with_pymupdf(f_path, output_dir_path)
    else:
        run_with_pypdf2(f_path, output_dir_path)
77+
78+
79+
if __name__ == '__main__':
    # Command line: <file_path> <library>; both positional arguments are required.
    cli_args = sys.argv[1:]
    if len(cli_args) < 2:
        print(
            "Please provide the path to the file name as the first argument and the image "
            "extraction library as the second argument.",
        )
        sys.exit(1)

    pdf_path, library_name = cli_args[0], cli_args[1]
    if library_name not in ("unstructured", "pymupdf", "pypdf2"):
        print("Invalid pdf library")
        sys.exit(1)

    # Base directory for all extraction output, created up front.
    output_basedir_path = os.path.join(CUR_DIR, "output")
    os.makedirs(output_basedir_path, exist_ok=True)

    run(f_path=pdf_path, library=library_name)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
unstructured-inference

examples/layout_analysis/README.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@ Run `pip install -r requirements.txt` to install the Python dependencies.
99
### Visualization
1010
- Python script (visualization.py)
1111
```
12-
PYTHONPATH=. python examples/layout_analysis/visualization.py <file_path>
12+
$ PYTHONPATH=. python examples/layout_analysis/visualization.py <file_path> <scope>
1313
```
14-
For example,
14+
The scope can be `image_only` to show only image elements or `all` to show all elements. For example,
1515
```
16-
PYTHONPATH=. python examples/layout_analysis/visualization.py sample-docs/loremipsum.pdf
16+
$ PYTHONPATH=. python examples/layout_analysis/visualization.py sample-docs/loremipsum.pdf all
17+
// or
18+
$ PYTHONPATH=. python examples/layout_analysis/visualization.py sample-docs/loremipsum.pdf image_only
1719
```
1820
- Jupyter Notebook (visualization.ipynb)
1921
- Run `jupyter-notebook` to start.

examples/layout_analysis/visualization.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,14 @@
22
import pathlib
33
import sys
44

5+
from unstructured_inference.inference.elements import ImageTextRegion
56
from unstructured_inference.inference.layout import process_file_with_model
67
from unstructured_inference.utils import write_image
78

89
CUR_DIR = pathlib.Path(__file__).parent.resolve()
910

1011

11-
def run(f_path):
12+
def run(f_path, scope):
1213
annotation_data_map = {
1314
"final": None,
1415
"extracted": {"layout": {"color": "green", "width": 2}},
@@ -27,21 +28,42 @@ def run(f_path):
2728
)
2829

2930
for idx, page in enumerate(doc.pages):
31+
if scope == "image_only":
32+
embedded_image_elements = [
33+
el for el in page.layout if isinstance(el, ImageTextRegion)
34+
]
35+
inferred_image_elements = [
36+
el for el in page.inferred_layout if el.type == "Figure"
37+
]
38+
final_image_elements = [el for el in page.elements if el.type == "Image"]
39+
40+
page.layout = embedded_image_elements
41+
page.inferred_layout = inferred_image_elements
42+
page.elements = final_image_elements
43+
3044
for action_type, action_value in annotation_data_map.items():
3145
img = page.annotate(annotation_data=action_value)
3246
output_f_path = os.path.join(output_dir_path, f"{f_basename}_{idx+1}_{action_type}.jpg")
3347
write_image(img, output_f_path)
3448

49+
print(f"page_num: {idx+1} - n_total_elements: {len(page.elements)} - n_extracted_elements: "
50+
f"{len(page.layout)} - n_inferred_elements: {len(page.inferred_layout)} - "
51+
f"n_ocr_elements: {len(page.ocr_layout)}")
52+
3553

3654
if __name__ == '__main__':
37-
if len(sys.argv) < 2:
55+
if len(sys.argv) < 3:
3856
print(
39-
"Please provide the path to the file name as the first argument and the strategy as the "
57+
"Please provide the path to the file name as the first argument and the scope as the "
4058
"second argument.",
4159
)
4260
sys.exit(1)
4361

62+
if sys.argv[2] not in ["all", "image_only"]:
63+
print("Invalid scope")
64+
sys.exit(1)
65+
4466
output_basedir_path = os.path.join(CUR_DIR, "output")
4567
os.makedirs(output_basedir_path, exist_ok=True)
4668

47-
run(f_path=sys.argv[1])
69+
run(f_path=sys.argv[1], scope=sys.argv[2])

test_unstructured_inference/inference/test_layout.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -356,18 +356,21 @@ def points(self):
356356
class MockPageLayout(layout.PageLayout):
357357
def __init__(
358358
self,
359+
number=1,
360+
image=None,
359361
layout=None,
360362
model=None,
361363
ocr_strategy="auto",
362364
ocr_languages="eng",
363365
extract_tables=False,
364366
):
365-
self.image = None
367+
self.image = image
366368
self.layout = layout
367369
self.model = model
368370
self.ocr_strategy = ocr_strategy
369371
self.ocr_languages = ocr_languages
370372
self.extract_tables = extract_tables
373+
self.number = number
371374

372375
def ocr(self, text_block: MockEmbeddedTextRegion):
373376
return text_block.ocr_text
@@ -878,6 +881,22 @@ def test_from_image(
878881
assert mock_detection.called == detection_model_called
879882

880883

884+
def test_extract_images(mock_pil_image):
    """extract_images saves one file per "Image" element and records its path on the element."""
    image_elements = [
        layoutelement.LayoutElement(1, 1, 10, 10, text=None, type="Image"),
        layoutelement.LayoutElement(11, 11, 20, 20, text=None, type="Image"),
    ]
    page = MockPageLayout(image=mock_pil_image)
    page.elements = image_elements

    with tempfile.TemporaryDirectory() as tmpdir:
        output_dir = str(tmpdir)
        page.extract_images(output_dir_path=output_dir)

        # Figures are numbered from 1 in element order.
        for figure_number, element in enumerate(page.elements, start=1):
            expected_image_path = os.path.join(
                output_dir,
                f"figure-{page.number}-{figure_number}.jpg",
            )
            assert os.path.isfile(element.image_path)
            assert element.image_path == expected_image_path
898+
899+
881900
class MockUnstructuredElementExtractionModel(UnstructuredElementExtractionModel):
882901
def initialize(self, *args, **kwargs):
883902
return super().initialize(*args, **kwargs)
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.5.30" # pragma: no cover
1+
__version__ = "0.5.31" # pragma: no cover

unstructured_inference/inference/layout.py

Lines changed: 50 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
merge_inferred_layout_with_ocr_layout,
2828
)
2929
from unstructured_inference.inference.ordering import order_layout
30+
from unstructured_inference.inference.pdf import get_images_from_pdf_element
3031
from unstructured_inference.logger import logger
3132
from unstructured_inference.models.base import get_model
3233
from unstructured_inference.models.detectron2onnx import (
@@ -37,6 +38,7 @@
3738
UnstructuredObjectDetectionModel,
3839
)
3940
from unstructured_inference.patches.pdfminer import parse_keyword
41+
from unstructured_inference.utils import write_image
4042
from unstructured_inference.visualize import draw_bbox
4143

4244
# NOTE(alan): Patching this to fix a bug in pdfminer.six. Submitted this PR into pdfminer.six to fix
@@ -356,6 +358,33 @@ def get_elements_from_layout(self, layout: List[TextRegion]) -> List[LayoutEleme
356358
]
357359
return elements
358360

361+
def extract_images(self, output_dir_path: Optional[str] = None):
    """Crop every "Image" element out of the page image and save it as a JPEG file.

    Each saved file is named ``figure-<page number>-<figure number>.jpg`` and its path
    is stored on the element as ``image_path``.

    Args:
        output_dir_path: Directory that receives the image files. When empty or
            ``None``, a ``figures`` directory under the current working directory is
            used. The directory is created if it does not exist.
    """
    target_dir = output_dir_path or os.path.join(os.getcwd(), "figures")
    os.makedirs(target_dir, exist_ok=True)

    figure_number = 0
    for el in self.elements:
        # Only elements that have a location and are typed "Image" can be cropped.
        is_image = not isinstance(el, LocationlessLayoutElement) and el.type in ("Image",)
        if not is_image:
            continue

        # The number is consumed even when saving fails below, so a failed figure
        # leaves a gap in the numbering rather than shifting later figures.
        figure_number += 1
        try:
            output_f_path = os.path.join(
                target_dir,
                f"figure-{self.number}-{figure_number}.jpg",
            )
            bounding_box = (el.x1, el.y1, el.x2, el.y2)
            write_image(self.image.crop(bounding_box), output_f_path)
            el.image_path = output_f_path
        except (ValueError, IOError):
            logger.warning("Image Extraction Error: Skipping the failed image", exc_info=True)
387+
359388
def _get_image_array(self) -> Union[np.ndarray, None]:
360389
"""Converts the raw image into a numpy array."""
361390
if self.image_array is None:
@@ -439,11 +468,12 @@ def from_image(
439468
ocr_mode: str = OCRMode.FULL_PAGE.value,
440469
extract_tables: bool = False,
441470
fixed_layout: Optional[List[TextRegion]] = None,
442-
**kwargs,
471+
supplement_with_ocr_elements: bool = True,
472+
extract_images_in_pdf: bool = False,
473+
image_output_dir_path: Optional[str] = None,
474+
analysis: bool = False,
443475
):
444476
"""Creates a PageLayout from an already-loaded PIL Image."""
445-
analysis = kwargs.get("analysis", False)
446-
supplement_with_ocr_elements = kwargs.get("supplement_with_ocr_elements", True)
447477

448478
page = cls(
449479
number=number,
@@ -474,6 +504,9 @@ def from_image(
474504
page.image_path = os.path.abspath(image_path) if image_path else None
475505
page.document_filename = os.path.abspath(document_filename) if document_filename else None
476506

507+
if extract_images_in_pdf:
508+
page.extract_images(image_output_dir_path)
509+
477510
# Clear the image to save memory
478511
page.image = None
479512

@@ -602,21 +635,29 @@ def load_pdf(
602635
) -> Tuple[List[List[TextRegion]], Union[List[Image.Image], List[str]]]:
603636
"""Loads the image and word objects from a pdf using pdfplumber and the image renderings of the
604637
pdf pages using pdf2image"""
638+
605639
layouts = []
606640
for page in extract_pages(filename):
607-
layout = []
641+
layout: List[TextRegion] = []
608642
height = page.height
609643
for element in page:
610644
x1, y2, x2, y1 = element.bbox
611645
y1 = height - y1
612646
y2 = height - y2
613647
# Coefficient to rescale bounding box to be compatible with images
614648
coef = dpi / 72
615-
_text, element_class = (
616-
(element.get_text(), EmbeddedTextRegion)
617-
if hasattr(element, "get_text")
618-
else (None, ImageTextRegion)
619-
)
649+
650+
if hasattr(element, "get_text"):
651+
_text = element.get_text()
652+
element_class = EmbeddedTextRegion # type: ignore
653+
else:
654+
embedded_images = get_images_from_pdf_element(element)
655+
if len(embedded_images) > 0:
656+
_text = None
657+
element_class = ImageTextRegion # type: ignore
658+
else:
659+
continue
660+
620661
text_region = element_class(x1 * coef, y1 * coef, x2 * coef, y2 * coef, text=_text)
621662

622663
if text_region.area() > 0:

unstructured_inference/inference/layoutelement.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
class LayoutElement(TextRegion):
2525
type: Optional[str] = None
2626
prob: Optional[float] = None
27+
image_path: Optional[str] = None
2728

2829
def extract_text(
2930
self,
@@ -98,7 +99,8 @@ def merge_inferred_layout_with_extracted_layout(
9899
w, h = page_image_size
99100
full_page_region = Rectangle(0, 0, w, h)
100101
for extracted_region in extracted_layout:
101-
if isinstance(extracted_region, ImageTextRegion):
102+
extracted_is_image = isinstance(extracted_region, ImageTextRegion)
103+
if extracted_is_image:
102104
# Skip extracted images for this purpose, we don't have the text from them and they
103105
# don't provide good text bounding boxes.
104106

@@ -122,7 +124,6 @@ def merge_inferred_layout_with_extracted_layout(
122124
extracted_region,
123125
subregion_threshold=subregion_threshold,
124126
)
125-
extracted_is_image = isinstance(extracted_region, ImageTextRegion)
126127
inferred_is_text = inferred_region.type not in (
127128
"Figure",
128129
"Image",

0 commit comments

Comments
 (0)