Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 1.1.8

- Put `pdfium` calls behind a thread lock

## 1.1.7

- Update OpenCV-Python to 4.13.0.90 to squash ffmpeg vulnerability CVE-2023-6605
Expand Down
2 changes: 1 addition & 1 deletion unstructured_inference/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.1.7" # pragma: no cover
__version__ = "1.1.8" # pragma: no cover
86 changes: 45 additions & 41 deletions unstructured_inference/inference/layout.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import tempfile
from functools import cached_property
from pathlib import Path, PurePath
from threading import Lock
from typing import Any, BinaryIO, Collection, List, Optional, Union, cast

import numpy as np
Expand All @@ -23,6 +24,8 @@
)
from unstructured_inference.visualize import draw_bbox

# Module-level lock shared by every caller in this process;
# convert_pdf_to_image acquires it before opening a PDF with pdfium.
# NOTE(review): presumably needed because pdfium is not safe to call from
# several threads at once — confirm against the pypdfium2 documentation.
_pdfium_lock = Lock()


class DocumentLayout:
"""Class for handling documents that are saved as .pdf files. For .pdf files, a
Expand Down Expand Up @@ -419,46 +422,47 @@ def convert_pdf_to_image(
raise ValueError("output_folder must be specified if path_only is true")
if filename is None and file is None:
raise ValueError("Either filename or file must be provided")
pdf = pdfium.PdfDocument(filename or file, password=password)
try:
images: dict[int, Image.Image] = {}
if dpi is None:
dpi = inference_config.PDF_RENDER_DPI
scale = dpi / 72.0
for i, page in enumerate(pdf, start=1):
try:
if first_page is not None and i < first_page:
continue
if last_page is not None and i > last_page:
break
bitmap = page.render(
scale=scale,
no_smoothtext=False,
no_smoothimage=False,
no_smoothpath=False,
optimize_mode="print",
)
with _pdfium_lock:
pdf = pdfium.PdfDocument(filename or file, password=password)
try:
images: dict[int, Image.Image] = {}
if dpi is None:
dpi = inference_config.PDF_RENDER_DPI
scale = dpi / 72.0
for i, page in enumerate(pdf, start=1):
try:
images[i] = bitmap.to_pil()
if first_page is not None and i < first_page:
continue
if last_page is not None and i > last_page:
break
bitmap = page.render(
scale=scale,
no_smoothtext=False,
no_smoothimage=False,
no_smoothpath=False,
optimize_mode="print",
)
try:
images[i] = bitmap.to_pil()
finally:
bitmap.close()
finally:
bitmap.close()
finally:
page.close()
if not output_folder:
return list(images.values())
else:
# Save images to output_folder
filenames: list[str] = []
assert Path(output_folder).exists()
assert Path(output_folder).is_dir()
for i, image in images.items():
fn: str = os.path.join(str(output_folder), f"page_{i}.png")
image.save(fn, format="PNG", compress_level=1, optimize=False)
filenames.append(fn)
if path_only:
return filenames
images_values: list[Image.Image] = list(images.values())
return images_values

finally:
pdf.close()
page.close()
if not output_folder:
return list(images.values())
else:
# Save images to output_folder
filenames: list[str] = []
assert Path(output_folder).exists()
assert Path(output_folder).is_dir()
for i, image in images.items():
fn: str = os.path.join(str(output_folder), f"page_{i}.png")
image.save(fn, format="PNG", compress_level=1, optimize=False)
filenames.append(fn)
if path_only:
return filenames
images_values: list[Image.Image] = list(images.values())
return images_values

finally:
pdf.close()