Skip to content

Commit ed8456b

Browse files
authored
feat: bump pypdfium2 minimum and use context manager (#460)
This PR bumps the minimum required version of `pypdfium2` to `5.0.0` and refactors the `convert_pdf_to_image` function to use the context manager support introduced in `5.0.0`. Opening the `pdfium.PdfDocument` in a `with` block makes the `pypdfium2` call safer, because the document is closed automatically even if rendering raises.
1 parent a00e748 commit ed8456b

File tree

4 files changed

+23
-22
lines changed

4 files changed

+23
-22
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 1.1.9
2+
3+
- Use context manager for `pdfium.PdfDocument`
4+
15
## 1.1.8
26

37
- put `pdfium` call behind a thread lock

requirements/base.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,5 @@ accelerate
1414
rapidfuzz
1515
pandas
1616
scipy
17-
pypdfium2
17+
pypdfium2>=5.0.0
1818
pdfminer-six
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.1.8" # pragma: no cover
1+
__version__ = "1.1.9" # pragma: no cover

unstructured_inference/inference/layout.py

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -423,8 +423,7 @@ def convert_pdf_to_image(
423423
if filename is None and file is None:
424424
raise ValueError("Either filename or file must be provided")
425425
with _pdfium_lock:
426-
pdf = pdfium.PdfDocument(filename or file, password=password)
427-
try:
426+
with pdfium.PdfDocument(filename or file, password=password) as pdf:
428427
images: dict[int, Image.Image] = {}
429428
if dpi is None:
430429
dpi = inference_config.PDF_RENDER_DPI
@@ -448,21 +447,19 @@ def convert_pdf_to_image(
448447
bitmap.close()
449448
finally:
450449
page.close()
451-
if not output_folder:
452-
return list(images.values())
453-
else:
454-
# Save images to output_folder
455-
filenames: list[str] = []
456-
assert Path(output_folder).exists()
457-
assert Path(output_folder).is_dir()
458-
for i, image in images.items():
459-
fn: str = os.path.join(str(output_folder), f"page_{i}.png")
460-
image.save(fn, format="PNG", compress_level=1, optimize=False)
461-
filenames.append(fn)
462-
if path_only:
463-
return filenames
464-
images_values: list[Image.Image] = list(images.values())
465-
return images_values
466-
467-
finally:
468-
pdf.close()
450+
451+
if not output_folder:
452+
return list(images.values())
453+
else:
454+
# Save images to output_folder
455+
filenames: list[str] = []
456+
assert Path(output_folder).exists()
457+
assert Path(output_folder).is_dir()
458+
for i, image in images.items():
459+
fn: str = os.path.join(str(output_folder), f"page_{i}.png")
460+
image.save(fn, format="PNG", compress_level=1, optimize=False)
461+
filenames.append(fn)
462+
if path_only:
463+
return filenames
464+
images_values: list[Image.Image] = list(images.values())
465+
return images_values

0 commit comments

Comments
 (0)