Skip to content

Commit 1e841eb

Browse files
authored
fix: release image backend resources after frame extraction (#3134)
Signed-off-by: LarytheLord <llawlietbagsum@gmail.com>
1 parent ce49923 commit 1e841eb

File tree

2 files changed

+93
-19
lines changed

2 files changed

+93
-19
lines changed

docling/backend/image_backend.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
22
from io import BytesIO
33
from pathlib import Path
4-
from typing import Iterable, List, Optional, Union
4+
from typing import Iterable, List, Union
55

66
from docling_core.types.doc import BoundingBox, CoordOrigin
77
from docling_core.types.doc.page import (
@@ -24,7 +24,7 @@
2424

2525
class _ImagePageBackend(PdfPageBackend):
2626
def __init__(self, image: Image.Image):
27-
self._image: Optional[Image.Image] = image
27+
self._image: Image.Image | None = image
2828
self.valid: bool = self._image is not None
2929

3030
def is_valid(self) -> bool:
@@ -85,7 +85,7 @@ def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
8585
yield full_page_bbox
8686

8787
def get_page_image(
88-
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
88+
self, scale: float = 1, cropbox: BoundingBox | None = None
8989
) -> Image.Image:
9090
assert self._image is not None
9191
img = self._image
@@ -147,20 +147,22 @@ def __init__(
147147
# Load frames eagerly for thread-safety across pages
148148
self._frames: List[Image.Image] = []
149149
try:
150-
img = Image.open(self.path_or_stream) # type: ignore[arg-type]
151-
152-
# Handle multi-frame and single-frame images
153-
# - multiframe formats: TIFF, GIF, ICO
154-
# - singleframe formats: JPEG (.jpg, .jpeg), PNG (.png), BMP, WEBP (unless animated), HEIC
155-
frame_count = getattr(img, "n_frames", 1)
156-
157-
if frame_count > 1:
158-
for i in range(frame_count):
159-
img.seek(i)
160-
self._frames.append(img.copy().convert("RGB"))
161-
else:
162-
self._frames.append(img.convert("RGB"))
150+
with Image.open(self.path_or_stream) as img: # type: ignore[arg-type]
151+
# Handle multi-frame and single-frame images
152+
# - multiframe formats: TIFF, GIF, ICO
153+
# - singleframe formats: JPEG (.jpg, .jpeg), PNG (.png), BMP, WEBP (unless animated), HEIC
154+
frame_count = getattr(img, "n_frames", 1)
155+
156+
if frame_count > 1:
157+
for i in range(frame_count):
158+
img.seek(i)
159+
self._frames.append(img.copy().convert("RGB"))
160+
else:
161+
self._frames.append(img.convert("RGB"))
163162
except Exception as e:
163+
for frame in self._frames:
164+
frame.close()
165+
self._frames = []
164166
raise RuntimeError(f"Could not load image for document {self.file}") from e
165167

166168
def is_valid(self) -> bool:
@@ -184,5 +186,7 @@ def supports_pagination(cls) -> bool:
184186
return True
185187

186188
def unload(self):
    """Close every cached frame, empty the frame cache, then defer to the parent.

    The frames are closed before ``super().unload()`` is invoked, which is the
    ordering the commit introduces.
    """
    for cached_frame in self._frames:
        cached_frame.close()
    self._frames = []
    super().unload()

tests/test_backend_image_native.py

Lines changed: 72 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,17 @@
11
from io import BytesIO
2-
from pathlib import Path
2+
from unittest.mock import MagicMock
33

44
import pytest
55
from docling_core.types.doc import BoundingBox, CoordOrigin
66
from PIL import Image
77

88
from docling.backend.image_backend import ImageDocumentBackend, _ImagePageBackend
99
from docling.datamodel.base_models import DocumentStream, InputFormat
10-
from docling.datamodel.document import InputDocument, _DocumentConversionInput
10+
from docling.datamodel.document import (
11+
InputDocument,
12+
_DocumentConversionInput,
13+
_DummyBackend,
14+
)
1115
from docling.document_converter import DocumentConverter, ImageFormatOption
1216
from docling.document_extractor import DocumentExtractor
1317

@@ -216,3 +220,69 @@ def test_multipage_access():
216220
size = page_backend.get_size()
217221
assert size.width == 64
218222
assert size.height == 64
223+
224+
225+
def test_source_image_is_closed_after_backend_init(tmp_path, monkeypatch):
    """Check that the backend closes the source image once it is constructed.

    ``Image.open`` (reached through ``docling.backend.image_backend``) is
    swapped for a wrapper that records whether ``close()`` was called on the
    object it handed out. After ``ImageDocumentBackend`` has been constructed,
    exactly one image must have been opened and it must already be closed.

    Args:
        tmp_path: pytest fixture providing a per-test temporary directory.
        monkeypatch: pytest fixture used to replace ``Image.open`` for the
            duration of the test.
    """
    image_path = tmp_path / "test.png"
    Image.new("RGB", (32, 32), (10, 20, 30)).save(image_path)

    opened_images = []
    original_open = Image.open

    class TrackingImage:
        """Proxy around an opened image that remembers whether it was closed."""

        def __init__(self, image):
            self._image = image
            self.closed = False

        def __getattr__(self, attr):
            # Only consulted for names missing on the proxy itself, so all
            # other attribute access is forwarded to the wrapped image.
            return getattr(self._image, attr)

        def close(self):
            self.closed = True
            return self._image.close()

        # ``with`` resolves these on the type, so they must be real methods
        # here rather than being forwarded through ``__getattr__``.
        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            self.close()
            return False

    def tracking_open(*args, **kwargs):
        tracked_image = TrackingImage(original_open(*args, **kwargs))
        opened_images.append(tracked_image)
        return tracked_image

    # Built before the patch is installed, so only opens triggered by the
    # ImageDocumentBackend constructor below are recorded.
    input_doc = InputDocument(
        path_or_stream=image_path,
        format=InputFormat.IMAGE,
        backend=_DummyBackend,
        filename=image_path.name,
    )

    monkeypatch.setattr("docling.backend.image_backend.Image.open", tracking_open)
    backend = ImageDocumentBackend(
        in_doc=input_doc,
        path_or_stream=image_path,
    )

    try:
        assert len(opened_images) == 1
        assert opened_images[0].closed is True
    finally:
        # Release the backend's cached frames even if an assertion above fails.
        backend.unload()
273+
274+
def test_unload_closes_cached_frames():
    """Check that ``unload()`` closes each cached frame and empties the cache."""
    stream = _make_multipage_tiff_stream(num_pages=3, size=(32, 32))
    doc_backend = _get_backend_from_stream(stream)

    tracked_closers = []
    for frame in doc_backend._frames:
        # Shadow the frame's ``close`` with a spy that still calls the real one.
        closer = MagicMock(wraps=frame.close)
        frame.close = closer
        tracked_closers.append(closer)

    # With no cached frames the per-frame check at the end would verify
    # nothing, so fail loudly instead of passing vacuously.
    assert tracked_closers

    doc_backend.unload()

    assert doc_backend._frames == []
    for closer in tracked_closers:
        closer.assert_called_once()

0 commit comments

Comments
 (0)