Skip to content

Commit ebd9147

Browse files
authored
fix: only save applicable page images (#226)
fix(document): save document images page by page

Signed-off-by: Clément Doumouro <[email protected]>
1 parent 7b25698 commit ebd9147

File tree

2 files changed

+53
-9
lines changed

2 files changed

+53
-9
lines changed

docling_core/types/doc/document.py

Lines changed: 11 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -4098,7 +4098,10 @@ def _with_embedded_pictures(self) -> "DoclingDocument":
40984098
return result
40994099

41004100
def _with_pictures_refs(
4101-
self, image_dir: Path, reference_path: Optional[Path] = None
4101+
self,
4102+
image_dir: Path,
4103+
page_no: Optional[int],
4104+
reference_path: Optional[Path] = None,
41024105
) -> "DoclingDocument":
41034106
"""Document with images as refs.
41044107
@@ -4111,7 +4114,7 @@ def _with_pictures_refs(
41114114
image_dir.mkdir(parents=True, exist_ok=True)
41124115

41134116
if image_dir.is_dir():
4114-
for item, level in result.iterate_items(with_groups=False):
4117+
for item, level in result.iterate_items(page_no=page_no, with_groups=False):
41154118
if isinstance(item, PictureItem):
41164119

41174120
if (
@@ -4211,7 +4214,7 @@ def save_as_json(
42114214
os.makedirs(artifacts_dir, exist_ok=True)
42124215

42134216
new_doc = self._make_copy_with_refmode(
4214-
artifacts_dir, image_mode, reference_path=reference_path
4217+
artifacts_dir, image_mode, page_no=None, reference_path=reference_path
42154218
)
42164219

42174220
out = new_doc.export_to_dict(
@@ -4254,7 +4257,7 @@ def save_as_yaml(
42544257
os.makedirs(artifacts_dir, exist_ok=True)
42554258

42564259
new_doc = self._make_copy_with_refmode(
4257-
artifacts_dir, image_mode, reference_path=reference_path
4260+
artifacts_dir, image_mode, page_no=None, reference_path=reference_path
42584261
)
42594262

42604263
out = new_doc.export_to_dict(
@@ -4327,7 +4330,7 @@ def save_as_markdown(
43274330
os.makedirs(artifacts_dir, exist_ok=True)
43284331

43294332
new_doc = self._make_copy_with_refmode(
4330-
artifacts_dir, image_mode, reference_path=reference_path
4333+
artifacts_dir, image_mode, page_no, reference_path=reference_path
43314334
)
43324335

43334336
md_out = new_doc.export_to_markdown(
@@ -4503,7 +4506,7 @@ def save_as_html(
45034506
os.makedirs(artifacts_dir, exist_ok=True)
45044507

45054508
new_doc = self._make_copy_with_refmode(
4506-
artifacts_dir, image_mode, reference_path=reference_path
4509+
artifacts_dir, image_mode, page_no, reference_path=reference_path
45074510
)
45084511

45094512
html_out = new_doc.export_to_html(
@@ -4542,14 +4545,15 @@ def _make_copy_with_refmode(
45424545
self,
45434546
artifacts_dir: Path,
45444547
image_mode: ImageRefMode,
4548+
page_no: Optional[int],
45454549
reference_path: Optional[Path] = None,
45464550
):
45474551
new_doc = None
45484552
if image_mode == ImageRefMode.PLACEHOLDER:
45494553
new_doc = self
45504554
elif image_mode == ImageRefMode.REFERENCED:
45514555
new_doc = self._with_pictures_refs(
4552-
image_dir=artifacts_dir, reference_path=reference_path
4556+
image_dir=artifacts_dir, page_no=page_no, reference_path=reference_path
45534557
)
45544558
elif image_mode == ImageRefMode.EMBEDDED:
45554559
new_doc = self._with_embedded_pictures()

test/test_docling_doc.py

Lines changed: 42 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1349,12 +1349,51 @@ def test_save_pictures():
13491349

13501350
doc: DoclingDocument = _construct_doc()
13511351

1352-
new_doc = doc._with_pictures_refs(image_dir=Path("./test/data/constructed_images/"))
1352+
new_doc = doc._with_pictures_refs(
1353+
image_dir=Path("./test/data/constructed_images/"), page_no=None
1354+
)
13531355

13541356
img_paths = new_doc._list_images_on_disk()
13551357
assert len(img_paths) == 1, "len(img_paths)!=1"
13561358

13571359

1360+
def test_save_pictures_with_page():
1361+
# Given
1362+
doc = DoclingDocument(name="Dummy")
1363+
1364+
doc.add_page(page_no=1, size=Size(width=2000, height=4000), image=None)
1365+
doc.add_page(
1366+
page_no=2,
1367+
size=Size(width=2000, height=4000),
1368+
)
1369+
image = PILImage.new(mode="RGB", size=(200, 400), color=(0, 0, 0))
1370+
doc.add_picture(
1371+
image=ImageRef.from_pil(image=image, dpi=72),
1372+
prov=ProvenanceItem(
1373+
page_no=2,
1374+
bbox=BoundingBox(
1375+
b=0, l=0, r=200, t=400, coord_origin=CoordOrigin.BOTTOMLEFT
1376+
),
1377+
charspan=(1, 2),
1378+
),
1379+
)
1380+
1381+
# When
1382+
with_ref = doc._with_pictures_refs(
1383+
image_dir=Path("./test/data/constructed_images/"), page_no=1
1384+
)
1385+
# Then
1386+
n_images = len(with_ref._list_images_on_disk())
1387+
assert n_images == 0
1388+
# When
1389+
with_ref = with_ref._with_pictures_refs(
1390+
image_dir=Path("./test/data/constructed_images/"), page_no=2
1391+
)
1392+
n_images = len(with_ref._list_images_on_disk())
1393+
# Then
1394+
assert n_images == 1
1395+
1396+
13581397
def _normalise_string_wrt_filepaths(instr: str, paths: List[Path]):
13591398

13601399
for p in paths:
@@ -1406,7 +1445,8 @@ def test_save_to_disk():
14061445
image_dir = Path("./test/data/doc/constructed_images/")
14071446

14081447
doc_with_references = doc._with_pictures_refs(
1409-
image_dir=image_dir # Path("./test/data/constructed_images/")
1448+
image_dir=image_dir, # Path("./test/data/constructed_images/")
1449+
page_no=None,
14101450
)
14111451

14121452
# paths will be different on different machines, so needs to be kept!

0 commit comments

Comments
 (0)