Skip to content

Commit e0943d2

Browse files
kmyusk and cau-git authored
feat: allow images in doctags deserializer to be optional and support multipage (#225)
* feat: allow images in doctags deserializer to be optional and support multipage

  Signed-off-by: Yusik Kim <[email protected]>

* chore: remove commented out lines

  Signed-off-by: Yusik Kim <[email protected]>

* Avoid typecasting

  Signed-off-by: Christoph Auer <[email protected]>

---------

Signed-off-by: Yusik Kim <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
1 parent 2f380ab commit e0943d2

File tree

2 files changed

+49
-7
lines changed

2 files changed

+49
-7
lines changed

docling_core/types/doc/document.py

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -542,32 +542,58 @@ class DocTagsDocument(BaseModel):
542542

543543
@classmethod
544544
def from_doctags_and_image_pairs(
545-
cls, doctags: List[Union[Path, str]], images: List[Union[Path, PILImage.Image]]
545+
cls,
546+
doctags: typing.Sequence[Union[Path, str]],
547+
images: Optional[List[Union[Path, PILImage.Image]]],
546548
):
547549
"""from_doctags_and_image_pairs."""
548-
if len(doctags) != len(images):
550+
if images is not None and len(doctags) != len(images):
549551
raise ValueError("Number of page doctags must be equal to page images!")
550552
doctags_doc = cls()
551553

552554
pages = []
553-
for dt, img in zip(doctags, images):
555+
556+
for ix, dt in enumerate(doctags):
554557
if isinstance(dt, Path):
555558
with dt.open("r") as fp:
556559
dt = fp.read()
557560
elif isinstance(dt, str):
558561
pass
559562

560-
if isinstance(img, Path):
561-
img = PILImage.open(img)
562-
elif isinstance(dt, PILImage.Image):
563-
pass
563+
img = None
564+
if images is not None:
565+
img = images[ix]
566+
567+
if isinstance(img, Path):
568+
img = PILImage.open(img)
569+
elif isinstance(img, PILImage.Image):
570+
pass
564571

565572
page = DocTagsPage(tokens=dt, image=img)
566573
pages.append(page)
567574

568575
doctags_doc.pages = pages
569576
return doctags_doc
570577

578+
@classmethod
def from_multipage_doctags_and_images(
    cls,
    doctags: Union[Path, str],
    images: Optional[List[Union[Path, PILImage.Image]]],
):
    """Build a document from multipage doctags and optional page images.

    The doctags text is unwrapped from its enclosing document tag, split
    into pages on the page-break tag, and each page is whitespace-stripped.
    The resulting per-page list is handed to
    ``from_doctags_and_image_pairs`` together with ``images``.

    Args:
        doctags: The full doctags text, or a ``Path`` to a file holding it.
        images: One image (or image path) per page, or ``None`` to build
            the document without page images.

    Returns:
        The object returned by ``from_doctags_and_image_pairs``.

    Raises:
        ValueError: Propagated from ``from_doctags_and_image_pairs`` when
            ``images`` is given and its length differs from the number of
            pages found in ``doctags``.
    """
    if isinstance(doctags, Path):
        # read_text() opens, reads and closes the file in one call.
        doctags = doctags.read_text()

    doc_open = f"<{DocumentToken.DOCUMENT.value}>"
    doc_close = f"</{DocumentToken.DOCUMENT.value}>"
    page_break = f"<{DocumentToken.PAGE_BREAK.value}>"

    # Strip before unwrapping: removeprefix/removesuffix only match at the
    # very ends of the string, so a trailing newline (typical for text read
    # from a file) would otherwise leave the closing tag on the last page.
    body = doctags.strip().removeprefix(doc_open).removesuffix(doc_close)
    dt_list = [page.strip() for page in body.split(page_break)]

    return cls.from_doctags_and_image_pairs(dt_list, images)
596+
571597

572598
class ProvenanceItem(BaseModel):
573599
"""ProvenanceItem."""

test/test_doctags_load.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,26 @@ def test_doctags_load_from_memory():
3030
# print(doc.export_to_html())
3131

3232

33+
def test_doctags_load_without_image():
    """Load a single doctags page when no page images are supplied."""
    doc = DoclingDocument(name="Document")
    # read_text() closes the file; a bare open().read() leaves the handle
    # open until garbage collection.
    doctags = Path("test/data/doc/page_with_pic.dt").read_text()
    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], None)
    # One doctags entry in, one page out.
    assert len(doctags_doc.pages) == 1
    doc.load_from_doctags(doctags_doc)
    # print(doc.export_to_html())
39+
40+
3341
def test_doctags_load_for_kv_region():
    """Load a doctags page together with its page image."""
    doc = DoclingDocument(name="Document")
    # read_text() closes the file; a bare open().read() leaves the handle
    # open until garbage collection.
    doctags = Path("test/data/doc/doc_with_kv.dt").read_text()
    image = PILImage.open(Path("test/data/doc/doc_with_kv.png"))
    doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctags], [image])
    # One doctags/image pair in, one page out.
    assert len(doctags_doc.pages) == 1
    doc.load_from_doctags(doctags_doc)
    # print(doc.export_to_html())
48+
49+
50+
def test_multipage_doctags_load():
    """Load a multipage doctags file without page images."""
    doc = DoclingDocument(name="Document")
    # read_text() closes the file; a bare open().read() leaves the handle
    # open until garbage collection.
    doctags = Path("test/data/doc/2206.01062.yaml.dt").read_text()
    doctags_doc = DocTagsDocument.from_multipage_doctags_and_images(doctags, None)
    doc.load_from_doctags(doctags_doc)
    # print(doc.export_to_html())

0 commit comments

Comments
 (0)