Skip to content

Commit 786f0c6

Browse files
cau-git and vagenas authored
feat: Add ContentLayer attribute to designate items to body or furniture (#148)
* feat: Add ContentLayer attribute to designate items to body or furniture
  Signed-off-by: Christoph Auer <[email protected]>
* introduce safer data gen mechanism, update chunking test data
  Signed-off-by: Panos Vagenas <[email protected]>
* Do not make test rely on order in yaml
  Signed-off-by: Christoph Auer <[email protected]>
* chore: format fixes
  Signed-off-by: Christoph Auer <[email protected]>
* fix: legacy_to_docling_doc must use content_layer
  Signed-off-by: Christoph Auer <[email protected]>
* Add content_layer in iterate_items
  Signed-off-by: Christoph Auer <[email protected]>
* Bump format version, add model_validator for old page_header,page_footer in body
  Signed-off-by: Christoph Auer <[email protected]>
* fix: Change to before model_validator
  Signed-off-by: Christoph Auer <[email protected]>
* Update tests
  Signed-off-by: Christoph Auer <[email protected]>
* Address review comments
  Signed-off-by: Christoph Auer <[email protected]>
---------
Signed-off-by: Christoph Auer <[email protected]>
Signed-off-by: Panos Vagenas <[email protected]>
Co-authored-by: Panos Vagenas <[email protected]>
1 parent 794c00d commit 786f0c6

28 files changed

+848
-1147
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ pip install docling-core
2323

2424
To develop for Docling Core, you need Python 3.9 / 3.10 / 3.11 / 3.12 / 3.13 and Poetry. You can then install from your local clone's root dir:
2525
```bash
26-
poetry install
26+
poetry install --all-extras
2727
```
2828

2929
To run the pytest suite, execute:

docling_core/types/doc/document.py

Lines changed: 105 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import textwrap
1414
import typing
1515
import warnings
16+
from enum import Enum
1617
from io import BytesIO
1718
from pathlib import Path
1819
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
@@ -54,7 +55,7 @@
5455

5556
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
5657
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
57-
CURRENT_VERSION: Final = "1.0.0"
58+
CURRENT_VERSION: Final = "1.1.0"
5859

5960
DEFAULT_EXPORT_LABELS = {
6061
DocItemLabel.TITLE,
@@ -70,6 +71,8 @@
7071
DocItemLabel.LIST_ITEM,
7172
DocItemLabel.CODE,
7273
DocItemLabel.REFERENCE,
74+
DocItemLabel.PAGE_HEADER,
75+
DocItemLabel.PAGE_FOOTER,
7376
}
7477

7578

@@ -513,13 +516,25 @@ class ProvenanceItem(BaseModel):
513516
charspan: Tuple[int, int]
514517

515518

519+
class ContentLayer(str, Enum):
    """Layer of a document that a node is assigned to.

    The class mixes in ``str``, so each member compares equal to its plain
    string value and serializes as that string (``"body"`` / ``"furniture"``).
    """

    # Main content of the document; the default layer for every NodeItem.
    BODY = "body"
    # Content outside the main body, e.g. page headers and page footers.
    FURNITURE = "furniture"
524+
525+
526+
# Content layers included by default: body only. Used as the default value of
# the `included_content_layers` parameter of `iterate_items`.
# NOTE(review): this is a mutable set shared as a default argument; treat it as
# read-only, since mutating it would change the default for every caller.
DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}
527+
528+
516529
class NodeItem(BaseModel):
517530
"""NodeItem."""
518531

519532
self_ref: str = Field(pattern=_JSON_POINTER_REGEX)
520533
parent: Optional[RefItem] = None
521534
children: List[RefItem] = []
522535

536+
content_layer: ContentLayer = ContentLayer.BODY
537+
523538
model_config = ConfigDict(extra="forbid")
524539

525540
def get_ref(self):
@@ -1442,8 +1457,8 @@ class DoclingDocument(BaseModel):
14421457
# generated from synthetic data.
14431458
)
14441459

1445-
furniture: GroupItem = GroupItem(
1446-
name="_root_", self_ref="#/furniture"
1460+
furniture: Annotated[GroupItem, Field(deprecated=True)] = GroupItem(
1461+
name="_root_", self_ref="#/furniture", content_layer=ContentLayer.FURNITURE
14471462
) # List[RefItem] = []
14481463
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
14491464

@@ -1455,11 +1470,28 @@ class DoclingDocument(BaseModel):
14551470

14561471
pages: Dict[int, PageItem] = {} # empty as default
14571472

1473+
@model_validator(mode="before")
@classmethod
def transform_to_content_layer(cls, data: Any) -> Any:
    """Migrate version 1.0.0 input to the content-layer representation.

    Since version 1.1.0 every NodeItem carries a ``content_layer`` property.
    For input declaring version "1.0.0", each entry of ``texts`` labelled as
    page header or page footer is assigned ``content_layer = "furniture"``.
    Entries are updated in place.

    Note: model validators which check on the version must use "before".

    Args:
        data: Raw input handed to the model. A "before" validator can
            receive any object, so anything other than a dict is passed
            through untouched for regular validation to accept or reject.

    Returns:
        The same ``data`` object, possibly with updated ``texts`` entries.
    """
    # Only raw 1.0.0 dicts need migrating; other input is left to the
    # regular field validation.
    if not isinstance(data, dict) or data.get("version") != "1.0.0":
        return data

    texts = data.get("texts")
    if not isinstance(texts, (list, tuple)):
        # Missing, null or malformed "texts": nothing to migrate here.
        return data

    furniture_labels = (
        DocItemLabel.PAGE_HEADER.value,
        DocItemLabel.PAGE_FOOTER.value,
    )
    for item in texts:
        # Entries that are not raw dicts cannot be patched in place.
        if isinstance(item, dict) and item.get("label") in furniture_labels:
            item["content_layer"] = "furniture"
    return data
1488+
14581489
def add_group(
14591490
self,
14601491
label: Optional[GroupLabel] = None,
14611492
name: Optional[str] = None,
14621493
parent: Optional[NodeItem] = None,
1494+
content_layer: Optional[ContentLayer] = None,
14631495
) -> GroupItem:
14641496
"""add_group.
14651497
@@ -1479,6 +1511,8 @@ def add_group(
14791511
group.name = name
14801512
if label is not None:
14811513
group.label = label
1514+
if content_layer:
1515+
group.content_layer = content_layer
14821516

14831517
self.groups.append(group)
14841518
parent.children.append(RefItem(cref=cref))
@@ -1493,6 +1527,7 @@ def add_list_item(
14931527
orig: Optional[str] = None,
14941528
prov: Optional[ProvenanceItem] = None,
14951529
parent: Optional[NodeItem] = None,
1530+
content_layer: Optional[ContentLayer] = None,
14961531
):
14971532
"""add_list_item.
14981533
@@ -1523,6 +1558,8 @@ def add_list_item(
15231558
)
15241559
if prov:
15251560
list_item.prov.append(prov)
1561+
if content_layer:
1562+
list_item.content_layer = content_layer
15261563

15271564
self.texts.append(list_item)
15281565
parent.children.append(RefItem(cref=cref))
@@ -1536,6 +1573,7 @@ def add_text(
15361573
orig: Optional[str] = None,
15371574
prov: Optional[ProvenanceItem] = None,
15381575
parent: Optional[NodeItem] = None,
1576+
content_layer: Optional[ContentLayer] = None,
15391577
):
15401578
"""add_text.
15411579
@@ -1549,16 +1587,40 @@ def add_text(
15491587
# Catch a few cases that are in principle allowed
15501588
# but that will create confusion down the road
15511589
if label in [DocItemLabel.TITLE]:
1552-
return self.add_title(text=text, orig=orig, prov=prov, parent=parent)
1590+
return self.add_title(
1591+
text=text,
1592+
orig=orig,
1593+
prov=prov,
1594+
parent=parent,
1595+
content_layer=content_layer,
1596+
)
15531597

15541598
elif label in [DocItemLabel.LIST_ITEM]:
1555-
return self.add_list_item(text=text, orig=orig, prov=prov, parent=parent)
1599+
return self.add_list_item(
1600+
text=text,
1601+
orig=orig,
1602+
prov=prov,
1603+
parent=parent,
1604+
content_layer=content_layer,
1605+
)
15561606

15571607
elif label in [DocItemLabel.SECTION_HEADER]:
1558-
return self.add_heading(text=text, orig=orig, prov=prov, parent=parent)
1608+
return self.add_heading(
1609+
text=text,
1610+
orig=orig,
1611+
prov=prov,
1612+
parent=parent,
1613+
content_layer=content_layer,
1614+
)
15591615

15601616
elif label in [DocItemLabel.CODE]:
1561-
return self.add_code(text=text, orig=orig, prov=prov, parent=parent)
1617+
return self.add_code(
1618+
text=text,
1619+
orig=orig,
1620+
prov=prov,
1621+
parent=parent,
1622+
content_layer=content_layer,
1623+
)
15621624

15631625
else:
15641626

@@ -1580,6 +1642,9 @@ def add_text(
15801642
if prov:
15811643
text_item.prov.append(prov)
15821644

1645+
if content_layer:
1646+
text_item.content_layer = content_layer
1647+
15831648
self.texts.append(text_item)
15841649
parent.children.append(RefItem(cref=cref))
15851650

@@ -1592,6 +1657,7 @@ def add_table(
15921657
prov: Optional[ProvenanceItem] = None,
15931658
parent: Optional[NodeItem] = None,
15941659
label: DocItemLabel = DocItemLabel.TABLE,
1660+
content_layer: Optional[ContentLayer] = None,
15951661
):
15961662
"""add_table.
15971663
@@ -1613,6 +1679,9 @@ def add_table(
16131679
)
16141680
if prov:
16151681
tbl_item.prov.append(prov)
1682+
if content_layer:
1683+
tbl_item.content_layer = content_layer
1684+
16161685
if caption:
16171686
tbl_item.captions.append(caption.get_ref())
16181687

@@ -1628,6 +1697,7 @@ def add_picture(
16281697
caption: Optional[Union[TextItem, RefItem]] = None,
16291698
prov: Optional[ProvenanceItem] = None,
16301699
parent: Optional[NodeItem] = None,
1700+
content_layer: Optional[ContentLayer] = None,
16311701
):
16321702
"""add_picture.
16331703
@@ -1652,6 +1722,8 @@ def add_picture(
16521722
)
16531723
if prov:
16541724
fig_item.prov.append(prov)
1725+
if content_layer:
1726+
fig_item.content_layer = content_layer
16551727
if caption:
16561728
fig_item.captions.append(caption.get_ref())
16571729

@@ -1666,6 +1738,7 @@ def add_title(
16661738
orig: Optional[str] = None,
16671739
prov: Optional[ProvenanceItem] = None,
16681740
parent: Optional[NodeItem] = None,
1741+
content_layer: Optional[ContentLayer] = None,
16691742
):
16701743
"""add_title.
16711744
@@ -1691,6 +1764,8 @@ def add_title(
16911764
)
16921765
if prov:
16931766
text_item.prov.append(prov)
1767+
if content_layer:
1768+
text_item.content_layer = content_layer
16941769

16951770
self.texts.append(text_item)
16961771
parent.children.append(RefItem(cref=cref))
@@ -1704,6 +1779,7 @@ def add_code(
17041779
orig: Optional[str] = None,
17051780
prov: Optional[ProvenanceItem] = None,
17061781
parent: Optional[NodeItem] = None,
1782+
content_layer: Optional[ContentLayer] = None,
17071783
):
17081784
"""add_code.
17091785
@@ -1729,6 +1805,8 @@ def add_code(
17291805
)
17301806
if code_language:
17311807
code_item.code_language = code_language
1808+
if content_layer:
1809+
code_item.content_layer = content_layer
17321810
if prov:
17331811
code_item.prov.append(prov)
17341812

@@ -1744,6 +1822,7 @@ def add_heading(
17441822
level: LevelNumber = 1,
17451823
prov: Optional[ProvenanceItem] = None,
17461824
parent: Optional[NodeItem] = None,
1825+
content_layer: Optional[ContentLayer] = None,
17471826
):
17481827
"""add_heading.
17491828
@@ -1771,6 +1850,8 @@ def add_heading(
17711850
)
17721851
if prov:
17731852
section_header_item.prov.append(prov)
1853+
if content_layer:
1854+
section_header_item.content_layer = content_layer
17741855

17751856
self.texts.append(section_header_item)
17761857
parent.children.append(RefItem(cref=cref))
@@ -1798,6 +1879,7 @@ def iterate_items(
17981879
with_groups: bool = False,
17991880
traverse_pictures: bool = False,
18001881
page_no: Optional[int] = None,
1882+
included_content_layers: set[ContentLayer] = DEFAULT_CONTENT_LAYERS,
18011883
_level: int = 0, # fixed parameter, carries through the node nesting level
18021884
) -> typing.Iterable[Tuple[NodeItem, int]]: # tuple of node and level
18031885
"""iterate_elements.
@@ -1814,14 +1896,22 @@ def iterate_items(
18141896
root = self.body
18151897

18161898
# Yield non-group items or group items when with_groups=True
1817-
if not isinstance(root, GroupItem) or with_groups:
1818-
if isinstance(root, DocItem):
1819-
if page_no is None or any(
1820-
prov.page_no == page_no for prov in root.prov
1821-
):
1822-
yield root, _level
1823-
else:
1824-
yield root, _level
1899+
1900+
# Combine conditions to have a single yield point
1901+
should_yield = (
1902+
(not isinstance(root, GroupItem) or with_groups)
1903+
and (
1904+
not isinstance(root, DocItem)
1905+
or (
1906+
page_no is None
1907+
or any(prov.page_no == page_no for prov in root.prov)
1908+
)
1909+
)
1910+
and root.content_layer in included_content_layers
1911+
)
1912+
1913+
if should_yield:
1914+
yield root, _level
18251915

18261916
# Handle picture traversal - only traverse children if requested
18271917
if isinstance(root, PictureItem) and not traverse_pictures:

docling_core/utils/legacy.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
TableItem,
2626
TextItem,
2727
)
28-
from docling_core.types.doc.document import GroupItem, ListItem, TableData
28+
from docling_core.types.doc.document import ContentLayer, GroupItem, ListItem, TableData
2929
from docling_core.types.doc.labels import GroupLabel
3030
from docling_core.types.legacy_doc.base import (
3131
BaseCell,
@@ -400,7 +400,7 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]:
400400
doc.add_text(
401401
label=DocItemLabel.PAGE_HEADER,
402402
text=text_item.text,
403-
parent=doc.furniture,
403+
content_layer=ContentLayer.FURNITURE,
404404
)
405405

406406
# page footers
@@ -412,7 +412,7 @@ def _transform_prov(item: BaseCell) -> Optional[ProvenanceItem]:
412412
doc.add_text(
413413
label=DocItemLabel.PAGE_FOOTER,
414414
text=text_item.text,
415-
parent=doc.furniture,
415+
content_layer=ContentLayer.FURNITURE,
416416
)
417417

418418
# footnotes

0 commit comments

Comments
 (0)