1313import textwrap
1414import typing
1515import warnings
16+ from enum import Enum
1617from io import BytesIO
1718from pathlib import Path
1819from typing import Any , Dict , Final , List , Literal , Optional , Tuple , Union
5455
5556Uint64 = typing .Annotated [int , Field (ge = 0 , le = (2 ** 64 - 1 ))]
5657LevelNumber = typing .Annotated [int , Field (ge = 1 , le = 100 )]
57- CURRENT_VERSION : Final = "1.0 .0"
58+ CURRENT_VERSION : Final = "1.1 .0"
5859
5960DEFAULT_EXPORT_LABELS = {
6061 DocItemLabel .TITLE ,
7071 DocItemLabel .LIST_ITEM ,
7172 DocItemLabel .CODE ,
7273 DocItemLabel .REFERENCE ,
74+ DocItemLabel .PAGE_HEADER ,
75+ DocItemLabel .PAGE_FOOTER ,
7376}
7477
7578
@@ -513,13 +516,25 @@ class ProvenanceItem(BaseModel):
513516 charspan : Tuple [int , int ]
514517
515518
class ContentLayer(str, Enum):
    """Layer of the document that a node belongs to.

    Members are plain strings as well as enum values, so they compare equal
    to (and serialize as) their lowercase names.
    """

    # Regular document content.
    BODY = "body"
    # Content kept apart from the body layer.
    FURNITURE = "furniture"
524+
525+
# Content layers included by default when iterating over document items:
# only the body layer. NOTE: this set object is used directly as a default
# argument value, so it is shared between calls and must not be mutated.
DEFAULT_CONTENT_LAYERS = {ContentLayer.BODY}
527+
528+
516529class NodeItem (BaseModel ):
517530 """NodeItem."""
518531
519532 self_ref : str = Field (pattern = _JSON_POINTER_REGEX )
520533 parent : Optional [RefItem ] = None
521534 children : List [RefItem ] = []
522535
536+ content_layer : ContentLayer = ContentLayer .BODY
537+
523538 model_config = ConfigDict (extra = "forbid" )
524539
525540 def get_ref (self ):
@@ -1442,8 +1457,8 @@ class DoclingDocument(BaseModel):
14421457 # generated from synthetic data.
14431458 )
14441459
1445- furniture : GroupItem = GroupItem (
1446- name = "_root_" , self_ref = "#/furniture"
1460+ furniture : Annotated [ GroupItem , Field ( deprecated = True )] = GroupItem (
1461+ name = "_root_" , self_ref = "#/furniture" , content_layer = ContentLayer . FURNITURE
14471462 ) # List[RefItem] = []
14481463 body : GroupItem = GroupItem (name = "_root_" , self_ref = "#/body" ) # List[RefItem] = []
14491464
@@ -1455,11 +1470,28 @@ class DoclingDocument(BaseModel):
14551470
14561471 pages : Dict [int , PageItem ] = {} # empty as default
14571472
@model_validator(mode="before")
@classmethod
def transform_to_content_layer(cls, data: Any) -> Any:
    """Upgrade raw version "1.0.0" input to carry ``content_layer`` values.

    Since version 1.1.0 every NodeItem has a ``content_layer`` property.
    Text entries of a 1.0.0 document labelled as page header or page footer
    are assigned to the furniture layer; everything else is left untouched
    and therefore falls back to the field default.

    Args:
        data: The raw, not yet validated input. For a "before" validator
            this is not guaranteed to be a dict.

    Returns:
        The same ``data`` object. Matching entries of ``data["texts"]`` are
        updated in place.
    """
    # Note: model_validators which check on the version must use "before".
    # Anything that is not a dict (or not a 1.0.0 document) is passed through
    # unchanged so that regular validation reports any problem with it.
    if not isinstance(data, dict) or data.get("version") != "1.0.0":
        return data

    furniture_labels = (
        DocItemLabel.PAGE_HEADER.value,
        DocItemLabel.PAGE_FOOTER.value,
    )
    # `or []` tolerates an explicit `"texts": None`; malformed entries are
    # skipped here and left for field validation to reject.
    for item in data.get("texts") or []:
        if isinstance(item, dict) and item.get("label") in furniture_labels:
            item["content_layer"] = ContentLayer.FURNITURE.value
    return data
1488+
14581489 def add_group (
14591490 self ,
14601491 label : Optional [GroupLabel ] = None ,
14611492 name : Optional [str ] = None ,
14621493 parent : Optional [NodeItem ] = None ,
1494+ content_layer : Optional [ContentLayer ] = None ,
14631495 ) -> GroupItem :
14641496 """add_group.
14651497
@@ -1479,6 +1511,8 @@ def add_group(
14791511 group .name = name
14801512 if label is not None :
14811513 group .label = label
1514+ if content_layer :
1515+ group .content_layer = content_layer
14821516
14831517 self .groups .append (group )
14841518 parent .children .append (RefItem (cref = cref ))
@@ -1493,6 +1527,7 @@ def add_list_item(
14931527 orig : Optional [str ] = None ,
14941528 prov : Optional [ProvenanceItem ] = None ,
14951529 parent : Optional [NodeItem ] = None ,
1530+ content_layer : Optional [ContentLayer ] = None ,
14961531 ):
14971532 """add_list_item.
14981533
@@ -1523,6 +1558,8 @@ def add_list_item(
15231558 )
15241559 if prov :
15251560 list_item .prov .append (prov )
1561+ if content_layer :
1562+ list_item .content_layer = content_layer
15261563
15271564 self .texts .append (list_item )
15281565 parent .children .append (RefItem (cref = cref ))
@@ -1536,6 +1573,7 @@ def add_text(
15361573 orig : Optional [str ] = None ,
15371574 prov : Optional [ProvenanceItem ] = None ,
15381575 parent : Optional [NodeItem ] = None ,
1576+ content_layer : Optional [ContentLayer ] = None ,
15391577 ):
15401578 """add_text.
15411579
@@ -1549,16 +1587,40 @@ def add_text(
15491587 # Catch a few cases that are in principle allowed
15501588 # but that will create confusion down the road
15511589 if label in [DocItemLabel .TITLE ]:
1552- return self .add_title (text = text , orig = orig , prov = prov , parent = parent )
1590+ return self .add_title (
1591+ text = text ,
1592+ orig = orig ,
1593+ prov = prov ,
1594+ parent = parent ,
1595+ content_layer = content_layer ,
1596+ )
15531597
15541598 elif label in [DocItemLabel .LIST_ITEM ]:
1555- return self .add_list_item (text = text , orig = orig , prov = prov , parent = parent )
1599+ return self .add_list_item (
1600+ text = text ,
1601+ orig = orig ,
1602+ prov = prov ,
1603+ parent = parent ,
1604+ content_layer = content_layer ,
1605+ )
15561606
15571607 elif label in [DocItemLabel .SECTION_HEADER ]:
1558- return self .add_heading (text = text , orig = orig , prov = prov , parent = parent )
1608+ return self .add_heading (
1609+ text = text ,
1610+ orig = orig ,
1611+ prov = prov ,
1612+ parent = parent ,
1613+ content_layer = content_layer ,
1614+ )
15591615
15601616 elif label in [DocItemLabel .CODE ]:
1561- return self .add_code (text = text , orig = orig , prov = prov , parent = parent )
1617+ return self .add_code (
1618+ text = text ,
1619+ orig = orig ,
1620+ prov = prov ,
1621+ parent = parent ,
1622+ content_layer = content_layer ,
1623+ )
15621624
15631625 else :
15641626
@@ -1580,6 +1642,9 @@ def add_text(
15801642 if prov :
15811643 text_item .prov .append (prov )
15821644
1645+ if content_layer :
1646+ text_item .content_layer = content_layer
1647+
15831648 self .texts .append (text_item )
15841649 parent .children .append (RefItem (cref = cref ))
15851650
@@ -1592,6 +1657,7 @@ def add_table(
15921657 prov : Optional [ProvenanceItem ] = None ,
15931658 parent : Optional [NodeItem ] = None ,
15941659 label : DocItemLabel = DocItemLabel .TABLE ,
1660+ content_layer : Optional [ContentLayer ] = None ,
15951661 ):
15961662 """add_table.
15971663
@@ -1613,6 +1679,9 @@ def add_table(
16131679 )
16141680 if prov :
16151681 tbl_item .prov .append (prov )
1682+ if content_layer :
1683+ tbl_item .content_layer = content_layer
1684+
16161685 if caption :
16171686 tbl_item .captions .append (caption .get_ref ())
16181687
@@ -1628,6 +1697,7 @@ def add_picture(
16281697 caption : Optional [Union [TextItem , RefItem ]] = None ,
16291698 prov : Optional [ProvenanceItem ] = None ,
16301699 parent : Optional [NodeItem ] = None ,
1700+ content_layer : Optional [ContentLayer ] = None ,
16311701 ):
16321702 """add_picture.
16331703
@@ -1652,6 +1722,8 @@ def add_picture(
16521722 )
16531723 if prov :
16541724 fig_item .prov .append (prov )
1725+ if content_layer :
1726+ fig_item .content_layer = content_layer
16551727 if caption :
16561728 fig_item .captions .append (caption .get_ref ())
16571729
@@ -1666,6 +1738,7 @@ def add_title(
16661738 orig : Optional [str ] = None ,
16671739 prov : Optional [ProvenanceItem ] = None ,
16681740 parent : Optional [NodeItem ] = None ,
1741+ content_layer : Optional [ContentLayer ] = None ,
16691742 ):
16701743 """add_title.
16711744
@@ -1691,6 +1764,8 @@ def add_title(
16911764 )
16921765 if prov :
16931766 text_item .prov .append (prov )
1767+ if content_layer :
1768+ text_item .content_layer = content_layer
16941769
16951770 self .texts .append (text_item )
16961771 parent .children .append (RefItem (cref = cref ))
@@ -1704,6 +1779,7 @@ def add_code(
17041779 orig : Optional [str ] = None ,
17051780 prov : Optional [ProvenanceItem ] = None ,
17061781 parent : Optional [NodeItem ] = None ,
1782+ content_layer : Optional [ContentLayer ] = None ,
17071783 ):
17081784 """add_code.
17091785
@@ -1729,6 +1805,8 @@ def add_code(
17291805 )
17301806 if code_language :
17311807 code_item .code_language = code_language
1808+ if content_layer :
1809+ code_item .content_layer = content_layer
17321810 if prov :
17331811 code_item .prov .append (prov )
17341812
@@ -1744,6 +1822,7 @@ def add_heading(
17441822 level : LevelNumber = 1 ,
17451823 prov : Optional [ProvenanceItem ] = None ,
17461824 parent : Optional [NodeItem ] = None ,
1825+ content_layer : Optional [ContentLayer ] = None ,
17471826 ):
17481827 """add_heading.
17491828
@@ -1771,6 +1850,8 @@ def add_heading(
17711850 )
17721851 if prov :
17731852 section_header_item .prov .append (prov )
1853+ if content_layer :
1854+ section_header_item .content_layer = content_layer
17741855
17751856 self .texts .append (section_header_item )
17761857 parent .children .append (RefItem (cref = cref ))
@@ -1798,6 +1879,7 @@ def iterate_items(
17981879 with_groups : bool = False ,
17991880 traverse_pictures : bool = False ,
18001881 page_no : Optional [int ] = None ,
1882+ included_content_layers : set [ContentLayer ] = DEFAULT_CONTENT_LAYERS ,
18011883 _level : int = 0 , # fixed parameter, carries through the node nesting level
18021884 ) -> typing .Iterable [Tuple [NodeItem , int ]]: # tuple of node and level
18031885 """iterate_elements.
@@ -1814,14 +1896,22 @@ def iterate_items(
18141896 root = self .body
18151897
18161898 # Yield non-group items or group items when with_groups=True
1817- if not isinstance (root , GroupItem ) or with_groups :
1818- if isinstance (root , DocItem ):
1819- if page_no is None or any (
1820- prov .page_no == page_no for prov in root .prov
1821- ):
1822- yield root , _level
1823- else :
1824- yield root , _level
1899+
1900+ # Combine conditions to have a single yield point
1901+ should_yield = (
1902+ (not isinstance (root , GroupItem ) or with_groups )
1903+ and (
1904+ not isinstance (root , DocItem )
1905+ or (
1906+ page_no is None
1907+ or any (prov .page_no == page_no for prov in root .prov )
1908+ )
1909+ )
1910+ and root .content_layer in included_content_layers
1911+ )
1912+
1913+ if should_yield :
1914+ yield root , _level
18251915
18261916 # Handle picture traversal - only traverse children if requested
18271917 if isinstance (root , PictureItem ) and not traverse_pictures :
0 commit comments