@@ -5683,69 +5683,137 @@ def validate_misplaced_list_items(self):
56835683 )
56845684 return self
56855685
class _DocIndex(BaseModel):
    """A document merge buffer.

    Collects deep copies of the node items and pages of every document passed
    to `index()`, rewriting self/parent/child references (and page numbers) so
    that the accumulated content is consistent on its own.
    """

    # One list per item kind; an item's position in its list is the index
    # encoded in its rewritten self reference ("#/<list name>/<position>").
    groups: list[GroupItem] = []
    texts: list[TextItem] = []
    pictures: list[PictureItem] = []
    tables: list[TableItem] = []
    key_value_items: list[KeyValueItem] = []
    form_items: list[FormItem] = []

    # Pages of all indexed documents, keyed by their shifted page number.
    pages: dict[int, PageItem] = {}

    # Copy of the first indexed document's body, rebuilt without children.
    _body: Optional[GroupItem] = None
    # Highest page number assigned by the latest index() call that had pages.
    _max_page: int = 0
    # Names of the indexed documents, in indexing order.
    _names: list[str] = []

    def get_item_list(self, key: str) -> list[NodeItem]:
        """Return the item list stored under the attribute named `key`."""
        return getattr(self, key)

    def index(self, doc: "DoclingDocument") -> None:
        """Append the content of `doc` to this buffer.

        Items are visited in `iterate_items()` order (groups, picture children
        and all content layers included). Each non-body item is deep-copied,
        given a new self reference based on its position in the buffer, and
        re-attached to the copy of its parent. Page numbers of provenance
        entries and pages are shifted so that the lowest page of `doc` follows
        the highest page indexed so far; for a fresh buffer this rebases the
        pages to start at 1.

        Args:
            doc: The document to index. It is not modified.

        Raises:
            RuntimeError: If a rewritten parent reference is neither "#/body"
                nor of the form "#/<list name>/<index>".
        """
        orig_ref_to_new_ref: dict[str, str] = {}
        # Original caption crefs of every copied floating item, keyed by the
        # item's new cref. Captions are matched against this snapshot instead
        # of the live `captions` list: the live list is rewritten in place, and
        # an already-rewritten entry may coincide with the *original* cref of a
        # caption that is visited later, which would clobber the wrong entry.
        orig_caption_crefs: dict[str, list[str]] = {}
        page_delta = (self._max_page - min(doc.pages) + 1) if doc.pages else 0

        if self._body is None:
            # the first indexed document provides the body's own fields
            self._body = GroupItem(**doc.body.model_dump(exclude={"children"}))

        self._names.append(doc.name)

        # collect items in traversal order
        for item, _ in doc.iterate_items(
            with_groups=True,
            traverse_pictures=True,
            included_content_layers=set(ContentLayer),
        ):
            key = item.self_ref.split("/")[1]
            is_body = key == "body"
            new_cref = (
                "#/body" if is_body else f"#/{key}/{len(self.get_item_list(key))}"
            )
            # register cref mapping:
            orig_ref_to_new_ref[item.self_ref] = new_cref

            if is_body:
                # the body itself is never copied into an item list
                continue

            new_item = copy.deepcopy(item)
            # children are re-added one by one as they are visited
            new_item.children = []

            # put item in the right list
            self.get_item_list(key).append(new_item)

            # update item's self reference
            new_item.self_ref = new_cref

            if isinstance(new_item, FloatingItem):
                orig_caption_crefs[new_cref] = [
                    cap.cref for cap in new_item.captions
                ]

            if isinstance(new_item, DocItem):
                # update page numbers
                # NOTE other prov sources (e.g. GraphCell) currently not covered
                for prov in new_item.prov:
                    prov.page_no += page_delta

            if not item.parent:
                continue

            # set item's parent (parents are visited before their children,
            # so the mapping is already registered)
            new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
            new_item.parent = RefItem(cref=new_parent_cref)

            # add item to parent's children
            path_components = new_parent_cref.split("/")
            num_components = len(path_components)
            if num_components == 3:
                _, parent_key, parent_index_str = path_components
                parent_index = int(parent_index_str)
                parent_item = self.get_item_list(parent_key)[parent_index]

                # update captions field (not possible in iterate_items order):
                # NOTE(review): only `captions` is remapped here; any other
                # reference lists a FloatingItem may carry keep their original
                # crefs - confirm whether they need the same treatment.
                if isinstance(parent_item, FloatingItem):
                    parent_caption_crefs = orig_caption_crefs.get(
                        new_parent_cref, []
                    )
                    for cap_it, orig_cap_cref in enumerate(parent_caption_crefs):
                        if orig_cap_cref == item.self_ref:
                            parent_item.captions[cap_it] = RefItem(cref=new_cref)
                            break

            elif num_components == 2 and path_components[1] == "body":
                parent_item = self._body
            else:
                raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
            parent_item.children.append(RefItem(cref=new_cref))

        # update pages
        new_max_page = None
        for page_nr in doc.pages:
            new_page = copy.deepcopy(doc.pages[page_nr])
            new_page_nr = page_nr + page_delta
            new_page.page_no = new_page_nr
            self.pages[new_page_nr] = new_page
            if new_max_page is None or new_page_nr > new_max_page:
                new_max_page = new_page_nr
        if new_max_page is not None:
            # documents without pages leave the running maximum untouched
            self._max_page = new_max_page

    def get_name(self) -> str:
        """Return the names of all indexed documents joined with " + "."""
        return " + ".join(self._names)
5789+
def _update_from_index(self, doc_index: "_DocIndex") -> None:
    """Take over the content accumulated in `doc_index`.

    The body is replaced only when the index holds one. The item lists, the
    pages and the name are always replaced. Containers are assigned as-is
    (no copy is made here).
    """
    indexed_body = doc_index._body
    if indexed_body is not None:
        self.body = indexed_body

    # assigned one by one, in this fixed order
    for field_name in (
        "groups",
        "texts",
        "pictures",
        "tables",
        "key_value_items",
        "form_items",
        "pages",
    ):
        setattr(self, field_name, getattr(doc_index, field_name))

    self.name = doc_index.get_name()
5801+
def _normalize_references(self) -> None:
    """Renumber item references to follow iterate_items() traversal order.

    Indexes this document into a fresh merge buffer and then replaces its
    own content with the buffer's.
    """
    merge_buffer = DoclingDocument._DocIndex()
    # indexing assigns every item a new position-based reference
    merge_buffer.index(doc=self)
    self._update_from_index(merge_buffer)
@classmethod
def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
    """Concatenate multiple documents into a single document.

    Every document is indexed, in the given order, into one shared merge
    buffer. A new document named after all inputs (names joined with " + ")
    then takes over the buffer's content and is returned.
    """
    merge_buffer = DoclingDocument._DocIndex()
    for source_doc in docs:
        merge_buffer.index(doc=source_doc)

    combined_name = " + ".join(doc.name for doc in docs)
    merged_doc = DoclingDocument(name=combined_name)
    merged_doc._update_from_index(merge_buffer)
    return merged_doc
57495817
57505818
57515819# deprecated aliases (kept for backwards compatibility):
0 commit comments