Skip to content

Commit 99eabb3

Browse files
authored
feat: add document concatenation (#365)
* chore: add document concatenation

  Signed-off-by: Panos Vagenas <[email protected]>

* finalize exposed API, add tests

  Signed-off-by: Panos Vagenas <[email protected]>

---------

Signed-off-by: Panos Vagenas <[email protected]>
1 parent 23badf2 commit 99eabb3

File tree

6 files changed

+29559
-61
lines changed

6 files changed

+29559
-61
lines changed

docling_core/types/doc/document.py

Lines changed: 129 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -5683,69 +5683,137 @@ def validate_misplaced_list_items(self):
56835683
)
56845684
return self
56855685

5686+
class _DocIndex(BaseModel):
    """A document merge buffer.

    Accumulates copies of the node items and pages of one or more documents in
    flat item lists. Each indexed item gets a new ``self_ref`` matching its
    position in the buffer (items are visited in ``iterate_items()`` order), and
    page numbers are shifted so that the pages of each indexed document follow
    the pages indexed before it.
    """

    # flat item lists; the attribute name is the second component of an item's
    # self_ref (e.g. "#/texts/3" lives in `texts`)
    groups: list[GroupItem] = []
    texts: list[TextItem] = []
    pictures: list[PictureItem] = []
    tables: list[TableItem] = []
    key_value_items: list[KeyValueItem] = []
    form_items: list[FormItem] = []

    # pages of all indexed documents, keyed by their shifted page number
    pages: dict[int, PageItem] = {}

    # body of the first indexed document, copied without its children
    _body: Optional[GroupItem] = None
    # highest page number stored in the buffer so far
    _max_page: int = 0
    # names of the indexed documents, in indexing order
    _names: list[str] = []

    def get_item_list(self, key: str) -> list[NodeItem]:
        """Return the item list stored under the attribute named ``key``."""
        return getattr(self, key)

    def index(self, doc: "DoclingDocument") -> None:
        """Append copies of all items and pages of ``doc`` to this buffer.

        Items are deep-copied, re-numbered and re-attached to the copies of
        their parents; ``doc`` itself is not modified. Page numbers of ``doc``
        are shifted so that its lowest page number becomes ``_max_page + 1``
        (i.e. 1 for the first document indexed into a fresh buffer). Documents
        without pages are not shifted.

        Args:
            doc: The document to add to the buffer.

        Raises:
            RuntimeError: If a remapped parent reference is neither ``#/body``
                nor of the form ``#/<list>/<index>``.
        """
        orig_ref_to_new_ref: dict[str, str] = {}
        page_delta = (self._max_page - min(doc.pages) + 1) if doc.pages else 0

        if self._body is None:
            self._body = GroupItem(**doc.body.model_dump(exclude={"children"}))

        self._names.append(doc.name)

        # collect items in traversal order
        for item, _ in doc.iterate_items(
            with_groups=True,
            traverse_pictures=True,
            included_content_layers=set(ContentLayer),
        ):
            key = item.self_ref.split("/")[1]
            is_body = key == "body"
            new_cref = (
                "#/body" if is_body else f"#/{key}/{len(self.get_item_list(key))}"
            )
            # register cref mapping:
            orig_ref_to_new_ref[item.self_ref] = new_cref

            if is_body:
                # the body is represented by `_body`; nothing to copy or attach
                continue

            new_item = copy.deepcopy(item)
            new_item.children = []

            # put item in the right list
            self.get_item_list(key).append(new_item)

            # update item's self reference
            new_item.self_ref = new_cref

            if isinstance(new_item, DocItem):
                # update page numbers
                # NOTE other prov sources (e.g. GraphCell) currently not covered
                for prov in new_item.prov:
                    prov.page_no += page_delta

            if not item.parent:
                continue

            # set item's parent (parents are visited before their children, so
            # the mapping already holds the parent's new ref)
            new_parent_cref = orig_ref_to_new_ref[item.parent.cref]
            new_item.parent = RefItem(cref=new_parent_cref)

            # add item to parent's children
            path_components = new_parent_cref.split("/")
            num_components = len(path_components)
            if num_components == 3:
                _, parent_key, parent_index_str = path_components
                parent_index = int(parent_index_str)
                parent_item = self.get_item_list(parent_key)[parent_index]

                # The parent copy was made before this item got its new ref, so
                # its back-references still hold the item's original cref and
                # cannot be fixed in iterate_items order; patch them here.
                # NOTE(review): `footnotes` / `references` are assumed to be
                # lists of RefItem like `captions`; they are skipped if the
                # parent does not expose them.
                if isinstance(parent_item, FloatingItem):
                    for field_name in ("captions", "footnotes", "references"):
                        refs = getattr(parent_item, field_name, None) or []
                        for ref_idx, ref in enumerate(refs):
                            if ref.cref == item.self_ref:
                                refs[ref_idx] = RefItem(cref=new_cref)
                                break

            elif num_components == 2 and path_components[1] == "body":
                parent_item = self._body
            else:
                raise RuntimeError(f"Unsupported ref format: {new_parent_cref}")
            parent_item.children.append(RefItem(cref=new_cref))

        # update pages
        for page_nr, page in doc.pages.items():
            new_page = copy.deepcopy(page)
            new_page_nr = page_nr + page_delta
            new_page.page_no = new_page_nr
            self.pages[new_page_nr] = new_page
        if doc.pages:
            self._max_page = max(doc.pages) + page_delta

    def get_name(self) -> str:
        """Return the names of all indexed documents joined with ``" + "``."""
        return " + ".join(self._names)
5789+
5790+
def _update_from_index(self, doc_index: "_DocIndex") -> None:
    """Replace this document's content with the content held by ``doc_index``.

    The body is replaced only if the index holds one. The item lists, the
    pages and the name are always taken from the index.
    """
    indexed_body = doc_index._body
    if indexed_body is not None:
        self.body = indexed_body

    # take over the item lists and the pages, in this order
    for attr_name in (
        "groups",
        "texts",
        "pictures",
        "tables",
        "key_value_items",
        "form_items",
        "pages",
    ):
        setattr(self, attr_name, getattr(doc_index, attr_name))

    self.name = doc_index.get_name()
5801+
56865802
def _normalize_references(self) -> None:
    """Normalize ref numbering by ordering node items as per iterate_items().

    The document is indexed into a fresh merge buffer and then updated from
    that buffer, which replaces its body, item lists, pages and name.

    NOTE: indexing into a fresh buffer also shifts page numbers so that the
    lowest page number of the document becomes 1.
    """
    doc_index = DoclingDocument._DocIndex()
    doc_index.index(doc=self)
    self._update_from_index(doc_index)
5807+
@classmethod
def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
    """Concatenate multiple documents into a single document.

    Args:
        docs: The documents to merge, in the order their content should appear.

    Returns:
        A new document built from the merged content of ``docs``; its name is
        the names of ``docs`` joined with ``" + "``.
    """
    merge_buffer = DoclingDocument._DocIndex()
    for source_doc in docs:
        merge_buffer.index(doc=source_doc)

    combined_name = " + ".join(source_doc.name for source_doc in docs)
    merged_doc = DoclingDocument(name=combined_name)
    merged_doc._update_from_index(merge_buffer)
    return merged_doc
57495817

57505818

57515819
# deprecated aliases (kept for backwards compatibility):

0 commit comments

Comments
 (0)