Skip to content

Commit d622800

Browse files
Saidgurbuzcau-gitPeterStaar-IBM
authored
feat: Introduce Key-Value and Forms items (#158)
* Draft KeyValueItem content Signed-off-by: Christoph Auer <[email protected]> * added add_key_value_item method Signed-off-by: Saidgurbuz <[email protected]> * add KeyValueLink Signed-off-by: Saidgurbuz <[email protected]> * update an add_key_value_item argument Signed-off-by: Saidgurbuz <[email protected]> * remove KeyOrValueCellType and KeyValueLinkType Signed-off-by: Saidgurbuz <[email protected]> * update tests for KeyValueItem Signed-off-by: Saidgurbuz <[email protected]> * add union method to create bbox that covers all the given bboxes Signed-off-by: Saidgurbuz <[email protected]> * added the , and rewrote it to make it more general with a GraphItem, as well as having cell- and link-labels Signed-off-by: Peter Staar <[email protected]> * updated the code due to MyPy and reformatted Signed-off-by: Peter Staar <[email protected]> * updated the reference docs with form-key Signed-off-by: Peter Staar <[email protected]> * updated image Signed-off-by: Peter Staar <[email protected]> * removed the circle Signed-off-by: Peter Staar <[email protected]> * fixed the figure, testing now ... 
Signed-off-by: Peter Staar <[email protected]> * added square in image Signed-off-by: Peter Staar <[email protected]> * removed square in image Signed-off-by: Peter Staar <[email protected]> * tests should go through Signed-off-by: Peter Staar <[email protected]> * update poetry.lock versions Signed-off-by: Saidgurbuz <[email protected]> * fix import issue Signed-off-by: Saidgurbuz <[email protected]> * add validator for links in GraphData Signed-off-by: Peter Staar <[email protected]> * added field_validator Signed-off-by: Peter Staar <[email protected]> * rename add_key_values and add_form Signed-off-by: Saidgurbuz <[email protected]> * rename union to enclosingbbox Signed-off-by: Saidgurbuz <[email protected]> --------- Signed-off-by: Christoph Auer <[email protected]> Signed-off-by: Saidgurbuz <[email protected]> Signed-off-by: Peter Staar <[email protected]> Co-authored-by: Christoph Auer <[email protected]> Co-authored-by: Peter Staar <[email protected]>
1 parent 23fa0b5 commit d622800

19 files changed

+858
-28
lines changed

docling_core/types/doc/base.py

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Models for the base data types."""
22

33
from enum import Enum
4-
from typing import Tuple
4+
from typing import List, Tuple
55

66
from pydantic import BaseModel
77

@@ -365,3 +365,30 @@ def is_horizontally_connected(
365365
raise ValueError("BoundingBoxes have different CoordOrigin")
366366

367367
return False
368+
369+
@classmethod
def enclosing_bbox(cls, boxes: List["BoundingBox"]) -> "BoundingBox":
    """Create the smallest bounding box that covers all of the given boxes.

    :param boxes: List["BoundingBox"]: non-empty list of boxes that all
        share the same ``coord_origin``
    :returns: a new box spanning the extreme edges of ``boxes``, expressed
        in their common ``coord_origin``
    :raises ValueError: if ``boxes`` is empty, if the boxes do not share one
        ``coord_origin``, or if that origin is neither ``TOPLEFT`` nor
        ``BOTTOMLEFT``
    """
    if not boxes:
        raise ValueError("No bounding boxes provided for union.")

    origin = boxes[0].coord_origin
    if any(box.coord_origin != origin for box in boxes):
        # Adjacent string literals are concatenated at compile time. A
        # backslash continuation *inside* the literal would instead embed
        # the following line's indentation in the message.
        raise ValueError(
            "All bounding boxes must have the same "
            "CoordOrigin to compute their union."
        )

    # The horizontal extent does not depend on the coordinate origin.
    left = min(box.l for box in boxes)
    right = max(box.r for box in boxes)

    if origin == CoordOrigin.TOPLEFT:
        # Top edge is the smallest t, bottom edge the largest b.
        top = min(box.t for box in boxes)
        bottom = max(box.b for box in boxes)
    elif origin == CoordOrigin.BOTTOMLEFT:
        # Top edge is the largest t, bottom edge the smallest b.
        top = max(box.t for box in boxes)
        bottom = min(box.b for box in boxes)
    else:
        # All boxes agree on the origin at this point, so the problem is an
        # origin this method does not know how to handle.
        raise ValueError(f"Unsupported CoordOrigin: {origin}")

    return cls(l=left, t=top, r=right, b=bottom, coord_origin=origin)

docling_core/types/doc/document.py

Lines changed: 164 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,13 @@
4343
from docling_core.types.base import _JSON_POINTER_REGEX
4444
from docling_core.types.doc import BoundingBox, Size
4545
from docling_core.types.doc.base import ImageRefMode
46-
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
46+
from docling_core.types.doc.labels import (
47+
CodeLanguageLabel,
48+
DocItemLabel,
49+
GraphCellLabel,
50+
GraphLinkLabel,
51+
GroupLabel,
52+
)
4753
from docling_core.types.doc.tokens import DocumentToken, TableToken
4854
from docling_core.types.doc.utils import (
4955
get_html_tag_with_text_direction,
@@ -1101,7 +1107,9 @@ def export_to_markdown(self) -> str:
11011107
return md_table
11021108

11031109
def export_to_html(
1104-
self, doc: Optional["DoclingDocument"] = None, add_caption: bool = True
1110+
self,
1111+
doc: Optional["DoclingDocument"] = None,
1112+
add_caption: bool = True,
11051113
) -> str:
11061114
"""Export the table as html."""
11071115
if doc is None:
@@ -1330,11 +1338,73 @@ def export_to_document_tokens(
13301338
return body
13311339

13321340

1333-
class KeyValueItem(DocItem):
1341+
class GraphCell(BaseModel):
    """A single cell (node) of a graph."""

    # category of the cell
    label: GraphCellLabel

    # identifier that GraphLink.source_cell_id / target_cell_id refer to
    cell_id: int

    # sanitized text
    text: str
    # text as seen on document
    orig: str

    # optional provenance of the cell
    prov: Optional[ProvenanceItem] = None

    # in case you have a text, table or picture item
    item_ref: Optional[RefItem] = None
1355+
1356+
1357+
class GraphLink(BaseModel):
    """A labelled link from a source cell to a target cell."""

    # category of the link
    label: GraphLinkLabel

    # cell_id values of the two cells joined by this link
    source_cell_id: int
    target_cell_id: int
1364+
1365+
1366+
class GraphData(BaseModel):
    """Graph made of cells and the links between them."""

    cells: List[GraphCell] = Field(default_factory=list)
    links: List[GraphLink] = Field(default_factory=list)

    @field_validator("links")
    @classmethod
    def validate_links(cls, links, info):
        """Check that both ends of every link refer to an existing cell.

        :param links: the links being validated
        :param info: validation info; its ``data`` is read for ``cells``
        :returns: ``links`` unchanged when every link is valid
        :raises ValueError: if a link's ``source_cell_id`` or
            ``target_cell_id`` does not match the ``cell_id`` of any cell
        """
        # No cells are assumed when "cells" is missing from info.data.
        known_ids = {cell.cell_id for cell in info.data.get("cells", [])}

        for link in links:
            # The source end is checked (and reported) before the target end.
            source_known = link.source_cell_id in known_ids
            if not source_known:
                raise ValueError(
                    f"Invalid source_cell_id {link.source_cell_id} in GraphLink"
                )

            target_known = link.target_cell_id in known_ids
            if not target_known:
                raise ValueError(
                    f"Invalid target_cell_id {link.target_cell_id} in GraphLink"
                )

        return links
1391+
1392+
1393+
class KeyValueItem(FloatingItem):
    """Floating item for a key-value region, stored as a graph."""

    # label is fixed to KEY_VALUE_REGION for this item type
    label: typing.Literal[DocItemLabel.KEY_VALUE_REGION] = (
        DocItemLabel.KEY_VALUE_REGION
    )

    # cells and links of the region
    graph: GraphData
1399+
1400+
1401+
class FormItem(FloatingItem):
    """Floating item for a form, stored as a graph."""

    # label is fixed to FORM for this item type
    label: typing.Literal[DocItemLabel.FORM] = (
        DocItemLabel.FORM
    )

    # cells and links of the form
    graph: GraphData
1407+
13381408

13391409
ContentItem = Annotated[
13401410
Union[
@@ -1446,7 +1516,9 @@ class DoclingDocument(BaseModel):
14461516
)
14471517

14481518
furniture: Annotated[GroupItem, Field(deprecated=True)] = GroupItem(
1449-
name="_root_", self_ref="#/furniture", content_layer=ContentLayer.FURNITURE
1519+
name="_root_",
1520+
self_ref="#/furniture",
1521+
content_layer=ContentLayer.FURNITURE,
14501522
) # List[RefItem] = []
14511523
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
14521524

@@ -1455,6 +1527,7 @@ class DoclingDocument(BaseModel):
14551527
pictures: List[PictureItem] = []
14561528
tables: List[TableItem] = []
14571529
key_value_items: List[KeyValueItem] = []
1530+
form_items: List[FormItem] = []
14581531

14591532
pages: Dict[int, PageItem] = {} # empty as default
14601533

@@ -1851,6 +1924,68 @@ def add_heading(
18511924

18521925
return section_header_item
18531926

1927+
def add_key_values(
    self,
    graph: GraphData,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[NodeItem] = None,
):
    """Create a KeyValueItem from a graph and register it in the document.

    :param graph: GraphData: graph stored on the new item
    :param prov: Optional[ProvenanceItem]: provenance appended to the new
        item when given (Default value = None)
    :param parent: Optional[NodeItem]: node that receives the reference to
        the new item; ``self.body`` is used when omitted
        (Default value = None)
    :returns: the newly created KeyValueItem
    """
    owner = parent or self.body

    # The position the item is about to take in the list forms its reference.
    ref = f"#/key_value_items/{len(self.key_value_items)}"

    item = KeyValueItem(graph=graph, self_ref=ref, parent=owner.get_ref())
    if prov:
        item.prov.append(prov)

    # Store the item and link it from its parent.
    self.key_value_items.append(item)
    owner.children.append(RefItem(cref=ref))

    return item
1957+
1958+
def add_form(
    self,
    graph: GraphData,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[NodeItem] = None,
):
    """Create a FormItem from a graph and register it in the document.

    :param graph: GraphData: graph stored on the new item
    :param prov: Optional[ProvenanceItem]: provenance appended to the new
        item when given (Default value = None)
    :param parent: Optional[NodeItem]: node that receives the reference to
        the new item; ``self.body`` is used when omitted
        (Default value = None)
    :returns: the newly created FormItem
    """
    owner = parent or self.body

    # The position the item is about to take in the list forms its reference.
    ref = f"#/form_items/{len(self.form_items)}"

    item = FormItem(graph=graph, self_ref=ref, parent=owner.get_ref())
    if prov:
        item.prov.append(prov)

    # Store the item and link it from its parent.
    self.form_items.append(item)
    owner.children.append(RefItem(cref=ref))

    return item
1988+
18541989
def num_pages(self):
18551990
"""num_pages."""
18561991
return len(self.pages.values())
@@ -2009,7 +2144,8 @@ def _with_pictures_refs(
20092144
img.save(loc_path)
20102145
if reference_path is not None:
20112146
obj_path = relative_path(
2012-
reference_path.resolve(), loc_path.resolve()
2147+
reference_path.resolve(),
2148+
loc_path.resolve(),
20132149
)
20142150
else:
20152151
obj_path = loc_path
@@ -2027,7 +2163,10 @@ def print_element_tree(self):
20272163
"""Print_element_tree."""
20282164
for ix, (item, level) in enumerate(self.iterate_items(with_groups=True)):
20292165
if isinstance(item, GroupItem):
2030-
print(" " * level, f"{ix}: {item.label.value} with name={item.name}")
2166+
print(
2167+
" " * level,
2168+
f"{ix}: {item.label.value} with name={item.name}",
2169+
)
20312170
elif isinstance(item, DocItem):
20322171
print(" " * level, f"{ix}: {item.label.value}")
20332172

@@ -2519,7 +2658,11 @@ def close_lists(
25192658

25202659
return (in_ordered_list, html_texts)
25212660

2522-
head_lines = ["<!DOCTYPE html>", f'<html lang="{html_lang}">', html_head]
2661+
head_lines = [
2662+
"<!DOCTYPE html>",
2663+
f'<html lang="{html_lang}">',
2664+
html_head,
2665+
]
25232666
html_texts: list[str] = []
25242667

25252668
prev_level = 0 # Track the previous item's level
@@ -2599,7 +2742,8 @@ def _prepare_tag_content(
25992742
section_level: int = min(item.level + 1, 6)
26002743

26012744
text = get_html_tag_with_text_direction(
2602-
html_tag=f"h{section_level}", text=_prepare_tag_content(item.text)
2745+
html_tag=f"h{section_level}",
2746+
text=_prepare_tag_content(item.text),
26032747
)
26042748
html_texts.append(text)
26052749

@@ -2856,13 +3000,19 @@ def _get_standalone_captions(document_body):
28563000
self.iterate_items(
28573001
self.body,
28583002
with_groups=True,
2859-
included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE},
3003+
included_content_layers={
3004+
ContentLayer.BODY,
3005+
ContentLayer.FURNITURE,
3006+
},
28603007
)
28613008
):
28623009
# Close lists if we've moved to a lower nesting level
28633010
if current_level < previous_level and ordered_list_stack:
28643011
ordered_list_stack = _close_lists(
2865-
current_level, previous_level, ordered_list_stack, output_parts
3012+
current_level,
3013+
previous_level,
3014+
ordered_list_stack,
3015+
output_parts,
28663016
)
28673017
previous_level = current_level
28683018

@@ -2970,7 +3120,10 @@ def _get_standalone_captions(document_body):
29703120
return "".join(output_parts)
29713121

29723122
def _export_to_indented_text(
2973-
self, indent=" ", max_text_len: int = -1, explicit_tables: bool = False
3123+
self,
3124+
indent=" ",
3125+
max_text_len: int = -1,
3126+
explicit_tables: bool = False,
29743127
):
29753128
"""Export the document to indented text to expose hierarchy."""
29763129
result = []

docling_core/types/doc/labels.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,29 @@ def __str__(self):
140140
return str(self.value)
141141

142142

143+
class GraphCellLabel(str, Enum):
    """Labels for the cells of a graph."""

    UNSPECIFIED = "unspecified"

    KEY = "key"
    VALUE = "value"

    CHECKBOX = "checkbox"

    def __str__(self):
        """Get string value."""
        # Same behavior as the other label enums in this module: str() gives
        # the plain value instead of "GraphCellLabel.KEY".
        return str(self.value)
152+
153+
154+
class GraphLinkLabel(str, Enum):
    """Labels for the links of a graph."""

    UNSPECIFIED = "unspecified"

    TO_VALUE = "to_value"
    TO_KEY = "to_key"

    TO_PARENT = "to_parent"
    TO_CHILD = "to_child"

    def __str__(self):
        """Get string value."""
        # Same behavior as the other label enums in this module: str() gives
        # the plain value instead of "GraphLinkLabel.TO_KEY".
        return str(self.value)
164+
165+
143166
class CodeLanguageLabel(str, Enum):
144167
"""CodeLanguageLabel."""
145168

0 commit comments

Comments
 (0)