diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 9d8dc332..29d64c5a 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -1188,6 +1188,7 @@ class TextItem(DocItem): DocItemLabel.REFERENCE, DocItemLabel.TEXT, DocItemLabel.EMPTY_VALUE, + DocItemLabel.FORM_KEY, ] orig: str # untreated representation @@ -1920,14 +1921,55 @@ def export_to_document_tokens( text = serializer.serialize(item=self).text return text +class CheckboxItem(ListItem): + """FormTextItem.""" + + label: typing.Literal[DocItemLabel.CHECKBOX] = DocItemLabel.CHECKBOX + checked: bool = False + +""" +class FormHeaderItem(SectionHeaderItem): + + label: typing.Literal[DocItemLabel.FORM_HEADER] = DocItemLabel.FORM_HEADER + +class FormTextItem(TextItem): + + label: typing.Literal[DocItemLabel.FORM_TEXT] = DocItemLabel.FORM_TEXT +""" + +class FormListItem(DocItem): + """FormListItem.""" + + label: typing.Literal[DocItemLabel.FORM_LISTITEM] = DocItemLabel.FORM_LISTITEM + + marker: Optional[TextItem] = None + + key: TextItem + + def add_value(self, item: Union[CheckboxItem, ListItem, TextItem]) -> NodeItem: + item.parent = self.get_ref() + self.children.append(item) + + return item + + + class FormItem(FloatingItem): """FormItem.""" label: typing.Literal[DocItemLabel.FORM] = DocItemLabel.FORM - graph: GraphData + def add(self, item: Union["FormItem", SectionHeaderItem, TextItem, FormListItem]) -> NodeItem: + item.parent = self.get_ref() + self.children.append(item.get_ref()) + + return item + def add_listitem(self, doc: DoclingDocument, prov: Optional[ProvenanceItem] = None) -> NodeItem: + li = FormListItem(self_ref=self.get_ref()) + return item + ContentItem = Annotated[ Union[ @@ -1940,6 +1982,7 @@ class FormItem(FloatingItem): PictureItem, TableItem, KeyValueItem, + FormItem, ], Field(discriminator="label"), ] @@ -2987,7 +3030,7 @@ def add_key_values( def add_form( self, - graph: GraphData, + form: Optional[FormItem] = None, prov: Optional[ProvenanceItem] = None, parent: Optional[NodeItem] = None, ): @@ -3003,11 +3046,12 @@ def add_form( form_index = len(self.form_items) cref = f"#/form_items/{form_index}" - form_item = FormItem( - graph=graph, - self_ref=cref, - parent=parent.get_ref(), - ) + if form is None: + form = FormItem( + self_ref=cref, + parent=parent.get_ref(), + ) + if prov: form_item.prov.append(prov) diff --git a/docling_core/types/doc/labels.py b/docling_core/types/doc/labels.py index e5884bcb..69454895 100644 --- a/docling_core/types/doc/labels.py +++ b/docling_core/types/doc/labels.py @@ -31,6 +31,11 @@ class DocItemLabel(str, Enum): HANDWRITTEN_TEXT = "handwritten_text" EMPTY_VALUE = "empty_value" # used for empty value fields in fillable forms + # FORM_HEADER = "form_header" + FORM_KEY = "form_key" + FORM_LISTITEM = "form_listitem" + CHECKBOX = "checkbox" + # Additional labels for markup-based formats (e.g. HTML, Word) PARAGRAPH = "paragraph" REFERENCE = "reference" diff --git a/test/data/docling_document/unit/FormItem.yaml b/test/data/docling_document/unit/FormItem.yaml index af7a61e1..219e951e 100644 --- a/test/data/docling_document/unit/FormItem.yaml +++ b/test/data/docling_document/unit/FormItem.yaml @@ -24,7 +24,7 @@ graph: source_cell_id: 1 target_cell_id: 0 image: null -label: form +label: key_value_region parent: null prov: [] references: [] diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index 50c263a9..de71fb1c 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -14,6 +14,7 @@ from docling_core.types.doc.base import BoundingBox, CoordOrigin, ImageRefMode, Size from docling_core.types.doc.document import ( # BoundingBox, CURRENT_VERSION, + CheckboxItem, CodeItem, ContentLayer, DocItem, @@ -42,6 +43,8 @@ TableItem, TextItem, TitleItem, + CheckboxItem, + FormListItem, ) from docling_core.types.doc.labels import ( DocItemLabel, @@ -491,6 +494,7 @@ def verify(dc, obj): elif dc is FormItem: + """ graph = GraphData( cells=[ GraphCell( @@ -524,7 +528,31 @@ def verify(dc, obj): self_ref="#", ) verify(dc, obj) + """ + + key_name = TextItem(text="name", orig="name", self_ref="#", label=DocItemLabel.FORM_KEY) + val_name = TextItem(text="John Doe", orig="name", self_ref="#", label=DocItemLabel.TEXT) + + form_item_name = FormListItem(key=key_name, self_ref="#") + form_item_name.add_value(val_name) + + key_age = TextItem(text="Age", orig="Age", self_ref="#", label=DocItemLabel.FORM_KEY) + + cb_age_0 = CheckboxItem(checked=True, text="0-20", orig="0-20", self_ref="#") + cb_age_1 = CheckboxItem(checked=False, text="20-40", orig="20-40", self_ref="#") + val_age = TextItem(text="other", orig="other", self_ref="#", label=DocItemLabel.TEXT) + + form_item_age = FormListItem(key=key_age, self_ref="#") #, value=[cb_age_0, cb_age_1, val_age]) + for _ in [cb_age_0, cb_age_1, val_age]: + form_item_age.add_value(_) + + form = FormItem(self_ref="#") + + form.add(form_item_name) + form.add(form_item_age) + verify(dc, obj) + elif dc is TitleItem: obj = dc( text="whatever", @@ -571,8 +599,12 @@ def verify(dc, obj): text="E=mc^2", ) verify(dc, obj) - elif dc is GraphData: # we skip this on purpose + elif dc is CheckboxItem: # we skip this on purpose + continue + elif dc is FormListItem: # we skip this on purpose continue + elif dc is GraphData: # we skip this on purpose + continue else: raise RuntimeError(f"New derived class detected {dc.__name__}") @@ -1002,8 +1034,10 @@ def _construct_doc() -> DoclingDocument: doc.add_key_values(graph=graph) - doc.add_form(graph=graph) + form_1 = doc.add_form(graph=graph) + form_1_item_1 = form_1.add_listitem(key="Name") + inline_fmt = doc.add_inline_group() doc.add_text( label=DocItemLabel.TEXT, text="Some formatting chops:", parent=inline_fmt