Skip to content

Commit 916323f

Browse files
feat: Redefine CodeItem as floating object with captions (#160)
* updated the CodeItem with captions

  Signed-off-by: Peter Staar <[email protected]>

* use FloatingItem and add captions to add_code interface

  Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Peter Staar <[email protected]>
Signed-off-by: Michele Dolfi <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
1 parent f751b45 commit 916323f

File tree

4 files changed

+96
-47
lines changed

4 files changed

+96
-47
lines changed

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from docling_core.transforms.chunker import BaseChunk, BaseChunker, BaseMeta
2020
from docling_core.types import DoclingDocument as DLDocument
2121
from docling_core.types.doc.document import (
22+
CodeItem,
2223
DocItem,
2324
DocumentOrigin,
2425
LevelNumber,
@@ -199,8 +200,10 @@ def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]:
199200
heading_by_level.pop(k, None)
200201
continue
201202

202-
if isinstance(item, TextItem) or (
203-
(not self.merge_list_items) and isinstance(item, ListItem)
203+
if (
204+
isinstance(item, TextItem)
205+
or ((not self.merge_list_items) and isinstance(item, ListItem))
206+
or isinstance(item, CodeItem)
204207
):
205208
text = item.text
206209
elif isinstance(item, TableItem):

docling_core/types/doc/document.py

Lines changed: 52 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -677,51 +677,6 @@ def export_to_document_tokens(
677677
return body
678678

679679

680-
class CodeItem(TextItem):
681-
"""CodeItem."""
682-
683-
label: typing.Literal[DocItemLabel.CODE] = (
684-
DocItemLabel.CODE # type: ignore[assignment]
685-
)
686-
code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
687-
688-
def export_to_document_tokens(
689-
self,
690-
doc: "DoclingDocument",
691-
new_line: str = "",
692-
xsize: int = 500,
693-
ysize: int = 500,
694-
add_location: bool = True,
695-
add_content: bool = True,
696-
):
697-
r"""Export text element to document tokens format.
698-
699-
:param doc: "DoclingDocument":
700-
:param new_line: str (Default value = "")
701-
:param xsize: int: (Default value = 500)
702-
:param ysize: int: (Default value = 500)
703-
:param add_location: bool: (Default value = True)
704-
:param add_content: bool: (Default value = True)
705-
706-
"""
707-
body = f"{DocumentToken.BEG_CODE.value}{new_line}"
708-
709-
if add_location:
710-
body += self.get_location_tokens(
711-
doc=doc,
712-
new_line=new_line,
713-
xsize=xsize,
714-
ysize=ysize,
715-
)
716-
717-
if add_content and self.text is not None:
718-
body += f"<_{self.code_language.value}_>{self.text}{new_line}"
719-
720-
body += f"{DocumentToken.END_CODE.value}\n"
721-
722-
return body
723-
724-
725680
class SectionHeaderItem(TextItem):
726681
"""SectionItem."""
727682

@@ -812,6 +767,53 @@ def get_image(self, doc: "DoclingDocument") -> Optional[PILImage.Image]:
812767
return super().get_image(doc=doc)
813768

814769

770+
class CodeItem(FloatingItem):
771+
"""CodeItem."""
772+
773+
label: typing.Literal[DocItemLabel.CODE] = (
774+
DocItemLabel.CODE # type: ignore[assignment]
775+
)
776+
orig: str # untreated representation
777+
text: str # sanitized representation
778+
code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
779+
780+
def export_to_document_tokens(
781+
self,
782+
doc: "DoclingDocument",
783+
new_line: str = "",
784+
xsize: int = 500,
785+
ysize: int = 500,
786+
add_location: bool = True,
787+
add_content: bool = True,
788+
):
789+
r"""Export text element to document tokens format.
790+
791+
:param doc: "DoclingDocument":
792+
:param new_line: str (Default value = "")
793+
:param xsize: int: (Default value = 500)
794+
:param ysize: int: (Default value = 500)
795+
:param add_location: bool: (Default value = True)
796+
:param add_content: bool: (Default value = True)
797+
798+
"""
799+
body = f"{DocumentToken.BEG_CODE.value}{new_line}"
800+
801+
if add_location:
802+
body += self.get_location_tokens(
803+
doc=doc,
804+
new_line=new_line,
805+
xsize=xsize,
806+
ysize=ysize,
807+
)
808+
809+
if add_content and self.text is not None:
810+
body += f"<_{self.code_language.value}_>{self.text}{new_line}"
811+
812+
body += f"{DocumentToken.END_CODE.value}\n"
813+
814+
return body
815+
816+
815817
class PictureItem(FloatingItem):
816818
"""PictureItem."""
817819

@@ -1763,6 +1765,7 @@ def add_code(
17631765
text: str,
17641766
code_language: Optional[CodeLanguageLabel] = None,
17651767
orig: Optional[str] = None,
1768+
caption: Optional[Union[TextItem, RefItem]] = None,
17661769
prov: Optional[ProvenanceItem] = None,
17671770
parent: Optional[NodeItem] = None,
17681771
content_layer: Optional[ContentLayer] = None,
@@ -1772,6 +1775,8 @@ def add_code(
17721775
:param text: str:
17731776
:param code_language: Optional[str]: (Default value = None)
17741777
:param orig: Optional[str]: (Default value = None)
1778+
:param caption: Optional[Union[TextItem, RefItem]]: (Default value = None)
17751780
:param prov: Optional[ProvenanceItem]: (Default value = None)
17761781
:param parent: Optional[NodeItem]: (Default value = None)
17771782
"""
@@ -1795,6 +1800,8 @@ def add_code(
17951800
code_item.content_layer = content_layer
17961801
if prov:
17971802
code_item.prov.append(prov)
1803+
if caption:
1804+
code_item.captions.append(caption.get_ref())
17981805

17991806
self.texts.append(code_item)
18001807
parent.children.append(RefItem(cref=cref))

docs/DoclingDocument.json

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,41 @@
208208
"title": "Prov",
209209
"type": "array"
210210
},
211+
"captions": {
212+
"default": [],
213+
"items": {
214+
"$ref": "#/$defs/RefItem"
215+
},
216+
"title": "Captions",
217+
"type": "array"
218+
},
219+
"references": {
220+
"default": [],
221+
"items": {
222+
"$ref": "#/$defs/RefItem"
223+
},
224+
"title": "References",
225+
"type": "array"
226+
},
227+
"footnotes": {
228+
"default": [],
229+
"items": {
230+
"$ref": "#/$defs/RefItem"
231+
},
232+
"title": "Footnotes",
233+
"type": "array"
234+
},
235+
"image": {
236+
"anyOf": [
237+
{
238+
"$ref": "#/$defs/ImageRef"
239+
},
240+
{
241+
"type": "null"
242+
}
243+
],
244+
"default": null
245+
},
211246
"orig": {
212247
"title": "Orig",
213248
"type": "string"

test/data/docling_document/unit/CodeItem.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
11
children: []
2+
captions: []
3+
footnotes: []
4+
references: []
5+
image: null
26
code_language: Python
37
content_layer: body
48
label: code

0 commit comments

Comments
 (0)