diff --git a/docling_core/transforms/serializer/base.py b/docling_core/transforms/serializer/base.py index dc4f2eee..0f1437e0 100644 --- a/docling_core/transforms/serializer/base.py +++ b/docling_core/transforms/serializer/base.py @@ -11,6 +11,7 @@ from pydantic import AnyUrl, BaseModel from docling_core.types.doc.document import ( + ChartItem, DocItem, DoclingDocument, FloatingItem, @@ -82,6 +83,22 @@ def serialize( ... +class BaseChartSerializer(ABC): + """Base class for chart item serializers.""" + + @abstractmethod + def serialize( + self, + *, + item: ChartItem, + doc_serializer: "BaseDocSerializer", + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + """Serializes the passed item.""" + ... + + class BasePictureSerializer(ABC): """Base class for picture item serializers.""" diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index f5d80af9..3a4a7346 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -16,6 +16,7 @@ from docling_core.transforms.serializer.base import ( BaseAnnotationSerializer, + BaseChartSerializer, BaseDocSerializer, BaseFallbackSerializer, BaseFormSerializer, @@ -30,6 +31,7 @@ ) from docling_core.types.doc.document import ( DOCUMENT_TOKENS_EXPORT_LABELS, + ChartItem, ContentLayer, DescriptionAnnotation, DocItem, @@ -207,6 +209,7 @@ class DocSerializer(BaseModel, BaseDocSerializer): text_serializer: BaseTextSerializer table_serializer: BaseTableSerializer + chart_serializer: BaseChartSerializer picture_serializer: BasePictureSerializer key_value_serializer: BaseKeyValueSerializer form_serializer: BaseFormSerializer @@ -362,6 +365,14 @@ def serialize( visited=my_visited, **my_kwargs, ) + elif isinstance(item, ChartItem): + part = self.chart_serializer.serialize( + item=item, + doc_serializer=self, + doc=self.doc, + visited=my_visited, + **my_kwargs, + ) elif isinstance(item, PictureItem): part = 
self.picture_serializer.serialize( item=item, diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py index 844d0096..3d8cc75a 100644 --- a/docling_core/transforms/serializer/doctags.py +++ b/docling_core/transforms/serializer/doctags.py @@ -8,6 +8,7 @@ from docling_core.transforms.serializer.base import ( BaseAnnotationSerializer, + BaseChartSerializer, BaseDocSerializer, BaseFallbackSerializer, BaseFormSerializer, @@ -27,6 +28,7 @@ ) from docling_core.types.doc.base import BoundingBox from docling_core.types.doc.document import ( + ChartItem, CodeItem, DocItem, DoclingDocument, @@ -207,6 +209,23 @@ def serialize( return create_ser_result(text=text_res, span_source=res_parts) +class DocTagsChartSerializer(BaseChartSerializer): + """DocTags-specific chart item serializer.""" + + @override + def serialize( + self, + *, + item: ChartItem, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + """Serializes the passed item.""" + # TODO add actual implementation + return create_ser_result() + + class DocTagsPictureSerializer(BasePictureSerializer): """DocTags-specific picture item serializer.""" @@ -539,6 +558,7 @@ class DocTagsDocSerializer(DocSerializer): text_serializer: BaseTextSerializer = DocTagsTextSerializer() table_serializer: BaseTableSerializer = DocTagsTableSerializer() + chart_serializer: BaseChartSerializer = DocTagsChartSerializer() picture_serializer: BasePictureSerializer = DocTagsPictureSerializer() key_value_serializer: BaseKeyValueSerializer = DocTagsKeyValueSerializer() form_serializer: BaseFormSerializer = DocTagsFormSerializer() diff --git a/docling_core/transforms/serializer/html.py b/docling_core/transforms/serializer/html.py index 98e5cf7d..bc8a43b1 100644 --- a/docling_core/transforms/serializer/html.py +++ b/docling_core/transforms/serializer/html.py @@ -22,6 +22,7 @@ from docling_core.transforms.serializer.base import ( 
BaseAnnotationSerializer, + BaseChartSerializer, BaseDocSerializer, BaseFallbackSerializer, BaseFormSerializer, @@ -46,6 +47,7 @@ from docling_core.transforms.visualizer.base import BaseVisualizer from docling_core.types.doc.base import ImageRefMode from docling_core.types.doc.document import ( + ChartItem, CodeItem, ContentLayer, DescriptionAnnotation, @@ -406,6 +408,23 @@ def serialize( return create_ser_result(text=text_res, span_source=res_parts) +class HTMLChartSerializer(BaseChartSerializer): + """HTML-specific chart item serializer.""" + + @override + def serialize( + self, + *, + item: ChartItem, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + """Export chart to HTML format.""" + # TODO add actual implementation + return create_ser_result() + + class HTMLPictureSerializer(BasePictureSerializer): """HTML-specific picture item serializer.""" @@ -850,6 +869,7 @@ class HTMLDocSerializer(DocSerializer): text_serializer: BaseTextSerializer = HTMLTextSerializer() table_serializer: BaseTableSerializer = HTMLTableSerializer() + chart_serializer: BaseChartSerializer = HTMLChartSerializer() picture_serializer: BasePictureSerializer = HTMLPictureSerializer() key_value_serializer: BaseKeyValueSerializer = HTMLKeyValueSerializer() form_serializer: BaseFormSerializer = HTMLFormSerializer() diff --git a/docling_core/transforms/serializer/markdown.py b/docling_core/transforms/serializer/markdown.py index d0908270..61ff7779 100644 --- a/docling_core/transforms/serializer/markdown.py +++ b/docling_core/transforms/serializer/markdown.py @@ -17,6 +17,7 @@ from docling_core.transforms.serializer.base import ( BaseAnnotationSerializer, + BaseChartSerializer, BaseDocSerializer, BaseFallbackSerializer, BaseFormSerializer, @@ -36,6 +37,7 @@ ) from docling_core.types.doc.base import ImageRefMode from docling_core.types.doc.document import ( + ChartItem, CodeItem, ContentLayer, DescriptionAnnotation, @@ -357,6 +359,62 @@ def 
serialize( return create_ser_result(text=text_res, span_source=res_parts) +class MarkdownChartSerializer(BaseChartSerializer): + """Markdown-specific chart item serializer.""" + + @override + def serialize( + self, + *, + item: ChartItem, + doc_serializer: BaseDocSerializer, + doc: DoclingDocument, + **kwargs: Any, + ) -> SerializationResult: + """Serializes the passed item.""" + params = MarkdownParams(**kwargs) + res_parts: list[SerializationResult] = [] + cap_res = doc_serializer.serialize_captions( + item=item, + **kwargs, + ) + + if cap_res.text: + res_parts.append(cap_res) + + if item.self_ref not in doc_serializer.get_excluded_refs(): + if params.include_annotations: + ann_res = doc_serializer.serialize_annotations( + item=item, + **kwargs, + ) + if ann_res.text: + res_parts.append(ann_res) + + res_parts.append(create_ser_result(text=item.data.title, span_source=item)) + res_parts.append(create_ser_result(text=item.data.kind, span_source=item)) + if item.data.is_categorical and item.data.categories: + categories = ", ".join(item.data.categories) + if categories: + res_parts.append( + create_ser_result( + text=f"Categories: {categories}", span_source=item + ) + ) + if item.data.series: + series_text = "" + for series_name, series_data in item.data.series: + series_text += f"- {series_name}: {series_data}\n" + if series_text: + res_parts.append( + create_ser_result(text=series_text.strip(), span_source=item) + ) + + text_res = "\n\n".join([r.text for r in res_parts]) + + return create_ser_result(text=text_res, span_source=res_parts) + + class MarkdownPictureSerializer(BasePictureSerializer): """Markdown-specific picture item serializer.""" @@ -621,6 +679,7 @@ class MarkdownDocSerializer(DocSerializer): text_serializer: BaseTextSerializer = MarkdownTextSerializer() table_serializer: BaseTableSerializer = MarkdownTableSerializer() + chart_serializer: BaseChartSerializer = MarkdownChartSerializer() picture_serializer: BasePictureSerializer = 
MarkdownPictureSerializer() key_value_serializer: BaseKeyValueSerializer = MarkdownKeyValueSerializer() form_serializer: BaseFormSerializer = MarkdownFormSerializer() diff --git a/docling_core/types/doc/__init__.py b/docling_core/types/doc/__init__.py index 25b5a869..93b04c4d 100644 --- a/docling_core/types/doc/__init__.py +++ b/docling_core/types/doc/__init__.py @@ -10,6 +10,8 @@ AnyTableCell, BaseAnnotation, ChartBar, + ChartData, + ChartItem, ChartLine, ChartPoint, ChartSlice, diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 45d8611b..6ddeb194 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -68,6 +68,7 @@ DocItemLabel.SECTION_HEADER, DocItemLabel.PARAGRAPH, DocItemLabel.TABLE, + DocItemLabel.CHART, DocItemLabel.PICTURE, DocItemLabel.FORMULA, DocItemLabel.CHECKBOX_UNSELECTED, @@ -709,6 +710,16 @@ class PictureTabularChartData(PictureChartData): ] +class ChartData(BaseModel): + """BaseChartData.""" + + title: str + kind: str + is_categorical: bool = False + series: List[Tuple[str, list]] + categories: Optional[List[str]] = None + + class DocumentOrigin(BaseModel): """FileSource.""" @@ -1826,6 +1837,48 @@ def get_annotations(self) -> Sequence[BaseAnnotation]: return self.annotations +class ChartItem(FloatingItem): + """ChartItem.""" + + data: ChartData + label: typing.Literal[ + DocItemLabel.DOCUMENT_INDEX, + DocItemLabel.CHART, + ] = DocItemLabel.CHART + + def export_to_markdown(self, doc: Optional["DoclingDocument"] = None) -> str: + """Export the chart as markdown.""" + if doc is not None: + from docling_core.transforms.serializer.markdown import ( + MarkdownDocSerializer, + ) + + serializer = MarkdownDocSerializer(doc=doc) + text = serializer.serialize(item=self).text + return text + return "Failed" + + def export_to_dataframe( + self, doc: Optional["DoclingDocument"] = None + ) -> pd.DataFrame: + """Export the chart as a Pandas DataFrame.""" + if doc is None: + 
_logger.warning( + "Usage of ChartItem.export_to_dataframe() without `doc` argument is deprecated." + ) + + if len(self.data.series) == 0: + return pd.DataFrame() + + data = {name: data for name, data in self.data.series} + + df = pd.DataFrame( + data=data, + ) + + return df + + class GraphCell(BaseModel): """GraphCell.""" @@ -1984,6 +2037,7 @@ class DoclingDocument(BaseModel): ] = [] pictures: List[PictureItem] = [] tables: List[TableItem] = [] + charts: List[ChartItem] = [] key_value_items: List[KeyValueItem] = [] form_items: List[FormItem] = [] @@ -2131,6 +2185,17 @@ def _append_item(self, *, item: NodeItem, parent_ref: RefItem) -> RefItem: self.tables.append(item) + elif isinstance(item, ChartItem): + item_label = "charts" + item_index = len(self.charts) + + cref = f"#/{item_label}/{item_index}" + + item.self_ref = cref + item.parent = parent_ref + + self.charts.append(item) + elif isinstance(item, PictureItem): item_label = "pictures" item_index = len(self.pictures) @@ -2726,6 +2791,40 @@ def add_table( return tbl_item + def add_chart( + self, + data: ChartData, + prov: Optional[ProvenanceItem] = None, + parent: Optional[NodeItem] = None, + label: DocItemLabel = DocItemLabel.CHART, + ): + """add_chart. 
+ + :param data: ChartData: + :param prov: Optional[ProvenanceItem]: (Default value = None) + :param parent: Optional[NodeItem]: (Default value = None) + :param label: DocItemLabel: (Default value = DocItemLabel.CHART) + """ + if not parent: + parent = self.body + + chart_index = len(self.charts) + cref = f"#/charts/{chart_index}" + + chart_item = ChartItem( + label=label, + data=data, + self_ref=cref, + parent=parent.get_ref(), + ) + if prov: + chart_item.prov.append(prov) + + self.charts.append(chart_item) + parent.children.append(RefItem(cref=cref)) + + return chart_item + def add_picture( self, annotations: Optional[List[PictureDataType]] = None, @@ -3730,6 +3829,39 @@ def insert_form( return form_item + def insert_chart( + self, + sibling: NodeItem, + data: ChartData, + prov: Optional[ProvenanceItem] = None, + label: DocItemLabel = DocItemLabel.CHART, + after: bool = True, + ) -> ChartItem: + """Creates a new ChartItem item and inserts it into the document. + + :param sibling: NodeItem: + :param data: ChartData: + :param prov: Optional[ProvenanceItem]: (Default value = None) + :param label: DocItemLabel: (Default value = DocItemLabel.CHART) + :param after: bool: (Default value = True) + :returns: ChartItem: The newly created ChartItem item. 
+ """ + stack, parent_ref = self._get_insertion_stack_and_parent(sibling=sibling) + + chart_item = ChartItem( + label=label, + data=data, + self_ref="#", + parent=parent_ref, + ) + + if prov: + chart_item.prov.append(prov) + + self._insert_in_structure(item=chart_item, stack=stack, after=after) + + return chart_item + # --------------------------- # Range Manipulation Methods # --------------------------- @@ -5598,6 +5730,7 @@ class _DocIndex(BaseModel): texts: list[TextItem] = [] pictures: list[PictureItem] = [] tables: list[TableItem] = [] + charts: list[ChartItem] = [] key_value_items: list[KeyValueItem] = [] form_items: list[FormItem] = [] @@ -5716,6 +5849,7 @@ def _update_from_index(self, doc_index: "_DocIndex") -> None: self.texts = doc_index.texts self.pictures = doc_index.pictures self.tables = doc_index.tables + self.charts = doc_index.charts self.key_value_items = doc_index.key_value_items self.form_items = doc_index.form_items self.pages = doc_index.pages diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 305f5a9b..8ce1d4c2 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -52,6 +52,159 @@ "title": "ChartBar", "type": "object" }, + "ChartData": { + "description": "BaseChartData.", + "properties": { + "title": { + "title": "Title", + "type": "string" + }, + "kind": { + "title": "Kind", + "type": "string" + }, + "is_categorical": { + "default": false, + "title": "Is Categorical", + "type": "boolean" + }, + "series": { + "items": { + "maxItems": 2, + "minItems": 2, + "prefixItems": [ + { + "type": "string" + }, + { + "items": {}, + "type": "array" + } + ], + "type": "array" + }, + "title": "Series", + "type": "array" + }, + "categories": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Categories" + } + }, + "required": [ + "title", + "kind", + "series" + ], + "title": "ChartData", + "type": "object" + }, + "ChartItem": { 
+ "additionalProperties": false, + "description": "ChartItem.", + "properties": { + "self_ref": { + "pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$", + "title": "Self Ref", + "type": "string" + }, + "parent": { + "anyOf": [ + { + "$ref": "#/$defs/RefItem" + }, + { + "type": "null" + } + ], + "default": null + }, + "children": { + "default": [], + "items": { + "$ref": "#/$defs/RefItem" + }, + "title": "Children", + "type": "array" + }, + "content_layer": { + "$ref": "#/$defs/ContentLayer", + "default": "body" + }, + "label": { + "default": "chart", + "enum": [ + "document_index", + "chart" + ], + "title": "Label", + "type": "string" + }, + "prov": { + "default": [], + "items": { + "$ref": "#/$defs/ProvenanceItem" + }, + "title": "Prov", + "type": "array" + }, + "captions": { + "default": [], + "items": { + "$ref": "#/$defs/RefItem" + }, + "title": "Captions", + "type": "array" + }, + "references": { + "default": [], + "items": { + "$ref": "#/$defs/RefItem" + }, + "title": "References", + "type": "array" + }, + "footnotes": { + "default": [], + "items": { + "$ref": "#/$defs/RefItem" + }, + "title": "Footnotes", + "type": "array" + }, + "image": { + "anyOf": [ + { + "$ref": "#/$defs/ImageRef" + }, + { + "type": "null" + } + ], + "default": null + }, + "data": { + "$ref": "#/$defs/ChartData" + } + }, + "required": [ + "self_ref", + "data" + ], + "title": "ChartItem", + "type": "object" + }, "ChartLine": { "description": "Represents a line in a line chart.\n\nAttributes:\n label (str): The label for the line.\n values (List[Tuple[float, float]]): A list of (x, y) coordinate pairs\n representing the line's data points.", "properties": { @@ -2459,6 +2612,14 @@ "title": "Tables", "type": "array" }, + "charts": { + "default": [], + "items": { + "$ref": "#/$defs/ChartItem" + }, + "title": "Charts", + "type": "array" + }, "key_value_items": { "default": [], "items": { diff --git a/examples/2408.09869v3.json b/examples/2408.09869v3.json index 6dac6986..9248a396 100644 --- 
a/examples/2408.09869v3.json +++ b/examples/2408.09869v3.json @@ -44968,6 +44968,7 @@ } } ], + "charts": [], "key_value_items": [], "form_items": [], "pages": { diff --git a/test/data/doc/2206.01062.yaml.dt.json b/test/data/doc/2206.01062.yaml.dt.json index f954386b..64bad9c8 100644 --- a/test/data/doc/2206.01062.yaml.dt.json +++ b/test/data/doc/2206.01062.yaml.dt.json @@ -16157,6 +16157,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [], "form_items": [], "pages": { diff --git a/test/data/doc/2408.09869v3_enriched.out.dt.json b/test/data/doc/2408.09869v3_enriched.out.dt.json index 9127073f..5629ef52 100644 --- a/test/data/doc/2408.09869v3_enriched.out.dt.json +++ b/test/data/doc/2408.09869v3_enriched.out.dt.json @@ -15896,6 +15896,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [], "form_items": [], "pages": { diff --git a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json index 7bbddf7b..eae4d717 100644 --- a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json +++ b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json @@ -3152,6 +3152,7 @@ ] } ], + "charts": [], "key_value_items": [], "form_items": [], "pages": { diff --git a/test/data/doc/concatenated.json b/test/data/doc/concatenated.json index 47fe4990..8d4959c2 100644 --- a/test/data/doc/concatenated.json +++ b/test/data/doc/concatenated.json @@ -15051,6 +15051,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/constructed_doc.added_extracted_doc.json.gt b/test/data/doc/constructed_doc.added_extracted_doc.json.gt index 4013747b..1ee1742c 100644 --- a/test/data/doc/constructed_doc.added_extracted_doc.json.gt +++ b/test/data/doc/constructed_doc.added_extracted_doc.json.gt @@ -2059,6 +2059,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git 
a/test/data/doc/constructed_doc.appended_child.json.gt b/test/data/doc/constructed_doc.appended_child.json.gt index 74b6fba7..3b419efa 100644 --- a/test/data/doc/constructed_doc.appended_child.json.gt +++ b/test/data/doc/constructed_doc.appended_child.json.gt @@ -1301,6 +1301,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/constructed_doc.bulk_item_addition.json.gt b/test/data/doc/constructed_doc.bulk_item_addition.json.gt index 257c5b90..2aeba117 100644 --- a/test/data/doc/constructed_doc.bulk_item_addition.json.gt +++ b/test/data/doc/constructed_doc.bulk_item_addition.json.gt @@ -1948,6 +1948,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/constructed_doc.bulk_item_insertion.json.gt b/test/data/doc/constructed_doc.bulk_item_insertion.json.gt index ce4f7c6d..1da8270c 100644 --- a/test/data/doc/constructed_doc.bulk_item_insertion.json.gt +++ b/test/data/doc/constructed_doc.bulk_item_insertion.json.gt @@ -1978,6 +1978,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/constructed_doc.deleted_group.json.gt b/test/data/doc/constructed_doc.deleted_group.json.gt index 549ae6a0..db7ddce8 100644 --- a/test/data/doc/constructed_doc.deleted_group.json.gt +++ b/test/data/doc/constructed_doc.deleted_group.json.gt @@ -1299,6 +1299,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/constructed_doc.deleted_items_range.json.gt b/test/data/doc/constructed_doc.deleted_items_range.json.gt index 91b37357..22914c91 100644 --- a/test/data/doc/constructed_doc.deleted_items_range.json.gt +++ b/test/data/doc/constructed_doc.deleted_items_range.json.gt @@ -1301,6 +1301,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff 
--git a/test/data/doc/constructed_doc.deleted_picture.json.gt b/test/data/doc/constructed_doc.deleted_picture.json.gt index 85890f23..e3e4e9c8 100644 --- a/test/data/doc/constructed_doc.deleted_picture.json.gt +++ b/test/data/doc/constructed_doc.deleted_picture.json.gt @@ -1269,6 +1269,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/constructed_doc.deleted_text.json.gt b/test/data/doc/constructed_doc.deleted_text.json.gt index 45c03c2a..6d0a7f63 100644 --- a/test/data/doc/constructed_doc.deleted_text.json.gt +++ b/test/data/doc/constructed_doc.deleted_text.json.gt @@ -1541,6 +1541,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/constructed_doc.embedded.json.gt b/test/data/doc/constructed_doc.embedded.json.gt index 4ac0e019..bffad9b0 100644 --- a/test/data/doc/constructed_doc.embedded.json.gt +++ b/test/data/doc/constructed_doc.embedded.json.gt @@ -1524,6 +1524,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/constructed_doc.embedded.yaml.gt b/test/data/doc/constructed_doc.embedded.yaml.gt index 15d93ce3..3cabdc12 100644 --- a/test/data/doc/constructed_doc.embedded.yaml.gt +++ b/test/data/doc/constructed_doc.embedded.yaml.gt @@ -27,6 +27,7 @@ body: label: unspecified name: _root_ self_ref: '#/body' +charts: [] form_items: - captions: [] children: [] diff --git a/test/data/doc/constructed_doc.extracted_with_deletion.json.gt b/test/data/doc/constructed_doc.extracted_with_deletion.json.gt index fc7a3b94..c32b5cfc 100644 --- a/test/data/doc/constructed_doc.extracted_with_deletion.json.gt +++ b/test/data/doc/constructed_doc.extracted_with_deletion.json.gt @@ -1484,6 +1484,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git 
a/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt b/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt index a31af507..ef0272f7 100644 --- a/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt +++ b/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt @@ -2633,6 +2633,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt b/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt index 2722426c..7474b789 100644 --- a/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt +++ b/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt @@ -1821,6 +1821,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt b/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt index 42044db6..16521dcf 100644 --- a/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt +++ b/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt @@ -1917,6 +1917,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/constructed_doc.inserted_text.json.gt b/test/data/doc/constructed_doc.inserted_text.json.gt index 6c4285f4..9e6a1206 100644 --- a/test/data/doc/constructed_doc.inserted_text.json.gt +++ b/test/data/doc/constructed_doc.inserted_text.json.gt @@ -1558,6 +1558,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/constructed_doc.manipulated_table.json.gt b/test/data/doc/constructed_doc.manipulated_table.json.gt index e65dd7d8..c201c5e2 100644 --- a/test/data/doc/constructed_doc.manipulated_table.json.gt +++ b/test/data/doc/constructed_doc.manipulated_table.json.gt @@ 
-2058,6 +2058,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/constructed_doc.referenced.json.gt b/test/data/doc/constructed_doc.referenced.json.gt index 8a11418f..ee265d52 100644 --- a/test/data/doc/constructed_doc.referenced.json.gt +++ b/test/data/doc/constructed_doc.referenced.json.gt @@ -1524,6 +1524,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/constructed_doc.referenced.yaml.gt b/test/data/doc/constructed_doc.referenced.yaml.gt index bb291c11..9ab1b08a 100644 --- a/test/data/doc/constructed_doc.referenced.yaml.gt +++ b/test/data/doc/constructed_doc.referenced.yaml.gt @@ -27,6 +27,7 @@ body: label: unspecified name: _root_ self_ref: '#/body' +charts: [] form_items: - captions: [] children: [] diff --git a/test/data/doc/constructed_doc.replaced_item.json.gt b/test/data/doc/constructed_doc.replaced_item.json.gt index 91b37357..22914c91 100644 --- a/test/data/doc/constructed_doc.replaced_item.json.gt +++ b/test/data/doc/constructed_doc.replaced_item.json.gt @@ -1301,6 +1301,7 @@ "annotations": [] } ], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/doc_with_kv.dt.json b/test/data/doc/doc_with_kv.dt.json index 43bc4bb9..8c9c01fd 100644 --- a/test/data/doc/doc_with_kv.dt.json +++ b/test/data/doc/doc_with_kv.dt.json @@ -24,6 +24,7 @@ "texts": [], "pictures": [], "tables": [], + "charts": [], "key_value_items": [ { "self_ref": "#/key_value_items/0", diff --git a/test/data/doc/dummy_doc_2_prec.yaml b/test/data/doc/dummy_doc_2_prec.yaml index 60cca33f..3d522365 100644 --- a/test/data/doc/dummy_doc_2_prec.yaml +++ b/test/data/doc/dummy_doc_2_prec.yaml @@ -8,6 +8,7 @@ body: label: unspecified name: _root_ self_ref: '#/body' +charts: [] form_items: [] furniture: children: diff --git a/test/data/doc/misplaced_list_items.norm.out.yaml 
b/test/data/doc/misplaced_list_items.norm.out.yaml index 1b33dd76..229d7441 100644 --- a/test/data/doc/misplaced_list_items.norm.out.yaml +++ b/test/data/doc/misplaced_list_items.norm.out.yaml @@ -7,6 +7,7 @@ body: label: unspecified name: _root_ self_ref: '#/body' +charts: [] form_items: [] furniture: children: [] diff --git a/test/data/doc/misplaced_list_items.out.yaml b/test/data/doc/misplaced_list_items.out.yaml index c334227e..0602d693 100644 --- a/test/data/doc/misplaced_list_items.out.yaml +++ b/test/data/doc/misplaced_list_items.out.yaml @@ -7,6 +7,7 @@ body: label: unspecified name: _root_ self_ref: '#/body' +charts: [] form_items: [] furniture: children: [] diff --git a/test/data/doc/page_with_pic.dt.json b/test/data/doc/page_with_pic.dt.json index 3d2239f7..3193fecd 100644 --- a/test/data/doc/page_with_pic.dt.json +++ b/test/data/doc/page_with_pic.dt.json @@ -530,6 +530,7 @@ } ], "tables": [], + "charts": [], "key_value_items": [], "form_items": [], "pages": { diff --git a/test/data/doc/page_with_pic_from_files.dt.json b/test/data/doc/page_with_pic_from_files.dt.json index 3d2239f7..3193fecd 100644 --- a/test/data/doc/page_with_pic_from_files.dt.json +++ b/test/data/doc/page_with_pic_from_files.dt.json @@ -530,6 +530,7 @@ } ], "tables": [], + "charts": [], "key_value_items": [], "form_items": [], "pages": { diff --git a/test/data/doc/page_without_pic.dt.json b/test/data/doc/page_without_pic.dt.json index 10cd83b9..104f465b 100644 --- a/test/data/doc/page_without_pic.dt.json +++ b/test/data/doc/page_without_pic.dt.json @@ -431,6 +431,7 @@ } ], "tables": [], + "charts": [], "key_value_items": [], "form_items": [], "pages": { diff --git a/test/data/doc/rich_table.out.yaml b/test/data/doc/rich_table.out.yaml index c5f8eecc..c0188a3a 100644 --- a/test/data/doc/rich_table.out.yaml +++ b/test/data/doc/rich_table.out.yaml @@ -6,6 +6,7 @@ body: label: unspecified name: _root_ self_ref: '#/body' +charts: [] form_items: [] furniture: children: [] diff --git 
a/test/data/doc/rich_table_item_ins_norm_1.out.yaml b/test/data/doc/rich_table_item_ins_norm_1.out.yaml index fecd739d..fe0908f9 100644 --- a/test/data/doc/rich_table_item_ins_norm_1.out.yaml +++ b/test/data/doc/rich_table_item_ins_norm_1.out.yaml @@ -6,6 +6,7 @@ body: label: unspecified name: _root_ self_ref: '#/body' +charts: [] form_items: [] furniture: children: [] diff --git a/test/data/doc/rich_table_item_ins_norm_2.out.yaml b/test/data/doc/rich_table_item_ins_norm_2.out.yaml index f2e05e5b..635b7bae 100644 --- a/test/data/doc/rich_table_item_ins_norm_2.out.yaml +++ b/test/data/doc/rich_table_item_ins_norm_2.out.yaml @@ -7,6 +7,7 @@ body: label: unspecified name: _root_ self_ref: '#/body' +charts: [] form_items: [] furniture: children: [] diff --git a/test/data/doc/rich_table_item_ins_norm_3.out.yaml b/test/data/doc/rich_table_item_ins_norm_3.out.yaml index b35564ff..eb8e5877 100644 --- a/test/data/doc/rich_table_item_ins_norm_3.out.yaml +++ b/test/data/doc/rich_table_item_ins_norm_3.out.yaml @@ -7,6 +7,7 @@ body: label: unspecified name: _root_ self_ref: '#/body' +charts: [] form_items: [] furniture: children: [] diff --git a/test/data/doc/rich_table_post_text_del.out.yaml b/test/data/doc/rich_table_post_text_del.out.yaml index 67e086d5..41aa7dde 100644 --- a/test/data/doc/rich_table_post_text_del.out.yaml +++ b/test/data/doc/rich_table_post_text_del.out.yaml @@ -5,6 +5,7 @@ body: label: unspecified name: _root_ self_ref: '#/body' +charts: [] form_items: [] furniture: children: [] diff --git a/test/data/docling_document/unit/ChartItem.yaml b/test/data/docling_document/unit/ChartItem.yaml new file mode 100644 index 00000000..5240b723 --- /dev/null +++ b/test/data/docling_document/unit/ChartItem.yaml @@ -0,0 +1,27 @@ +captions: [] +children: [] +content_layer: body +data: + categories: + - A + - B + - C + is_categorical: true + kind: COLUMN_CLUSTERED + series: + - - Series 1 + - - 10 + - 20 + - 30 + - - Series 2 + - - 15 + - 25 + - 35 + title: My Chart 
+footnotes: [] +label: chart +parent: null +prov: [] +references: [] +self_ref: '#' +image: null diff --git a/test/data/legacy_doc/doc-export.docling.yaml.gt b/test/data/legacy_doc/doc-export.docling.yaml.gt index 4fc4a7fa..251f996b 100644 --- a/test/data/legacy_doc/doc-export.docling.yaml.gt +++ b/test/data/legacy_doc/doc-export.docling.yaml.gt @@ -128,6 +128,7 @@ body: label: unspecified name: _root_ self_ref: '#/body' +charts: [] form_items: [] furniture: children: [] diff --git a/test/data/viz/2408.09869v3_enriched.dt_viz_p2.png b/test/data/viz/2408.09869v3_enriched.dt_viz_p2.png index 091fea2f..475bc90c 100644 Binary files a/test/data/viz/2408.09869v3_enriched.dt_viz_p2.png and b/test/data/viz/2408.09869v3_enriched.dt_viz_p2.png differ diff --git a/test/data/viz/2408.09869v3_enriched_viz_p1.png b/test/data/viz/2408.09869v3_enriched_viz_p1.png index 8b4c8813..69304379 100644 Binary files a/test/data/viz/2408.09869v3_enriched_viz_p1.png and b/test/data/viz/2408.09869v3_enriched_viz_p1.png differ diff --git a/test/data/viz/2408.09869v3_enriched_viz_p2.png b/test/data/viz/2408.09869v3_enriched_viz_p2.png index c4a38b65..cf80cba5 100644 Binary files a/test/data/viz/2408.09869v3_enriched_viz_p2.png and b/test/data/viz/2408.09869v3_enriched_viz_p2.png differ diff --git a/test/data/viz/2408.09869v3_enriched_viz_p3.png b/test/data/viz/2408.09869v3_enriched_viz_p3.png index 4a27af2a..4c1d1671 100644 Binary files a/test/data/viz/2408.09869v3_enriched_viz_p3.png and b/test/data/viz/2408.09869v3_enriched_viz_p3.png differ diff --git a/test/data/viz/cross_page_lists_p1.png b/test/data/viz/cross_page_lists_p1.png index fdb0fe76..47a15ad6 100644 Binary files a/test/data/viz/cross_page_lists_p1.png and b/test/data/viz/cross_page_lists_p1.png differ diff --git a/test/data/viz/cross_page_lists_p2.png b/test/data/viz/cross_page_lists_p2.png index 18079fde..ec9efd4b 100644 Binary files a/test/data/viz/cross_page_lists_p2.png and b/test/data/viz/cross_page_lists_p2.png differ diff 
--git a/test/test_docling_doc.py b/test/test_docling_doc.py index d3e239dd..c228c82a 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -14,6 +14,8 @@ from docling_core.types.doc.base import BoundingBox, CoordOrigin, ImageRefMode, Size from docling_core.types.doc.document import ( # BoundingBox, CURRENT_VERSION, + ChartData, + ChartItem, CodeItem, ContentLayer, DocItem, @@ -573,6 +575,18 @@ def verify(dc, obj): verify(dc, obj) elif dc is GraphData: # we skip this on purpose continue + elif dc is ChartItem: + obj = dc( + self_ref="#", + data=ChartData( + title="My Chart", + categories=["A", "B", "C"], + series=[("Series 1", [10, 20, 30]), ("Series 2", [15, 25, 35])], + is_categorical=True, + kind="COLUMN_CLUSTERED", + ), + ) + verify(dc, obj) else: raise RuntimeError(f"New derived class detected {dc.__name__}")