diff --git a/docling_core/experimental/serializer/__init__.py b/docling_core/experimental/serializer/__init__.py
new file mode 100644
index 00000000..5c450a0e
--- /dev/null
+++ b/docling_core/experimental/serializer/__init__.py
@@ -0,0 +1,5 @@
+"""Experimental serializers for docling-core.
+
+This package contains experimental serialization utilities (e.g., Markdown
+summaries) that may change without notice.
+"""
diff --git a/docling_core/experimental/serializer/markdown_summary.py b/docling_core/experimental/serializer/markdown_summary.py
new file mode 100644
index 00000000..3db6bebe
--- /dev/null
+++ b/docling_core/experimental/serializer/markdown_summary.py
@@ -0,0 +1,302 @@
+"""Markdown document summary serializers (outline and TOC).
+
+This module provides a Markdown-focused serializer that emits a compact
+document outline or a table of contents derived from a Docling document.
+"""
+
+from enum import Enum
+from typing import Any, Optional
+
+from typing_extensions import override
+
+from docling_core.transforms.serializer.base import SerializationResult
+from docling_core.transforms.serializer.common import create_ser_result
+from docling_core.transforms.serializer.markdown import (
+    MarkdownDocSerializer,
+    MarkdownParams,
+)
+from docling_core.types.doc import (
+    CodeItem,
+    DocItem,
+    DocItemLabel,
+    FormItem,
+    GroupItem,
+    ListGroup,
+    ListItem,
+    NodeItem,
+    PictureItem,
+    SectionHeaderItem,
+    TableItem,
+    TextItem,
+    TitleItem,
+)
+
+
+class MarkdownSummaryMode(str, Enum):
+    """Display mode for document summary output."""
+
+    OUTLINE = "outline"
+    TABLE_OF_CONTENTS = "table_of_contents"
+
+
+class MarkdownSummaryParams(MarkdownParams):
+    """Markdown-specific serialization parameters for outline.
+
+    Inherits MarkdownParams to retain Markdown behaviors (escaping, links, etc.).
+    """
+
+    # In TABLE_OF_CONTENTS mode only items labelled in ``toc_labels`` are kept.
+    mode: MarkdownSummaryMode = MarkdownSummaryMode.OUTLINE
+
+    # Keep the serialized Markdown heading text instead of a label prefix.
+    use_markdown_headers: bool = False
+
+    # Number non-table/picture items per label (tables/pictures always are).
+    add_label_counter: bool = False
+    # Append ``reference=<self_ref>`` to each emitted line.
+    add_references: bool = True
+    # Append ``(summary=...)`` when a node carries a string summary.
+    add_summary: bool = True
+
+    # Labels kept in TABLE_OF_CONTENTS mode.
+    toc_labels: list[DocItemLabel] = [DocItemLabel.TITLE, DocItemLabel.SECTION_HEADER]
+
+
+class MarkdownSummarySerializer(MarkdownDocSerializer):
+    """Markdown-specific document summary serializer.
+
+    Inherits MarkdownDocSerializer to reuse Markdown formatting/post-processing
+    and sub-serializers; overrides only the parts selection logic.
+    """
+
+    params: MarkdownSummaryParams = MarkdownSummaryParams()
+
+    @override
+    def get_parts(
+        self,
+        item: Optional[NodeItem] = None,
+        **kwargs: Any,
+    ) -> list[SerializationResult]:
+        """Return one part per outline line of the document (or subtree)."""
+        return self._create_document_outline(root=item, **kwargs)
+
+    # -------------------------
+    # Helper methods (internal)
+    # -------------------------
+
+    def _next_idx(
+        self, *, lbl: DocItemLabel, label_counter: dict[DocItemLabel, int]
+    ) -> int:
+        """Increment and return the 1-based counter kept for ``lbl``."""
+        label_counter[lbl] = label_counter.get(lbl, 0) + 1
+        return label_counter[lbl]
+
+    def _include_label(
+        self, *, params: MarkdownSummaryParams, lbl: DocItemLabel
+    ) -> bool:
+        """Return True if label should be included (esp. for TOC mode)."""
+        return (
+            params.mode != MarkdownSummaryMode.TABLE_OF_CONTENTS
+            or lbl in params.toc_labels
+        )
+
+    def _is_node_excluded(
+        self,
+        *,
+        node: NodeItem,
+        excluded: set[str],
+        params: MarkdownSummaryParams,
+    ) -> bool:
+        """Centralize exclusion logic applied to nodes in the outline."""
+        if isinstance(node, DocItem):
+            if node.self_ref in excluded:
+                return True
+            if (
+                isinstance(node, TextItem)
+                and node.self_ref in self._captions_of_some_item
+            ):
+                return True
+            if not self._include_label(params=params, lbl=node.label):
+                return True
+        return False
+
+    def _compose_node_label(
+        self,
+        *,
+        node: NodeItem,
+        params: MarkdownSummaryParams,
+        label_counter: dict[DocItemLabel, int],
+    ) -> str:
+        """Compute the textual label for a node (without refs).
+
+        - When ``add_label_counter`` is True, add counters for non-table/picture
+          DocItems.
+        - Tables/pictures are numbered separately when building the final line.
+        - For groups, expose the raw normalized label but do not emit a line.
+        """
+        node_label = ""
+        if (
+            params.add_label_counter
+            and isinstance(node, DocItem)
+            and not isinstance(node, (TableItem, PictureItem))
+        ):
+            base = str(node.label).replace("_", "-")
+            lbl_cnt = self._next_idx(lbl=node.label, label_counter=label_counter)
+            node_label = f"{base} {lbl_cnt}"
+        elif isinstance(node, (DocItem, GroupItem)):
+            node_label = str(node.label).replace("_", "-")
+        return node_label
+
+    def _ref_part(self, *, node: NodeItem, params: MarkdownSummaryParams) -> str:
+        """Return the ``(reference=...)`` suffix, or "" when disabled."""
+        return f" (reference={node.self_ref})" if params.add_references else ""
+
+    def _strip_md_header_prefix(self, text: str) -> str:
+        """Drop leading whitespace and Markdown ``#`` markers from ``text``."""
+        stripped = text.lstrip()
+        while stripped.startswith("#"):
+            stripped = stripped.lstrip("#").lstrip()
+        return stripped
+
+    def _line_for_title(
+        self,
+        *,
+        node: TitleItem,
+        params: MarkdownSummaryParams,
+        node_label: str,
+        ref_part: str,
+    ) -> str:
+        """Build the outline line for the document title."""
+        raw_text = self.text_serializer.serialize(
+            item=node, doc_serializer=self, doc=self.doc
+        ).text
+        if params.use_markdown_headers:
+            return f"{raw_text.lstrip()}{ref_part}"
+        # Strip the Markdown marker with the helper shared with section
+        # headers; it also copes with leading whitespace before ``#``.
+        text = self._strip_md_header_prefix(raw_text)
+        # ``ref_part`` is passed through verbatim only when references are on.
+        suffix = ref_part if params.add_references else ""
+        return f"{node_label}{suffix}: {text}"
+
+    def _line_for_section_header(
+        self,
+        *,
+        node: SectionHeaderItem,
+        params: MarkdownSummaryParams,
+        node_label: str,
+    ) -> str:
+        """Build the outline line for a section header, including its level."""
+        raw_text = self.text_serializer.serialize(
+            item=node, doc_serializer=self, doc=self.doc
+        ).text
+        details = f"level={node.level}"
+        if params.add_references:
+            details = f"{details}, reference={node.self_ref}"
+        if params.use_markdown_headers:
+            return f"{raw_text.lstrip()} ({details})"
+        stripped = self._strip_md_header_prefix(raw_text)
+        return f"{node_label} ({details}): {stripped}"
+
+    def _line_for_simple_label(self, *, node_label: str, ref_part: str) -> str:
+        """Build the outline line for items shown by label only."""
+        return f"{node_label}{ref_part}"
+
+    def _line_for_table(
+        self, *, node_label: str, ref_part: str, label_counter: dict[DocItemLabel, int]
+    ) -> str:
+        """Build the outline line for a table, numbering tables from 1."""
+        lbl_cnt = self._next_idx(lbl=DocItemLabel.TABLE, label_counter=label_counter)
+        return f"{node_label} {lbl_cnt}{ref_part}"
+
+    def _line_for_picture(
+        self, *, node_label: str, ref_part: str, label_counter: dict[DocItemLabel, int]
+    ) -> str:
+        """Build the outline line for a picture, numbering pictures from 1."""
+        lbl_cnt = self._next_idx(lbl=DocItemLabel.PICTURE, label_counter=label_counter)
+        return f"{node_label} {lbl_cnt}{ref_part}"
+
+    def _get_summary(self, *, node: NodeItem, params: MarkdownSummaryParams) -> str:
+        """Return the node's string summary, or "" when absent or disabled."""
+        if not params.add_summary:
+            return ""
+        # NOTE(review): not every node type is known to declare ``summary``
+        # (outline.py reads it from ``meta``), so tolerate a missing attribute.
+        summary = getattr(node, "summary", None)
+        return summary if isinstance(summary, str) else ""
+
+    def _create_document_outline(
+        self,
+        *,
+        root: Optional[NodeItem] = None,
+        **kwargs: Any,
+    ) -> list[SerializationResult]:
+        """Create an outline, respecting params and recursive traversal."""
+        params = self.params.merge_with_patch(patch=kwargs)
+        excluded = self.get_excluded_refs(**kwargs)
+
+        label_counter: dict[DocItemLabel, int] = {}
+        visited: set[str] = set()
+        result: list[SerializationResult] = []
+
+        for node, _level in self.doc.iterate_items(root=root, with_groups=True):
+            if node.self_ref in visited:
+                continue
+            visited.add(node.self_ref)
+
+            # Skip list items in outline
+            if isinstance(node, ListItem):
+                continue
+
+            # Respect exclusion logic
+            if self._is_node_excluded(node=node, excluded=excluded, params=params):
+                continue
+
+            summary = self._get_summary(node=node, params=params)
+            node_label = self._compose_node_label(
+                node=node, params=params, label_counter=label_counter
+            )
+            ref_part = self._ref_part(node=node, params=params)
+
+            line = ""
+            if isinstance(node, TitleItem):
+                line = self._line_for_title(
+                    node=node, params=params, node_label=node_label, ref_part=ref_part
+                )
+            elif isinstance(node, SectionHeaderItem):
+                line = self._line_for_section_header(
+                    node=node, params=params, node_label=node_label
+                )
+            elif isinstance(node, ListGroup):
+                line = ""  # intentionally skip
+            elif isinstance(node, (TextItem, FormItem, CodeItem)):
+                line = self._line_for_simple_label(
+                    node_label=node_label, ref_part=ref_part
+                )
+            elif isinstance(node, TableItem):
+                line = self._line_for_table(
+                    node_label=node_label,
+                    ref_part=ref_part,
+                    label_counter=label_counter,
+                )
+            elif isinstance(node, PictureItem):
+                line = self._line_for_picture(
+                    node_label=node_label,
+                    ref_part=ref_part,
+                    label_counter=label_counter,
+                )
+
+            # Nodes that produce no line are dropped entirely, summary included.
+            if not line:
+                continue
+            if summary:
+                line = f"{line} (summary={summary})"
+
+            result.append(
+                create_ser_result(
+                    text=line,
+                    span_source=node if isinstance(node, DocItem) else [],
+                )
+            )
+
+        return result
diff --git a/docling_core/experimental/serializer/outline.py b/docling_core/experimental/serializer/outline.py
new file mode 100644
index 00000000..ed4eaa39
--- /dev/null
+++ b/docling_core/experimental/serializer/outline.py
@@ -0,0 +1,282 @@
+"""Markdown outline serializer built from outline-specific item serializers.
+
+This module provides a Markdown-focused serializer that emits a compact
+document outline or a table of contents derived from a Docling document.
+"""
+
+from enum import Enum
+from typing import Any, Optional
+
+from typing_extensions import override
+
+from docling_core.transforms.serializer.base import (
+    BaseDocSerializer,
+    BaseFallbackSerializer,
+    BaseFormSerializer,
+    BaseInlineSerializer,
+    BaseKeyValueSerializer,
+    BaseListSerializer,
+    BaseMetaSerializer,
+    BasePictureSerializer,
+    BaseTableSerializer,
+    BaseTextSerializer,
+    SerializationResult,
+)
+from docling_core.transforms.serializer.common import create_ser_result
+from docling_core.transforms.serializer.markdown import (
+    MarkdownDocSerializer,
+    MarkdownMetaSerializer,
+    MarkdownParams,
+    MarkdownTextSerializer,
+)
+from docling_core.types.doc import (
+    BaseMeta,
+    DocItem,
+    DoclingDocument,
+    FormItem,
+    InlineGroup,
+    KeyValueItem,
+    ListGroup,
+    MetaFieldName,
+    NodeItem,
+    PictureItem,
+    SectionHeaderItem,
+    SummaryMetaField,
+    TableItem,
+    TextItem,
+    TitleItem,
+)
+
+
+def _default_outline_node(item: NodeItem) -> str:
+    """Return the ``[reference=...]`` marker identifying ``item``."""
+    return f"[reference={item.self_ref}]"
+
+
+def _default_summary(summary: str) -> str:
+    """Return the ``(summary=...)`` annotation for ``summary``."""
+    return f"(summary={summary})"
+
+
+class OutlineMode(str, Enum):
+    """Display mode for document summary output."""
+
+    OUTLINE = "outline"
+    TABLE_OF_CONTENTS = "table_of_contents"
+
+
+class OutlineParams(MarkdownParams):
+    """Markdown-specific serialization parameters for outline.
+
+    Inherits MarkdownParams to retain Markdown behaviors (escaping, links, etc.).
+    """
+
+    mode: OutlineMode = OutlineMode.OUTLINE
+
+
+class _OutlineTextSerializer(BaseTextSerializer):
+    """_Outline class for text item serializers."""
+
+    def serialize(
+        self,
+        *,
+        item: TextItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize a text item as heading text, reference and summary.
+
+        Only titles and section headers contribute their Markdown text; every
+        other text item is reduced to its reference (plus summary, if any).
+        """
+        parts: list[str] = []
+        if isinstance(item, (TitleItem, SectionHeaderItem)):
+            # MarkdownDocSerializer requires a doc instance; pass through current doc
+            md_doc_serializer = MarkdownDocSerializer(doc=doc)
+            heading = MarkdownTextSerializer().serialize(
+                item=item, doc_serializer=md_doc_serializer, doc=doc
+            )
+            parts.append(heading.text)
+
+        parts.append(_default_outline_node(item))
+
+        summary = (
+            getattr(item.meta, MetaFieldName.SUMMARY, None) if item.meta else None
+        )
+        if isinstance(summary, SummaryMetaField):
+            parts.append(_default_summary(summary.text))
+
+        # Join only non-empty parts so missing pieces leave no stray spaces.
+        return create_ser_result(
+            text=" ".join(part for part in parts if part),
+            span_source=item,
+        )
+
+
+class _OutlineTableSerializer(BaseTableSerializer):
+    """_Outline class for table item serializers."""
+
+    def serialize(
+        self,
+        *,
+        item: TableItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize the table as its reference marker only."""
+        return create_ser_result(text=_default_outline_node(item), span_source=item)
+
+
+class _OutlinePictureSerializer(BasePictureSerializer):
+    """_Outline class for picture item serializers."""
+
+    def serialize(
+        self,
+        *,
+        item: PictureItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize the picture as its reference marker only."""
+        return create_ser_result(text=_default_outline_node(item), span_source=item)
+
+
+class _OutlineKeyValueSerializer(BaseKeyValueSerializer):
+    """_Outline class for key value item serializers."""
+
+    def serialize(
+        self,
+        *,
+        item: KeyValueItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize the key-value item as its reference marker only."""
+        return create_ser_result(text=_default_outline_node(item), span_source=item)
+
+
+class _OutlineFormSerializer(BaseFormSerializer):
+    """_Outline class for form item serializers."""
+
+    def serialize(
+        self,
+        *,
+        item: FormItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize the form item as its reference marker only."""
+        return create_ser_result(text=_default_outline_node(item), span_source=item)
+
+
+class _OutlineListSerializer(BaseListSerializer):
+    """_Outline class for list serializers."""
+
+    def serialize(
+        self,
+        *,
+        item: ListGroup,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize the list group as its reference marker only."""
+        return create_ser_result(text=_default_outline_node(item))
+
+
+class _OutlineInlineSerializer(BaseInlineSerializer):
+    """_Outline class for inline serializers."""
+
+    def serialize(
+        self,
+        *,
+        item: InlineGroup,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize inline groups as empty text."""
+        return create_ser_result(text="")
+
+
+class _OutlineFallbackSerializer(BaseFallbackSerializer):
+    """_Outline fallback class for item serializers."""
+
+    def serialize(
+        self,
+        *,
+        item: NodeItem,
+        doc_serializer: BaseDocSerializer,
+        doc: DoclingDocument,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize any other item as empty text."""
+        return create_ser_result(text="")
+
+
+class _OutlineMetaSerializer(MarkdownMetaSerializer):
+    """Meta serializer emitting only summary fields, tagged with the item ref."""
+
+    @override
+    def serialize(
+        self,
+        *,
+        item: NodeItem,
+        doc: DoclingDocument,
+        level: Optional[int] = None,
+        **kwargs: Any,
+    ) -> SerializationResult:
+        """Serialize the item's meta."""
+        params = MarkdownParams(**kwargs)
+        lines: list[str] = []
+        if item.meta:
+            indent = " " * (level or 0)
+            keys = list(item.meta.__class__.model_fields) + list(
+                item.meta.get_custom_part()
+            )
+            for key in keys:
+                txt = self._serialize_meta_field(item.meta, key, params.mark_meta)
+                if not txt:
+                    continue
+                # Resolved lazily: only items that yield a line need a label.
+                label = item.label.value  # type:ignore[attr-defined]
+                tag = f"{item.__class__.__name__}:{label}"
+                lines.append(f"{indent}[{item.self_ref}] [{tag}] {txt}")
+        return create_ser_result(
+            text="\n\n".join(lines),
+            span_source=item if isinstance(item, DocItem) else [],
+        )
+
+    def _serialize_meta_field(
+        self, meta: BaseMeta, name: str, mark_meta: bool
+    ) -> Optional[str]:
+        """Return the text of a summary field, or None for any other field."""
+        field_val = getattr(meta, name, None)
+        if not isinstance(field_val, SummaryMetaField):
+            return None
+        if mark_meta:
+            return f"[{self._humanize_text(name, title=True)}] {field_val.text}"
+        return field_val.text
+
+
+class OutlineDocSerializer(MarkdownDocSerializer):
+    """Markdown document serializer wired with the outline item serializers."""
+
+    text_serializer: BaseTextSerializer = _OutlineTextSerializer()
+    table_serializer: BaseTableSerializer = _OutlineTableSerializer()
+    picture_serializer: BasePictureSerializer = _OutlinePictureSerializer()
+    key_value_serializer: BaseKeyValueSerializer = _OutlineKeyValueSerializer()
+    form_serializer: BaseFormSerializer = _OutlineFormSerializer()
+    fallback_serializer: BaseFallbackSerializer = _OutlineFallbackSerializer()
+
+    list_serializer: BaseListSerializer = _OutlineListSerializer()
+    inline_serializer: BaseInlineSerializer = _OutlineInlineSerializer()
+
+    meta_serializer: BaseMetaSerializer = _OutlineMetaSerializer()
+
+    params: OutlineParams = OutlineParams()
diff --git a/test/test_outline_serializer.py b/test/test_outline_serializer.py
new file mode 100644
index 00000000..5d3faefb
--- /dev/null
+++ b/test/test_outline_serializer.py
@@ -0,0 +1,25 @@
+from pathlib import Path
+
+from docling_core.experimental.serializer.outline import (
+    OutlineDocSerializer,
+    OutlineParams,
+)
+from docling_core.types.doc import DoclingDocument
+
+
+def test_outline_serializer_basic():
+    """The outline of the sample document exposes its summaries."""
+    src = Path("test/data/doc/2408.09869_p1.json")
+    doc = DoclingDocument.load_from_json(filename=src)
+
+    # NOTE(review): include_non_meta=True presumably keeps non-meta content
+    # too, so this is not a metadata-only serialization - confirm.
+    params = OutlineParams(include_non_meta=True)
+    ser = OutlineDocSerializer(doc=doc, params=params)
+
+    actual = ser.serialize().text
+
+    assert isinstance(actual, str)
+    # Expect summaries from title and section header to appear
+    assert "This is a title." in actual
+    assert "This is a section header." in actual