docling-project
diff --git a/docling_core/experimental/idoctags.py b/docling_core/experimental/idoctags.py
Lines changed: 51 additions & 34 deletions
diff --git a/examples/convert_to_idoctags.py b/examples/convert_to_idoctags.py
Lines changed: 2 additions & 3 deletions
diff --git a/test/data/doc/cdata_always.gt.idt.xml b/test/data/doc/cdata_always.gt.idt.xml
Lines changed: 9 additions & 5 deletions
diff --git a/test/data/doc/cdata_when_needed.gt.idt.xml b/test/data/doc/cdata_when_needed.gt.idt.xml
Lines changed: 6 additions & 2 deletions
diff --git a/test/data/doc/roundtrip_list_item_with_inline_group_init.yaml b/test/data/doc/roundtrip_list_item_with_inline_deserialized.yaml
rename from test/data/doc/roundtrip_list_item_with_inline_group_init.yaml
rename to test/data/doc/roundtrip_list_item_with_inline_deserialized.yaml
Lines changed: 9 additions & 4 deletions
@@ -407,6 +407,7 @@ class IDocTagsToken(str, Enum):
     URI = "uri"
     MARKER = "marker"
     FACETS = "facets"
+    CONTENT = "content"  # TODO: review element name
 
 
 class IDocTagsAttributeKey(str, Enum):
@@ -629,6 +630,7 @@ class IDocTagsVocabulary(BaseModel):
         IDocTagsToken.URI: IDocTagsCategory.BINARY_DATA,
         IDocTagsToken.MARKER: IDocTagsCategory.CONTENT,
         IDocTagsToken.FACETS: IDocTagsCategory.CONTENT,
+        IDocTagsToken.CONTENT: IDocTagsCategory.CONTENT,
     }
 
     @classmethod
@@ -959,6 +961,13 @@ class EscapeMode(str, Enum):
     CDATA_WHEN_NEEDED = "cdata_when_needed"  # wrap text in CDATA only if it contains special characters
 
 
+class WrapMode(str, Enum):
+    """When to wrap serialized text in an explicit wrapper element in IDocTags output."""
+
+    WRAP_ALWAYS = "wrap_always"  # wrap all text in explicit wrapper element
+    WRAP_WHEN_NEEDED = "wrap_when_needed"  # wrap text only if it has leading or trailing whitespace
+
+
 class ContentType(str, Enum):
     """Content type for IDocTags output."""
 
@@ -996,31 +1005,30 @@ class IDocTagsParams(CommonParams):
 
     # IDocTags formatting
     do_self_closing: bool = True
-    pretty_indentation: Optional[str] = 2 * " "
-
-    # only relevant if pretty_indentation is None or empty:
-    mode: IDocTagsSerializationMode = IDocTagsSerializationMode.HUMAN_FRIENDLY
-    # Expand self-closing forms of non-self-closing tokens after pretty-printing
+    pretty_indentation: Optional[str] = 2 * " "  # None means minimized serialization, "" means no indentation
 
     preserve_empty_non_selfclosing: bool = True
     # XML compliance: escape special characters in text content
     escape_mode: EscapeMode = EscapeMode.CDATA_WHEN_NEEDED
+    content_wrapping_mode: WrapMode = WrapMode.WRAP_WHEN_NEEDED
 
 
 def _get_delim(*, params: IDocTagsParams) -> str:
-    """Return record delimiter based on IDocTagsSerializationMode."""
-    if params.mode == IDocTagsSerializationMode.HUMAN_FRIENDLY:
-        return "\n"
-    if params.mode == IDocTagsSerializationMode.LLM_FRIENDLY:
-        return ""
-    raise RuntimeError(f"Unknown IDocTags mode: {params.mode}")
+    """Return the record delimiter: empty if params.pretty_indentation is None (minimized output), else a newline."""
+    return "" if params.pretty_indentation is None else "\n"
 
 
-def _escape_text(text: str, escape_mode: EscapeMode) -> str:
-    if escape_mode == EscapeMode.CDATA_ALWAYS or (
-        escape_mode == EscapeMode.CDATA_WHEN_NEEDED and any(c in text for c in ['"', "'", "&", "<", ">"])
+def _escape_text(text: str, params: IDocTagsParams) -> str:
+    """CDATA-escape text per params.escape_mode, then wrap it in <content> per params.content_wrapping_mode."""
+    wrap_mode = params.content_wrapping_mode  # wrapping is decided on the raw text, before CDATA markup is added
+    do_wrap = wrap_mode == WrapMode.WRAP_ALWAYS or (wrap_mode == WrapMode.WRAP_WHEN_NEEDED and text != text.strip())
+    if params.escape_mode == EscapeMode.CDATA_ALWAYS or (
+        params.escape_mode == EscapeMode.CDATA_WHEN_NEEDED and any(c in text for c in ['"', "'", "&", "<", ">"])
     ):
-        return f"<![CDATA[{text}]]>"
+        # A literal "]]>" would end the CDATA section early, so split it across two adjacent sections.
+        text = "<![CDATA[" + text.replace("]]>", "]]]]><![CDATA[>") + "]]>"
+    if do_wrap:
+        text = _wrap(text=text, wrap_tag=IDocTagsToken.CONTENT.value)
     return text
 
 
@@ -1509,7 +1517,7 @@ def _serialize_single_item(
                 ser_res = doc_serializer.serialize(item=first_child, visited=my_visited, **kwargs)
                 text_part = ser_res.text
             else:
-                text_part = _escape_text(item.text, params.escape_mode)
+                text_part = _escape_text(item.text, params)
                 text_part = doc_serializer.post_process(
                     text=text_part,
                     formatting=item.formatting,
@@ -1522,13 +1530,13 @@ def _serialize_single_item(
         if params.add_referenced_caption and isinstance(item, FloatingItem):
             cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
             if cap_text:
-                cap_text = _escape_text(cap_text, params.escape_mode)
+                cap_text = _escape_text(cap_text, params)
                 parts.append(cap_text)
 
         if params.add_referenced_footnote and isinstance(item, FloatingItem):
             ftn_text = doc_serializer.serialize_footnotes(item=item, **kwargs).text
             if ftn_text:
-                ftn_text = _escape_text(ftn_text, params.escape_mode)
+                ftn_text = _escape_text(ftn_text, params)
                 parts.append(ftn_text)
 
         text_res = "".join(parts)
@@ -1575,25 +1583,25 @@ def serialize(
     def _serialize_meta_field(self, meta: BaseMeta, name: str, params: IDocTagsParams) -> Optional[str]:
         if (field_val := getattr(meta, name)) is not None:
             if name == MetaFieldName.SUMMARY and isinstance(field_val, SummaryMetaField):
-                escaped_text = _escape_text(field_val.text, params.escape_mode)
+                escaped_text = _escape_text(field_val.text, params)
                 txt = f"<summary>{escaped_text}</summary>"
             elif name == MetaFieldName.DESCRIPTION and isinstance(field_val, DescriptionMetaField):
-                escaped_text = _escape_text(field_val.text, params.escape_mode)
+                escaped_text = _escape_text(field_val.text, params)
                 txt = f"<description>{escaped_text}</description>"
             elif name == MetaFieldName.CLASSIFICATION and isinstance(field_val, PictureClassificationMetaField):
                 class_name = self._humanize_text(field_val.get_main_prediction().class_name)
-                escaped_class_name = _escape_text(class_name, params.escape_mode)
+                escaped_class_name = _escape_text(class_name, params)
                 txt = f"<classification>{escaped_class_name}</classification>"
             elif name == MetaFieldName.MOLECULE and isinstance(field_val, MoleculeMetaField):
-                escaped_smi = _escape_text(field_val.smi, params.escape_mode)
+                escaped_smi = _escape_text(field_val.smi, params)
                 txt = f"<molecule>{escaped_smi}</molecule>"
             elif name == MetaFieldName.TABULAR_CHART and isinstance(field_val, TabularChartMetaField):
                 # suppressing tabular chart serialization
                 return None
-            # elif tmp := str(field_val or ""):
-            #     txt = tmp
             elif name not in {v.value for v in MetaFieldName}:
-                escaped_text = _escape_text(str(field_val or ""), params.escape_mode)
+                escaped_text = _escape_text(str(field_val or ""), params)
                 txt = _wrap(text=escaped_text, wrap_tag=name)
+            else:  # recognized field name but unexpected value type: nothing to emit (txt would be unbound)
+                return None
             return txt
         return None
@@ -1785,7 +1793,7 @@ def _emit_otsl(
                             parts.append(cell_loc)
                         if ContentType.TABLE_CELL in params.content_types:
                             # Apply XML escaping to table cell content
-                            escaped_content = _escape_text(content, params.escape_mode)
+                            escaped_content = _escape_text(content, params)
                             parts.append(escaped_content)
                     else:
                         parts.append(IDocTagsVocabulary.create_selfclosing_token(token=IDocTagsToken.ECEL))
@@ -2034,6 +2042,7 @@ def serialize_captions(
                             loc_txt = _create_location_tokens_for_item(item=cap, doc=self.doc)
                             results.append(create_ser_result(text=loc_txt))
             if cap_res.text and ContentType.REF_CAPTION in params.content_types:
+                cap_res.text = _escape_text(cap_res.text, params)
                 results.append(cap_res)
         text_res = "".join([r.text for r in results])
         if text_res:
@@ -2058,7 +2067,7 @@ def serialize_footnotes(
 
                     content = ""
                     if ftn.text and ContentType.REF_FOOTNOTE in params.content_types:
-                        content = ftn.text
+                        content = _escape_text(ftn.text, params)
 
                     text_res = f"{location}{content}"
                     if text_res:
@@ -2094,7 +2103,7 @@ def serialize_doc(
 
         text_res = f"{open_token}{text_res}{close_token}"
 
-        if self.params.pretty_indentation:
+        if self.params.pretty_indentation is not None:
             try:
                 my_root = parseString(text_res).documentElement
             except Exception as e:
@@ -2225,6 +2234,7 @@ def _dispatch_element(self, *, doc: DoclingDocument, el: Element, parent: Option
             IDocTagsToken.STRIKETHROUGH.value,
             IDocTagsToken.SUBSCRIPT.value,
             IDocTagsToken.SUPERSCRIPT.value,
+            IDocTagsToken.CONTENT.value,
         }:
             self._parse_text_like(doc=doc, el=el, parent=parent)
         elif name == IDocTagsToken.PAGE_BREAK.value:
@@ -2256,9 +2266,12 @@ def _walk_children(self, *, doc: DoclingDocument, el: Element, parent: Optional[
 
     # ------------- Text blocks -------------
 
-    def _get_simple_text_block(self, elements: list) -> Optional[str]:
+    def _should_preserve_space(self, el: Element) -> bool:
+        return el.tagName == IDocTagsToken.CONTENT.value  # and el.getAttribute("xml:space") == "preserve"
+
+    def _get_children_simple_text_block(self, element: Element) -> Optional[str]:
         result = None
-        for el in elements:
+        for el in element.childNodes:
             if isinstance(el, Element):
                 if el.tagName not in {
                     IDocTagsToken.LOCATION.value,
@@ -2269,27 +2282,30 @@ def _get_simple_text_block(self, elements: list) -> Optional[str]:
                     IDocTagsToken.STRIKETHROUGH.value,
                     IDocTagsToken.SUBSCRIPT.value,
                     IDocTagsToken.SUPERSCRIPT.value,
+                    IDocTagsToken.CONTENT.value,
                 }:
                     return None
-                elif tmp := self._get_simple_text_block(el.childNodes):
+                elif tmp := self._get_children_simple_text_block(el):
                     result = tmp
-            elif isinstance(el, Text) and el.data.strip():
+            elif isinstance(el, Text) and el.data.strip():  # TODO should still support whitespace-only
                 if result is None:
-                    result = el.data.strip()
+                    result = el.data if element.tagName == IDocTagsToken.CONTENT.value else el.data.strip()
                 else:
                     return None
         return result
 
     def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optional[NodeItem]) -> None:
         """Parse text-like tokens (title, text, caption, footnotes, code, formula)."""
-        if self._get_simple_text_block(el.childNodes) is None:
-            # This text-like element wraps a single inline group; create it directly
+        element_children = [
+            node for node in el.childNodes if isinstance(node, Element) and node.tagName != IDocTagsToken.LOCATION.value
+        ]
+
+        if len(element_children) > 1 or self._get_children_simple_text_block(el) is None:
             self._parse_inline_group(doc=doc, el=el, parent=parent)
             return
 
         prov_list = self._extract_provenance(doc=doc, el=el)
         text, formatting = self._extract_text_with_formatting(el)
-        text = text.strip()
         if not text:
             return
 
@@ -2324,6 +2340,7 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona
                 IDocTagsToken.STRIKETHROUGH.value: DocItemLabel.TEXT,
                 IDocTagsToken.SUBSCRIPT.value: DocItemLabel.TEXT,
                 IDocTagsToken.SUPERSCRIPT.value: DocItemLabel.TEXT,
+                IDocTagsToken.CONTENT.value: DocItemLabel.TEXT,
             }
         ):
             is_bold = nm == IDocTagsToken.BOLD.value
@@ -2829,7 +2846,7 @@ def _get_text(self, el: Element) -> str:
             if isinstance(node, Text):
                 # Skip pure indentation/pretty-print whitespace
                 if node.data.strip():
-                    out.append(node.data)
+                    out.append(node.data if el.tagName == IDocTagsToken.CONTENT.value else node.data.strip())
             elif isinstance(node, Element):
                 nm = node.tagName
                 if nm in {IDocTagsToken.LOCATION.value}:
 
@@ -354,15 +354,14 @@ def _count_yes(key: str) -> int:
             # png_path = pngs_dir / f"{idx}_{i}.png"
             # __.save(png_path)
 
-        for mode in [IDocTagsSerializationMode.HUMAN_FRIENDLY, IDocTagsSerializationMode.LLM_FRIENDLY]:
+        for indent in ["  ", None]:
             for esc_mode in [True, False]:
                 for content in [True, False]:
                     try:
                         params_probe = IDocTagsParams()
                         params_probe.content_types = set(ContentType) if content else set()
-                        params_probe.mode = mode
                         params_probe.escape_mode = esc_mode
-                        params_probe.pretty_indentation = "  " if mode==IDocTagsSerializationMode.HUMAN_FRIENDLY else None
+                        params_probe.pretty_indentation = indent
 
                         iser_probe = IDocTagsDocSerializer(doc=doc, params=params_probe)
                         _ = iser_probe.serialize().text
 
@@ -24,7 +24,7 @@ Affiliation 2]]></text>
     <list_text><![CDATA[list item 4]]></list_text>
   </list>
   <floating_group class="table">
-    <caption>This is the caption of table 1.</caption>
+    <caption><![CDATA[This is the caption of table 1.]]></caption>
     <otsl>
       <fcel/>
 <![CDATA[Product]]>      <fcel/>
@@ -41,10 +41,10 @@ Affiliation 2]]></text>
     </otsl>
   </floating_group>
   <floating_group class="picture">
-    <caption>This is the caption of figure 1.</caption>
+    <caption><![CDATA[This is the caption of figure 1.]]></caption>
   </floating_group>
   <floating_group class="picture">
-    <caption>This is the caption of figure 2.</caption>
+    <caption><![CDATA[This is the caption of figure 2.]]></caption>
   </floating_group>
   <list ordered="false">
     <list_text><![CDATA[item 1 of list]]></list_text>
@@ -112,7 +112,9 @@ Affiliation 2]]></text>
   </list>
   <text><![CDATA[The end.]]></text>
   <text><![CDATA[Simple text]]></text>
-  <text><![CDATA[    4 leading spaces, 1 trailing ]]></text>
+  <text>
+    <content><![CDATA[    4 leading spaces, 1 trailing ]]></content>
+  </text>
   <text><![CDATA[Some 'single' quotes]]></text>
   <text><![CDATA[Some "double" quotes]]></text>
   <text>
@@ -122,7 +124,9 @@ Affiliation 2]]></text>
     </meta>
 <![CDATA[An ampersand: &]]>  </text>
   <code><![CDATA[0 == 0]]></code>
-  <code><![CDATA[ 1 leading space, 4 trailing    ]]></code>
+  <code>
+    <content><![CDATA[ 1 leading space, 4 trailing    ]]></content>
+  </code>
   <code><![CDATA[0 < 1]]></code>
   <code class="Python"><![CDATA[42 == 42]]></code>
   <code class="Python"><![CDATA[42 < 1337]]></code>
 
@@ -119,7 +119,9 @@ hyperlink
   </list>
   <text>The end.</text>
   <text>Simple text</text>
-  <text>    4 leading spaces, 1 trailing </text>
+  <text>
+    <content>    4 leading spaces, 1 trailing </content>
+  </text>
   <text><![CDATA[Some 'single' quotes]]></text>
   <text><![CDATA[Some "double" quotes]]></text>
   <text>
@@ -129,7 +131,9 @@ hyperlink
     </meta>
 <![CDATA[An ampersand: &]]>  </text>
   <code>0 == 0</code>
-  <code> 1 leading space, 4 trailing    </code>
+  <code>
+    <content> 1 leading space, 4 trailing    </content>
+  </code>
   <code><![CDATA[0 < 1]]></code>
   <code class="Python">42 == 42</code>
   <code class="Python"><![CDATA[42 < 1337]]></code>
 
@@ -88,8 +88,13 @@ groups:
     $ref: '#/texts/17'
   self_ref: '#/groups/6'
 key_value_items: []
-name: t
-pages: {}
+name: Document
+pages:
+  '0':
+    page_no: 0
+    size:
+      height: 512.0
+      width: 512.0
 pictures: []
 schema_name: DoclingDocument
 tables: []
@@ -180,12 +185,12 @@ texts:
 - children: []
   content_layer: body
   label: formula
-  orig: E=mc^2
+  orig: 'E=mc^2 '
   parent:
     $ref: '#/groups/2'
   prov: []
   self_ref: '#/texts/8'
-  text: E=mc^2
+  text: 'E=mc^2 '
 - children: []
   content_layer: body
   label: text