@@ -407,6 +407,7 @@ class IDocTagsToken(str, Enum):
407407 URI = "uri"
408408 MARKER = "marker"
409409 FACETS = "facets"
410+ CONTENT = "content" # TODO: review element name
410411
411412
412413class IDocTagsAttributeKey (str , Enum ):
@@ -629,6 +630,7 @@ class IDocTagsVocabulary(BaseModel):
629630 IDocTagsToken .URI : IDocTagsCategory .BINARY_DATA ,
630631 IDocTagsToken .MARKER : IDocTagsCategory .CONTENT ,
631632 IDocTagsToken .FACETS : IDocTagsCategory .CONTENT ,
633+ IDocTagsToken .CONTENT : IDocTagsCategory .CONTENT ,
632634 }
633635
634636 @classmethod
@@ -959,6 +961,13 @@ class EscapeMode(str, Enum):
959961 CDATA_WHEN_NEEDED = "cdata_when_needed" # wrap text in CDATA only if it contains special characters
960962
961963
class WrapMode(str, Enum):
    """Policy for wrapping text in an explicit wrapper element in IDocTags output."""

    # Wrap every text value in the explicit wrapper element.
    WRAP_ALWAYS = "wrap_always"
    # Wrap a text value only when it has leading or trailing whitespace.
    WRAP_WHEN_NEEDED = "wrap_when_needed"
969+
970+
962971class ContentType (str , Enum ):
963972 """Content type for IDocTags output."""
964973
@@ -996,31 +1005,30 @@ class IDocTagsParams(CommonParams):
9961005
9971006 # IDocTags formatting
9981007 do_self_closing : bool = True
999- pretty_indentation : Optional [str ] = 2 * " "
1000-
1001- # only relevant if pretty_indentation is None or empty:
1002- mode : IDocTagsSerializationMode = IDocTagsSerializationMode .HUMAN_FRIENDLY
1003- # Expand self-closing forms of non-self-closing tokens after pretty-printing
1008+ pretty_indentation : Optional [str ] = 2 * " " # None means minimized serialization, "" means no indentation
10041009
10051010 preserve_empty_non_selfclosing : bool = True
10061011 # XML compliance: escape special characters in text content
10071012 escape_mode : EscapeMode = EscapeMode .CDATA_WHEN_NEEDED
1013+ content_wrapping_mode : WrapMode = WrapMode .WRAP_WHEN_NEEDED
10081014
10091015
10101016def _get_delim (* , params : IDocTagsParams ) -> str :
10111017 """Return record delimiter based on IDocTagsSerializationMode."""
1012- if params .mode == IDocTagsSerializationMode .HUMAN_FRIENDLY :
1013- return "\n "
1014- if params .mode == IDocTagsSerializationMode .LLM_FRIENDLY :
1015- return ""
1016- raise RuntimeError (f"Unknown IDocTags mode: { params .mode } " )
1018+ return "" if params .pretty_indentation is None else "\n "
10171019
10181020
def _escape_text(text: str, params: IDocTagsParams) -> str:
    """Escape and, where configured, wrap a text value for IDocTags output.

    Args:
        text: Raw text to embed in the serialized output.
        params: Serialization parameters; ``escape_mode`` selects the CDATA
            policy and ``content_wrapping_mode`` selects the wrapping policy.

    Returns:
        The text, enclosed in CDATA when the escape mode calls for it, and
        additionally wrapped in the content element when the wrap mode calls
        for it.
    """
    # Decide on wrapping from the raw text, before any CDATA markers are added.
    wrap_mode = params.content_wrapping_mode
    do_wrap = wrap_mode == WrapMode.WRAP_ALWAYS or (
        wrap_mode == WrapMode.WRAP_WHEN_NEEDED and text != text.strip()
    )

    escape_mode = params.escape_mode
    if escape_mode == EscapeMode.CDATA_ALWAYS or (
        escape_mode == EscapeMode.CDATA_WHEN_NEEDED and any(c in text for c in ('"', "'", "&", "<", ">"))
    ):
        # A literal "]]>" would close the CDATA section early and leave the
        # remainder as malformed markup; split it across two adjacent sections.
        cdata_body = text.replace("]]>", "]]]]><![CDATA[>")
        text = f"<![CDATA[{cdata_body}]]>"

    if do_wrap:
        text = _wrap(text=text, wrap_tag=IDocTagsToken.CONTENT.value)
    return text
10251033
10261034
@@ -1509,7 +1517,7 @@ def _serialize_single_item(
15091517 ser_res = doc_serializer .serialize (item = first_child , visited = my_visited , ** kwargs )
15101518 text_part = ser_res .text
15111519 else :
1512- text_part = _escape_text (item .text , params . escape_mode )
1520+ text_part = _escape_text (item .text , params )
15131521 text_part = doc_serializer .post_process (
15141522 text = text_part ,
15151523 formatting = item .formatting ,
@@ -1522,13 +1530,13 @@ def _serialize_single_item(
15221530 if params .add_referenced_caption and isinstance (item , FloatingItem ):
15231531 cap_text = doc_serializer .serialize_captions (item = item , ** kwargs ).text
15241532 if cap_text :
1525- cap_text = _escape_text (cap_text , params . escape_mode )
1533+ cap_text = _escape_text (cap_text , params )
15261534 parts .append (cap_text )
15271535
15281536 if params .add_referenced_footnote and isinstance (item , FloatingItem ):
15291537 ftn_text = doc_serializer .serialize_footnotes (item = item , ** kwargs ).text
15301538 if ftn_text :
1531- ftn_text = _escape_text (ftn_text , params . escape_mode )
1539+ ftn_text = _escape_text (ftn_text , params )
15321540 parts .append (ftn_text )
15331541
15341542 text_res = "" .join (parts )
@@ -1575,25 +1583,25 @@ def serialize(
15751583 def _serialize_meta_field (self , meta : BaseMeta , name : str , params : IDocTagsParams ) -> Optional [str ]:
15761584 if (field_val := getattr (meta , name )) is not None :
15771585 if name == MetaFieldName .SUMMARY and isinstance (field_val , SummaryMetaField ):
1578- escaped_text = _escape_text (field_val .text , params . escape_mode )
1586+ escaped_text = _escape_text (field_val .text , params )
15791587 txt = f"<summary>{ escaped_text } </summary>"
15801588 elif name == MetaFieldName .DESCRIPTION and isinstance (field_val , DescriptionMetaField ):
1581- escaped_text = _escape_text (field_val .text , params . escape_mode )
1589+ escaped_text = _escape_text (field_val .text , params )
15821590 txt = f"<description>{ escaped_text } </description>"
15831591 elif name == MetaFieldName .CLASSIFICATION and isinstance (field_val , PictureClassificationMetaField ):
15841592 class_name = self ._humanize_text (field_val .get_main_prediction ().class_name )
1585- escaped_class_name = _escape_text (class_name , params . escape_mode )
1593+ escaped_class_name = _escape_text (class_name , params )
15861594 txt = f"<classification>{ escaped_class_name } </classification>"
15871595 elif name == MetaFieldName .MOLECULE and isinstance (field_val , MoleculeMetaField ):
1588- escaped_smi = _escape_text (field_val .smi , params . escape_mode )
1596+ escaped_smi = _escape_text (field_val .smi , params )
15891597 txt = f"<molecule>{ escaped_smi } </molecule>"
15901598 elif name == MetaFieldName .TABULAR_CHART and isinstance (field_val , TabularChartMetaField ):
15911599 # suppressing tabular chart serialization
15921600 return None
15931601 # elif tmp := str(field_val or ""):
15941602 # txt = tmp
15951603 elif name not in {v .value for v in MetaFieldName }:
1596- escaped_text = _escape_text (str (field_val or "" ), params . escape_mode )
1604+ escaped_text = _escape_text (str (field_val or "" ), params )
15971605 txt = _wrap (text = escaped_text , wrap_tag = name )
15981606 return txt
15991607 return None
@@ -1785,7 +1793,7 @@ def _emit_otsl(
17851793 parts .append (cell_loc )
17861794 if ContentType .TABLE_CELL in params .content_types :
17871795 # Apply XML escaping to table cell content
1788- escaped_content = _escape_text (content , params . escape_mode )
1796+ escaped_content = _escape_text (content , params )
17891797 parts .append (escaped_content )
17901798 else :
17911799 parts .append (IDocTagsVocabulary .create_selfclosing_token (token = IDocTagsToken .ECEL ))
@@ -2034,6 +2042,7 @@ def serialize_captions(
20342042 loc_txt = _create_location_tokens_for_item (item = cap , doc = self .doc )
20352043 results .append (create_ser_result (text = loc_txt ))
20362044 if cap_res .text and ContentType .REF_CAPTION in params .content_types :
2045+ cap_res .text = _escape_text (cap_res .text , params )
20372046 results .append (cap_res )
20382047 text_res = "" .join ([r .text for r in results ])
20392048 if text_res :
@@ -2058,7 +2067,7 @@ def serialize_footnotes(
20582067
20592068 content = ""
20602069 if ftn .text and ContentType .REF_FOOTNOTE in params .content_types :
2061- content = ftn .text
2070+ content = _escape_text ( ftn .text , params )
20622071
20632072 text_res = f"{ location } { content } "
20642073 if text_res :
@@ -2094,7 +2103,7 @@ def serialize_doc(
20942103
20952104 text_res = f"{ open_token } { text_res } { close_token } "
20962105
2097- if self .params .pretty_indentation :
2106+ if self .params .pretty_indentation is not None :
20982107 try :
20992108 my_root = parseString (text_res ).documentElement
21002109 except Exception as e :
@@ -2225,6 +2234,7 @@ def _dispatch_element(self, *, doc: DoclingDocument, el: Element, parent: Option
22252234 IDocTagsToken .STRIKETHROUGH .value ,
22262235 IDocTagsToken .SUBSCRIPT .value ,
22272236 IDocTagsToken .SUPERSCRIPT .value ,
2237+ IDocTagsToken .CONTENT .value ,
22282238 }:
22292239 self ._parse_text_like (doc = doc , el = el , parent = parent )
22302240 elif name == IDocTagsToken .PAGE_BREAK .value :
@@ -2256,9 +2266,12 @@ def _walk_children(self, *, doc: DoclingDocument, el: Element, parent: Optional[
22562266
22572267 # ------------- Text blocks -------------
22582268
2259- def _get_simple_text_block (self , elements : list ) -> Optional [str ]:
def _should_preserve_space(self, el: Element) -> bool:
    """Report whether ``el`` is a content element.

    Only the tag name is compared; the element's ``xml:space`` attribute is
    not consulted.
    """
    content_tag = IDocTagsToken.CONTENT.value
    return el.tagName == content_tag
2271+
2272+ def _get_children_simple_text_block (self , element : Element ) -> Optional [str ]:
22602273 result = None
2261- for el in elements :
2274+ for el in element . childNodes :
22622275 if isinstance (el , Element ):
22632276 if el .tagName not in {
22642277 IDocTagsToken .LOCATION .value ,
@@ -2269,27 +2282,30 @@ def _get_simple_text_block(self, elements: list) -> Optional[str]:
22692282 IDocTagsToken .STRIKETHROUGH .value ,
22702283 IDocTagsToken .SUBSCRIPT .value ,
22712284 IDocTagsToken .SUPERSCRIPT .value ,
2285+ IDocTagsToken .CONTENT .value ,
22722286 }:
22732287 return None
2274- elif tmp := self ._get_simple_text_block (el . childNodes ):
2288+ elif tmp := self ._get_children_simple_text_block (el ):
22752289 result = tmp
2276- elif isinstance (el , Text ) and el .data .strip ():
2290+ elif isinstance (el , Text ) and el .data .strip (): # TODO should still support whitespace-only
22772291 if result is None :
2278- result = el .data .strip ()
2292+ result = el .data if element . tagName == IDocTagsToken . CONTENT . value else el . data .strip ()
22792293 else :
22802294 return None
22812295 return result
22822296
22832297 def _parse_text_like (self , * , doc : DoclingDocument , el : Element , parent : Optional [NodeItem ]) -> None :
22842298 """Parse text-like tokens (title, text, caption, footnotes, code, formula)."""
2285- if self ._get_simple_text_block (el .childNodes ) is None :
2286- # This text-like element wraps a single inline group; create it directly
2299+ element_children = [
2300+ node for node in el .childNodes if isinstance (node , Element ) and node .tagName != IDocTagsToken .LOCATION .value
2301+ ]
2302+
2303+ if len (element_children ) > 1 or self ._get_children_simple_text_block (el ) is None :
22872304 self ._parse_inline_group (doc = doc , el = el , parent = parent )
22882305 return
22892306
22902307 prov_list = self ._extract_provenance (doc = doc , el = el )
22912308 text , formatting = self ._extract_text_with_formatting (el )
2292- text = text .strip ()
22932309 if not text :
22942310 return
22952311
@@ -2324,6 +2340,7 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona
23242340 IDocTagsToken .STRIKETHROUGH .value : DocItemLabel .TEXT ,
23252341 IDocTagsToken .SUBSCRIPT .value : DocItemLabel .TEXT ,
23262342 IDocTagsToken .SUPERSCRIPT .value : DocItemLabel .TEXT ,
2343+ IDocTagsToken .CONTENT .value : DocItemLabel .TEXT ,
23272344 }
23282345 ):
23292346 is_bold = nm == IDocTagsToken .BOLD .value
@@ -2829,7 +2846,7 @@ def _get_text(self, el: Element) -> str:
28292846 if isinstance (node , Text ):
28302847 # Skip pure indentation/pretty-print whitespace
28312848 if node .data .strip ():
2832- out .append (node .data )
2849+ out .append (node .data if el . tagName == IDocTagsToken . CONTENT . value else node . data . strip () )
28332850 elif isinstance (node , Element ):
28342851 nm = node .tagName
28352852 if nm in {IDocTagsToken .LOCATION .value }:
0 commit comments