Skip to content

Commit fdcdfd1

Browse files
authored
feat(IDocTags): add content wrapping for handling whitespace (#489)
* feat(IDocTags): add content wrapping for handling whitespace Also: - fix caption serialization - consolidate indentation via `pretty_indentation` param Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> * fix content tag value Signed-off-by: Panos Vagenas <pva@zurich.ibm.com> --------- Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
1 parent e5c0015 commit fdcdfd1

12 files changed

+482
-192
lines changed

docling_core/experimental/idoctags.py

Lines changed: 51 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,7 @@ class IDocTagsToken(str, Enum):
407407
URI = "uri"
408408
MARKER = "marker"
409409
FACETS = "facets"
410+
CONTENT = "content" # TODO: review element name
410411

411412

412413
class IDocTagsAttributeKey(str, Enum):
@@ -629,6 +630,7 @@ class IDocTagsVocabulary(BaseModel):
629630
IDocTagsToken.URI: IDocTagsCategory.BINARY_DATA,
630631
IDocTagsToken.MARKER: IDocTagsCategory.CONTENT,
631632
IDocTagsToken.FACETS: IDocTagsCategory.CONTENT,
633+
IDocTagsToken.CONTENT: IDocTagsCategory.CONTENT,
632634
}
633635

634636
@classmethod
@@ -959,6 +961,13 @@ class EscapeMode(str, Enum):
959961
CDATA_WHEN_NEEDED = "cdata_when_needed" # wrap text in CDATA only if it contains special characters
960962

961963

964+
class WrapMode(str, Enum):
965+
"""Wrap mode for IDocTags output."""
966+
967+
WRAP_ALWAYS = "wrap_always" # wrap all text in explicit wrapper element
968+
WRAP_WHEN_NEEDED = "wrap_when_needed" # wrap text only if it has leading or trailing whitespace
969+
970+
962971
class ContentType(str, Enum):
963972
"""Content type for IDocTags output."""
964973

@@ -996,31 +1005,30 @@ class IDocTagsParams(CommonParams):
9961005

9971006
# IDocTags formatting
9981007
do_self_closing: bool = True
999-
pretty_indentation: Optional[str] = 2 * " "
1000-
1001-
# only relevant if pretty_indentation is None or empty:
1002-
mode: IDocTagsSerializationMode = IDocTagsSerializationMode.HUMAN_FRIENDLY
1003-
# Expand self-closing forms of non-self-closing tokens after pretty-printing
1008+
pretty_indentation: Optional[str] = 2 * " " # None means minimized serialization, "" means no indentation
10041009

10051010
preserve_empty_non_selfclosing: bool = True
10061011
# XML compliance: escape special characters in text content
10071012
escape_mode: EscapeMode = EscapeMode.CDATA_WHEN_NEEDED
1013+
content_wrapping_mode: WrapMode = WrapMode.WRAP_WHEN_NEEDED
10081014

10091015

10101016
def _get_delim(*, params: IDocTagsParams) -> str:
10111017
"""Return record delimiter based on IDocTagsSerializationMode."""
1012-
if params.mode == IDocTagsSerializationMode.HUMAN_FRIENDLY:
1013-
return "\n"
1014-
if params.mode == IDocTagsSerializationMode.LLM_FRIENDLY:
1015-
return ""
1016-
raise RuntimeError(f"Unknown IDocTags mode: {params.mode}")
1018+
return "" if params.pretty_indentation is None else "\n"
10171019

10181020

1019-
def _escape_text(text: str, escape_mode: EscapeMode) -> str:
1020-
if escape_mode == EscapeMode.CDATA_ALWAYS or (
1021-
escape_mode == EscapeMode.CDATA_WHEN_NEEDED and any(c in text for c in ['"', "'", "&", "<", ">"])
1021+
def _escape_text(text: str, params: IDocTagsParams) -> str:
1022+
do_wrap = params.content_wrapping_mode == WrapMode.WRAP_ALWAYS or (
1023+
params.content_wrapping_mode == WrapMode.WRAP_WHEN_NEEDED and text != text.strip()
1024+
)
1025+
if params.escape_mode == EscapeMode.CDATA_ALWAYS or (
1026+
params.escape_mode == EscapeMode.CDATA_WHEN_NEEDED and any(c in text for c in ['"', "'", "&", "<", ">"])
10221027
):
1023-
return f"<![CDATA[{text}]]>"
1028+
text = f"<![CDATA[{text}]]>"
1029+
if do_wrap:
1030+
# text = f'<{el_str} xml:space="preserve">{text}</{el_str}>'
1031+
text = _wrap(text=text, wrap_tag=IDocTagsToken.CONTENT.value)
10241032
return text
10251033

10261034

@@ -1509,7 +1517,7 @@ def _serialize_single_item(
15091517
ser_res = doc_serializer.serialize(item=first_child, visited=my_visited, **kwargs)
15101518
text_part = ser_res.text
15111519
else:
1512-
text_part = _escape_text(item.text, params.escape_mode)
1520+
text_part = _escape_text(item.text, params)
15131521
text_part = doc_serializer.post_process(
15141522
text=text_part,
15151523
formatting=item.formatting,
@@ -1522,13 +1530,13 @@ def _serialize_single_item(
15221530
if params.add_referenced_caption and isinstance(item, FloatingItem):
15231531
cap_text = doc_serializer.serialize_captions(item=item, **kwargs).text
15241532
if cap_text:
1525-
cap_text = _escape_text(cap_text, params.escape_mode)
1533+
cap_text = _escape_text(cap_text, params)
15261534
parts.append(cap_text)
15271535

15281536
if params.add_referenced_footnote and isinstance(item, FloatingItem):
15291537
ftn_text = doc_serializer.serialize_footnotes(item=item, **kwargs).text
15301538
if ftn_text:
1531-
ftn_text = _escape_text(ftn_text, params.escape_mode)
1539+
ftn_text = _escape_text(ftn_text, params)
15321540
parts.append(ftn_text)
15331541

15341542
text_res = "".join(parts)
@@ -1575,25 +1583,25 @@ def serialize(
15751583
def _serialize_meta_field(self, meta: BaseMeta, name: str, params: IDocTagsParams) -> Optional[str]:
15761584
if (field_val := getattr(meta, name)) is not None:
15771585
if name == MetaFieldName.SUMMARY and isinstance(field_val, SummaryMetaField):
1578-
escaped_text = _escape_text(field_val.text, params.escape_mode)
1586+
escaped_text = _escape_text(field_val.text, params)
15791587
txt = f"<summary>{escaped_text}</summary>"
15801588
elif name == MetaFieldName.DESCRIPTION and isinstance(field_val, DescriptionMetaField):
1581-
escaped_text = _escape_text(field_val.text, params.escape_mode)
1589+
escaped_text = _escape_text(field_val.text, params)
15821590
txt = f"<description>{escaped_text}</description>"
15831591
elif name == MetaFieldName.CLASSIFICATION and isinstance(field_val, PictureClassificationMetaField):
15841592
class_name = self._humanize_text(field_val.get_main_prediction().class_name)
1585-
escaped_class_name = _escape_text(class_name, params.escape_mode)
1593+
escaped_class_name = _escape_text(class_name, params)
15861594
txt = f"<classification>{escaped_class_name}</classification>"
15871595
elif name == MetaFieldName.MOLECULE and isinstance(field_val, MoleculeMetaField):
1588-
escaped_smi = _escape_text(field_val.smi, params.escape_mode)
1596+
escaped_smi = _escape_text(field_val.smi, params)
15891597
txt = f"<molecule>{escaped_smi}</molecule>"
15901598
elif name == MetaFieldName.TABULAR_CHART and isinstance(field_val, TabularChartMetaField):
15911599
# suppressing tabular chart serialization
15921600
return None
15931601
# elif tmp := str(field_val or ""):
15941602
# txt = tmp
15951603
elif name not in {v.value for v in MetaFieldName}:
1596-
escaped_text = _escape_text(str(field_val or ""), params.escape_mode)
1604+
escaped_text = _escape_text(str(field_val or ""), params)
15971605
txt = _wrap(text=escaped_text, wrap_tag=name)
15981606
return txt
15991607
return None
@@ -1785,7 +1793,7 @@ def _emit_otsl(
17851793
parts.append(cell_loc)
17861794
if ContentType.TABLE_CELL in params.content_types:
17871795
# Apply XML escaping to table cell content
1788-
escaped_content = _escape_text(content, params.escape_mode)
1796+
escaped_content = _escape_text(content, params)
17891797
parts.append(escaped_content)
17901798
else:
17911799
parts.append(IDocTagsVocabulary.create_selfclosing_token(token=IDocTagsToken.ECEL))
@@ -2034,6 +2042,7 @@ def serialize_captions(
20342042
loc_txt = _create_location_tokens_for_item(item=cap, doc=self.doc)
20352043
results.append(create_ser_result(text=loc_txt))
20362044
if cap_res.text and ContentType.REF_CAPTION in params.content_types:
2045+
cap_res.text = _escape_text(cap_res.text, params)
20372046
results.append(cap_res)
20382047
text_res = "".join([r.text for r in results])
20392048
if text_res:
@@ -2058,7 +2067,7 @@ def serialize_footnotes(
20582067

20592068
content = ""
20602069
if ftn.text and ContentType.REF_FOOTNOTE in params.content_types:
2061-
content = ftn.text
2070+
content = _escape_text(ftn.text, params)
20622071

20632072
text_res = f"{location}{content}"
20642073
if text_res:
@@ -2094,7 +2103,7 @@ def serialize_doc(
20942103

20952104
text_res = f"{open_token}{text_res}{close_token}"
20962105

2097-
if self.params.pretty_indentation:
2106+
if self.params.pretty_indentation is not None:
20982107
try:
20992108
my_root = parseString(text_res).documentElement
21002109
except Exception as e:
@@ -2225,6 +2234,7 @@ def _dispatch_element(self, *, doc: DoclingDocument, el: Element, parent: Option
22252234
IDocTagsToken.STRIKETHROUGH.value,
22262235
IDocTagsToken.SUBSCRIPT.value,
22272236
IDocTagsToken.SUPERSCRIPT.value,
2237+
IDocTagsToken.CONTENT.value,
22282238
}:
22292239
self._parse_text_like(doc=doc, el=el, parent=parent)
22302240
elif name == IDocTagsToken.PAGE_BREAK.value:
@@ -2256,9 +2266,12 @@ def _walk_children(self, *, doc: DoclingDocument, el: Element, parent: Optional[
22562266

22572267
# ------------- Text blocks -------------
22582268

2259-
def _get_simple_text_block(self, elements: list) -> Optional[str]:
2269+
def _should_preserve_space(self, el: Element) -> bool:
2270+
return el.tagName == IDocTagsToken.CONTENT.value # and el.getAttribute("xml:space") == "preserve"
2271+
2272+
def _get_children_simple_text_block(self, element: Element) -> Optional[str]:
22602273
result = None
2261-
for el in elements:
2274+
for el in element.childNodes:
22622275
if isinstance(el, Element):
22632276
if el.tagName not in {
22642277
IDocTagsToken.LOCATION.value,
@@ -2269,27 +2282,30 @@ def _get_simple_text_block(self, elements: list) -> Optional[str]:
22692282
IDocTagsToken.STRIKETHROUGH.value,
22702283
IDocTagsToken.SUBSCRIPT.value,
22712284
IDocTagsToken.SUPERSCRIPT.value,
2285+
IDocTagsToken.CONTENT.value,
22722286
}:
22732287
return None
2274-
elif tmp := self._get_simple_text_block(el.childNodes):
2288+
elif tmp := self._get_children_simple_text_block(el):
22752289
result = tmp
2276-
elif isinstance(el, Text) and el.data.strip():
2290+
elif isinstance(el, Text) and el.data.strip(): # TODO should still support whitespace-only
22772291
if result is None:
2278-
result = el.data.strip()
2292+
result = el.data if element.tagName == IDocTagsToken.CONTENT.value else el.data.strip()
22792293
else:
22802294
return None
22812295
return result
22822296

22832297
def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optional[NodeItem]) -> None:
22842298
"""Parse text-like tokens (title, text, caption, footnotes, code, formula)."""
2285-
if self._get_simple_text_block(el.childNodes) is None:
2286-
# This text-like element wraps a single inline group; create it directly
2299+
element_children = [
2300+
node for node in el.childNodes if isinstance(node, Element) and node.tagName != IDocTagsToken.LOCATION.value
2301+
]
2302+
2303+
if len(element_children) > 1 or self._get_children_simple_text_block(el) is None:
22872304
self._parse_inline_group(doc=doc, el=el, parent=parent)
22882305
return
22892306

22902307
prov_list = self._extract_provenance(doc=doc, el=el)
22912308
text, formatting = self._extract_text_with_formatting(el)
2292-
text = text.strip()
22932309
if not text:
22942310
return
22952311

@@ -2324,6 +2340,7 @@ def _parse_text_like(self, *, doc: DoclingDocument, el: Element, parent: Optiona
23242340
IDocTagsToken.STRIKETHROUGH.value: DocItemLabel.TEXT,
23252341
IDocTagsToken.SUBSCRIPT.value: DocItemLabel.TEXT,
23262342
IDocTagsToken.SUPERSCRIPT.value: DocItemLabel.TEXT,
2343+
IDocTagsToken.CONTENT.value: DocItemLabel.TEXT,
23272344
}
23282345
):
23292346
is_bold = nm == IDocTagsToken.BOLD.value
@@ -2829,7 +2846,7 @@ def _get_text(self, el: Element) -> str:
28292846
if isinstance(node, Text):
28302847
# Skip pure indentation/pretty-print whitespace
28312848
if node.data.strip():
2832-
out.append(node.data)
2849+
out.append(node.data if el.tagName == IDocTagsToken.CONTENT.value else node.data.strip())
28332850
elif isinstance(node, Element):
28342851
nm = node.tagName
28352852
if nm in {IDocTagsToken.LOCATION.value}:

examples/convert_to_idoctags.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -354,15 +354,14 @@ def _count_yes(key: str) -> int:
354354
# png_path = pngs_dir / f"{idx}_{i}.png"
355355
# __.save(png_path)
356356

357-
for mode in [IDocTagsSerializationMode.HUMAN_FRIENDLY, IDocTagsSerializationMode.LLM_FRIENDLY]:
357+
for indent in [" ", None]:
358358
for esc_mode in [True, False]:
359359
for content in [True, False]:
360360
try:
361361
params_probe = IDocTagsParams()
362362
params_probe.content_types = set(ContentType) if content else set()
363-
params_probe.mode = mode
364363
params_probe.escape_mode = esc_mode
365-
params_probe.pretty_indentation = " " if mode==IDocTagsSerializationMode.HUMAN_FRIENDLY else None
364+
params_probe.pretty_indentation = indent
366365

367366
iser_probe = IDocTagsDocSerializer(doc=doc, params=params_probe)
368367
_ = iser_probe.serialize().text

test/data/doc/cdata_always.gt.idt.xml

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ Affiliation 2]]></text>
2424
<list_text><![CDATA[list item 4]]></list_text>
2525
</list>
2626
<floating_group class="table">
27-
<caption>This is the caption of table 1.</caption>
27+
<caption><![CDATA[This is the caption of table 1.]]></caption>
2828
<otsl>
2929
<fcel/>
3030
<![CDATA[Product]]> <fcel/>
@@ -41,10 +41,10 @@ Affiliation 2]]></text>
4141
</otsl>
4242
</floating_group>
4343
<floating_group class="picture">
44-
<caption>This is the caption of figure 1.</caption>
44+
<caption><![CDATA[This is the caption of figure 1.]]></caption>
4545
</floating_group>
4646
<floating_group class="picture">
47-
<caption>This is the caption of figure 2.</caption>
47+
<caption><![CDATA[This is the caption of figure 2.]]></caption>
4848
</floating_group>
4949
<list ordered="false">
5050
<list_text><![CDATA[item 1 of list]]></list_text>
@@ -112,7 +112,9 @@ Affiliation 2]]></text>
112112
</list>
113113
<text><![CDATA[The end.]]></text>
114114
<text><![CDATA[Simple text]]></text>
115-
<text><![CDATA[ 4 leading spaces, 1 trailing ]]></text>
115+
<text>
116+
<content><![CDATA[ 4 leading spaces, 1 trailing ]]></content>
117+
</text>
116118
<text><![CDATA[Some 'single' quotes]]></text>
117119
<text><![CDATA[Some "double" quotes]]></text>
118120
<text>
@@ -122,7 +124,9 @@ Affiliation 2]]></text>
122124
</meta>
123125
<![CDATA[An ampersand: &]]> </text>
124126
<code><![CDATA[0 == 0]]></code>
125-
<code><![CDATA[ 1 leading space, 4 trailing ]]></code>
127+
<code>
128+
<content><![CDATA[ 1 leading space, 4 trailing ]]></content>
129+
</code>
126130
<code><![CDATA[0 < 1]]></code>
127131
<code class="Python"><![CDATA[42 == 42]]></code>
128132
<code class="Python"><![CDATA[42 < 1337]]></code>

test/data/doc/cdata_when_needed.gt.idt.xml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,9 @@ hyperlink
119119
</list>
120120
<text>The end.</text>
121121
<text>Simple text</text>
122-
<text> 4 leading spaces, 1 trailing </text>
122+
<text>
123+
<content> 4 leading spaces, 1 trailing </content>
124+
</text>
123125
<text><![CDATA[Some 'single' quotes]]></text>
124126
<text><![CDATA[Some "double" quotes]]></text>
125127
<text>
@@ -129,7 +131,9 @@ hyperlink
129131
</meta>
130132
<![CDATA[An ampersand: &]]> </text>
131133
<code>0 == 0</code>
132-
<code> 1 leading space, 4 trailing </code>
134+
<code>
135+
<content> 1 leading space, 4 trailing </content>
136+
</code>
133137
<code><![CDATA[0 < 1]]></code>
134138
<code class="Python">42 == 42</code>
135139
<code class="Python"><![CDATA[42 < 1337]]></code>

test/data/doc/roundtrip_list_item_with_inline_group_init.yaml renamed to test/data/doc/roundtrip_list_item_with_inline_deserialized.yaml

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,13 @@ groups:
8888
$ref: '#/texts/17'
8989
self_ref: '#/groups/6'
9090
key_value_items: []
91-
name: t
92-
pages: {}
91+
name: Document
92+
pages:
93+
'0':
94+
page_no: 0
95+
size:
96+
height: 512.0
97+
width: 512.0
9398
pictures: []
9499
schema_name: DoclingDocument
95100
tables: []
@@ -180,12 +185,12 @@ texts:
180185
- children: []
181186
content_layer: body
182187
label: formula
183-
orig: E=mc^2
188+
orig: 'E=mc^2 '
184189
parent:
185190
$ref: '#/groups/2'
186191
prov: []
187192
self_ref: '#/texts/8'
188-
text: E=mc^2
193+
text: 'E=mc^2 '
189194
- children: []
190195
content_layer: body
191196
label: text

0 commit comments

Comments
 (0)