Skip to content

Commit b60ac19

Browse files
authored
feat: add support for heading with inline in HTML & DocTags (#379)
* test: refactor serialization tests
  Signed-off-by: Panos Vagenas <[email protected]>
* further refactor serialization tests
  Signed-off-by: Panos Vagenas <[email protected]>
* rename inline test file for markdown
  Signed-off-by: Panos Vagenas <[email protected]>
* simplify Markdown handling of inline in heading / list item
  Signed-off-by: Panos Vagenas <[email protected]>
* fix input test data & make it more interesting
  Signed-off-by: Panos Vagenas <[email protected]>
* add HTML & DocTags support
  Signed-off-by: Panos Vagenas <[email protected]>
* generalize and refactor HTML ser case
  Signed-off-by: Panos Vagenas <[email protected]>
* generalize MD serializer too
  Signed-off-by: Panos Vagenas <[email protected]>
---------
Signed-off-by: Panos Vagenas <[email protected]>
1 parent c0b0c4c commit b60ac19

File tree

8 files changed

+434
-174
lines changed

8 files changed

+434
-174
lines changed

docling_core/transforms/serializer/doctags.py

Lines changed: 18 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@
4242
PictureMoleculeData,
4343
PictureTabularChartData,
4444
ProvenanceItem,
45+
SectionHeaderItem,
4546
TableItem,
4647
TextItem,
4748
)
@@ -94,11 +95,11 @@ def serialize(
9495
item: TextItem,
9596
doc_serializer: BaseDocSerializer,
9697
doc: DoclingDocument,
98+
visited: Optional[set[str]] = None,
9799
**kwargs: Any,
98100
) -> SerializationResult:
99101
"""Serializes the passed item."""
100-
from docling_core.types.doc.document import SectionHeaderItem
101-
102+
my_visited = visited if visited is not None else set()
102103
params = DocTagsParams(**kwargs)
103104
wrap_tag: Optional[str] = DocumentToken.create_token_name_from_doc_item_label(
104105
label=item.label,
@@ -116,12 +117,21 @@ def serialize(
116117
parts.append(location)
117118

118119
if params.add_content:
119-
text_part = item.text
120-
text_part = doc_serializer.post_process(
121-
text=text_part,
122-
formatting=item.formatting,
123-
hyperlink=item.hyperlink,
124-
)
120+
if (
121+
item.text == ""
122+
and len(item.children) == 1
123+
and isinstance(
124+
(child_group := item.children[0].resolve(doc)), InlineGroup
125+
)
126+
):
127+
ser_res = doc_serializer.serialize(item=child_group, visited=my_visited)
128+
text_part = ser_res.text
129+
else:
130+
text_part = doc_serializer.post_process(
131+
text=item.text,
132+
formatting=item.formatting,
133+
hyperlink=item.hyperlink,
134+
)
125135

126136
if isinstance(item, CodeItem):
127137
language_token = DocumentToken.get_code_language_token(

docling_core/transforms/serializer/html.py

Lines changed: 73 additions & 78 deletions
Original file line number | Diff line number | Diff line change
@@ -139,41 +139,61 @@ def serialize(
139139
res_parts: list[SerializationResult] = []
140140
post_processed = False
141141

142-
# Prepare the HTML based on item type
143-
if isinstance(item, TitleItem):
144-
text_inner = self._prepare_content(item.text)
145-
text = get_html_tag_with_text_direction(html_tag="h1", text=text_inner)
142+
has_inline_repr = (
143+
item.text == ""
144+
and len(item.children) == 1
145+
and isinstance((child_group := item.children[0].resolve(doc)), InlineGroup)
146+
)
147+
if has_inline_repr:
148+
text = doc_serializer.serialize(item=child_group, visited=my_visited).text
149+
post_processed = True
150+
else:
151+
text = item.text
152+
if not isinstance(item, (CodeItem, FormulaItem)):
153+
text = html.escape(text, quote=False)
154+
text = text.replace("\n", "<br>")
146155

147-
elif isinstance(item, SectionHeaderItem):
148-
section_level = min(item.level + 1, 6)
149-
text_inner = self._prepare_content(item.text)
156+
# Prepare the HTML based on item type
157+
if isinstance(item, (TitleItem, SectionHeaderItem)):
158+
section_level = (
159+
min(item.level + 1, 6) if isinstance(item, SectionHeaderItem) else 1
160+
)
150161
text = get_html_tag_with_text_direction(
151-
html_tag=f"h{section_level}", text=text_inner
162+
html_tag=f"h{section_level}", text=text
152163
)
153164

154165
elif isinstance(item, FormulaItem):
155166
text = self._process_formula(
156167
item=item,
168+
text=text,
169+
orig=item.orig,
157170
doc=doc,
158171
image_mode=params.image_mode,
159172
formula_to_mathml=params.formula_to_mathml,
160173
is_inline_scope=is_inline_scope,
161174
)
162175

163176
elif isinstance(item, CodeItem):
164-
text = self._process_code(item=item, is_inline_scope=is_inline_scope)
177+
text = (
178+
f"<code>{text}</code>"
179+
if is_inline_scope
180+
else f"<pre><code>{text}</code></pre>"
181+
)
165182

166183
elif isinstance(item, ListItem):
167184
# List items are handled by list serializer
168185
text_parts: list[str] = []
169-
if item_text := self._prepare_content(item.text):
170-
item_text = doc_serializer.post_process(
171-
text=item_text,
172-
formatting=item.formatting,
173-
hyperlink=item.hyperlink,
174-
)
175-
post_processed = True
176-
text_parts.append(item_text)
186+
if text:
187+
if has_inline_repr:
188+
text = f"\n{text}\n"
189+
else:
190+
text = doc_serializer.post_process(
191+
text=text,
192+
formatting=item.formatting,
193+
hyperlink=item.hyperlink,
194+
)
195+
post_processed = True
196+
text_parts.append(text)
177197
nested_parts = [
178198
r.text
179199
for r in doc_serializer.get_parts(
@@ -184,29 +204,26 @@ def serialize(
184204
)
185205
]
186206
text_parts.extend(nested_parts)
187-
text_inner = "\n".join(text_parts)
207+
text = "\n".join(text_parts)
188208
if nested_parts:
189-
text_inner = f"\n{text_inner}\n"
209+
text = f"\n{text}\n"
190210
text = (
191211
get_html_tag_with_text_direction(
192212
html_tag="li",
193-
text=text_inner,
213+
text=text,
194214
attrs=(
195215
{"style": f"list-style-type: '{item.marker} ';"}
196216
if params.show_original_list_item_marker and item.marker
197217
else {}
198218
),
199219
)
200-
if text_inner
220+
if text
201221
else ""
202222
)
203223

204-
elif is_inline_scope:
205-
text = self._prepare_content(item.text)
206-
else:
224+
elif not is_inline_scope:
207225
# Regular text item
208-
text_inner = self._prepare_content(item.text)
209-
text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)
226+
text = get_html_tag_with_text_direction(html_tag="p", text=text)
210227

211228
# Apply formatting and hyperlinks
212229
if not post_processed:
@@ -227,66 +244,44 @@ def serialize(
227244

228245
return create_ser_result(text=text, span_source=res_parts)
229246

230-
def _prepare_content(
231-
self, text: str, do_escape_html=True, do_replace_newline=True
232-
) -> str:
233-
"""Prepare text content for HTML inclusion."""
234-
if do_escape_html:
235-
text = html.escape(text, quote=False)
236-
if do_replace_newline:
237-
text = text.replace("\n", "<br>")
238-
return text
239-
240-
def _process_code(
241-
self,
242-
item: CodeItem,
243-
is_inline_scope: bool,
244-
) -> str:
245-
code_text = self._prepare_content(
246-
item.text, do_escape_html=False, do_replace_newline=False
247-
)
248-
if is_inline_scope:
249-
text = f"<code>{code_text}</code>"
250-
else:
251-
text = f"<pre><code>{code_text}</code></pre>"
252-
253-
return text
254-
255247
def _process_formula(
256248
self,
257-
item: FormulaItem,
249+
*,
250+
item: DocItem,
251+
text: str,
252+
orig: str,
258253
doc: DoclingDocument,
259254
image_mode: ImageRefMode,
260255
formula_to_mathml: bool,
261256
is_inline_scope: bool,
262257
) -> str:
263258
"""Process a formula item to HTML/MathML."""
264-
math_formula = self._prepare_content(
265-
item.text, do_escape_html=False, do_replace_newline=False
266-
)
267-
268259
# If formula is empty, try to use an image fallback
269-
if item.text == "" and item.orig != "":
270-
img_fallback = self._get_formula_image_fallback(item, doc)
271-
if (
272-
image_mode == ImageRefMode.EMBEDDED
273-
and len(item.prov) > 0
274-
and img_fallback
275-
):
276-
return img_fallback
260+
if (
261+
text == ""
262+
and orig != ""
263+
and len(item.prov) > 0
264+
and image_mode == ImageRefMode.EMBEDDED
265+
and (
266+
img_fallback := self._get_formula_image_fallback(
267+
item=item, orig=orig, doc=doc
268+
)
269+
)
270+
):
271+
return img_fallback
277272

278273
# Try to generate MathML
279-
if formula_to_mathml and math_formula:
274+
elif formula_to_mathml and text:
280275
try:
281276
# Set display mode based on context
282277
display_mode = "inline" if is_inline_scope else "block"
283278
mathml_element = latex2mathml.converter.convert_to_element(
284-
math_formula, display=display_mode
279+
text, display=display_mode
285280
)
286281
annotation = SubElement(
287282
mathml_element, "annotation", dict(encoding="TeX")
288283
)
289-
annotation.text = math_formula
284+
annotation.text = text
290285
mathml = unescape(tostring(mathml_element, encoding="unicode"))
291286

292287
# Don't wrap in div for inline formulas
@@ -296,40 +291,40 @@ def _process_formula(
296291
return f"<div>{mathml}</div>"
297292

298293
except Exception:
299-
img_fallback = self._get_formula_image_fallback(item, doc)
294+
img_fallback = self._get_formula_image_fallback(
295+
item=item, orig=orig, doc=doc
296+
)
300297
if (
301298
image_mode == ImageRefMode.EMBEDDED
302299
and len(item.prov) > 0
303300
and img_fallback
304301
):
305302
return img_fallback
306-
elif math_formula:
307-
return f"<pre>{math_formula}</pre>"
303+
elif text:
304+
return f"<pre>{text}</pre>"
308305
else:
309306
return "<pre>Formula not decoded</pre>"
310307

311308
_logger.warning("Could not parse formula with MathML")
312309

313310
# Fallback options if we got here
314-
if math_formula and is_inline_scope:
315-
return f"<code>{math_formula}</code>"
316-
elif math_formula and (not is_inline_scope):
317-
f"<pre>{math_formula}</pre>"
311+
if text and is_inline_scope:
312+
return f"<code>{text}</code>"
313+
elif text and (not is_inline_scope):
314+
f"<pre>{text}</pre>"
318315
elif is_inline_scope:
319316
return '<span class="formula-not-decoded">Formula not decoded</span>'
320317

321318
return '<div class="formula-not-decoded">Formula not decoded</div>'
322319

323320
def _get_formula_image_fallback(
324-
self, item: TextItem, doc: DoclingDocument
321+
self, *, item: DocItem, orig: str, doc: DoclingDocument
325322
) -> Optional[str]:
326323
"""Try to get an image fallback for a formula."""
327324
item_image = item.get_image(doc=doc)
328325
if item_image is not None:
329326
img_ref = ImageRef.from_pil(item_image, dpi=72)
330-
return (
331-
"<figure>" f'<img src="{img_ref.uri}" alt="{item.orig}" />' "</figure>"
332-
)
327+
return "<figure>" f'<img src="{img_ref.uri}" alt="{orig}" />' "</figure>"
333328
return None
334329

335330

docling_core/transforms/serializer/markdown.py

Lines changed: 16 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -124,33 +124,32 @@ def serialize(
124124
my_visited = visited if visited is not None else set()
125125
params = MarkdownParams(**kwargs)
126126
res_parts: list[SerializationResult] = []
127-
text = item.text
128127
escape_html = True
129128
escape_underscores = True
130-
processing_pending = True
131-
if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
132-
# case where processing/formatting should be applied first (in inner scope)
129+
130+
has_inline_repr = (
131+
item.text == ""
132+
and len(item.children) == 1
133+
and isinstance((child_group := item.children[0].resolve(doc)), InlineGroup)
134+
)
135+
if has_inline_repr:
136+
text = doc_serializer.serialize(item=child_group, visited=my_visited).text
133137
processing_pending = False
134-
if (
135-
text == ""
136-
and len(item.children) == 1
137-
and isinstance(
138-
(child_group := item.children[0].resolve(doc)), InlineGroup
139-
)
140-
):
141-
# case of inline within heading / list item
142-
ser_res = doc_serializer.serialize(item=child_group)
143-
text = ser_res.text
144-
for span in ser_res.spans:
145-
my_visited.add(span.item.self_ref)
146-
else:
138+
else:
139+
text = item.text
140+
processing_pending = True
141+
142+
if isinstance(item, (ListItem, TitleItem, SectionHeaderItem)):
143+
if not has_inline_repr:
144+
# case where processing/formatting should be applied first (in inner scope)
147145
text = doc_serializer.post_process(
148146
text=text,
149147
escape_html=escape_html,
150148
escape_underscores=escape_underscores,
151149
formatting=item.formatting,
152150
hyperlink=item.hyperlink,
153151
)
152+
processing_pending = False
154153

155154
if isinstance(item, ListItem):
156155
pieces: list[str] = []

0 commit comments

Comments
 (0)