Skip to content

Commit c9374e8

Browse files
authored
fix(markdown): add heading formatting, fix code & formula formatting (#336)
* feat(markdown): add formatted headings (show behavior before change)
  Signed-off-by: Panos Vagenas <[email protected]>
* add change and updated test data
  Signed-off-by: Panos Vagenas <[email protected]>
* fix when formatting & escaping are applied (depending on type)
  Signed-off-by: Panos Vagenas <[email protected]>

---------

Signed-off-by: Panos Vagenas <[email protected]>
1 parent b3fb504 commit c9374e8

File tree

5 files changed

+673
-18
lines changed

5 files changed

+673
-18
lines changed

docling_core/transforms/serializer/common.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,6 +349,7 @@ def serialize(
349349
doc_serializer=self,
350350
doc=self.doc,
351351
is_inline_scope=is_inline_scope,
352+
visited=my_visited,
352353
**my_kwargs,
353354
)
354355
if item.self_ref not in self.get_excluded_refs(**kwargs)

docling_core/transforms/serializer/markdown.py

Lines changed: 43 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -106,36 +106,60 @@ def serialize(
106106
doc_serializer: BaseDocSerializer,
107107
doc: DoclingDocument,
108108
is_inline_scope: bool = False,
109+
visited: Optional[set[str]] = None, # refs of visited items
109110
**kwargs: Any,
110111
) -> SerializationResult:
111112
"""Serializes the passed item."""
113+
my_visited = visited if visited is not None else set()
112114
params = MarkdownParams(**kwargs)
113115
res_parts: list[SerializationResult] = []
116+
text = item.text
114117
escape_html = True
115118
escape_underscores = True
116-
if isinstance(item, TitleItem):
117-
text_part = f"# {item.text}"
118-
elif isinstance(item, SectionHeaderItem):
119-
text_part = f"{(item.level + 1) * '#'} {item.text}"
119+
processing_pending = True
120+
if isinstance(item, (TitleItem, SectionHeaderItem)):
121+
# case where processing/formatting should be applied first (in inner scope)
122+
processing_pending = False
123+
if (
124+
text == ""
125+
and len(item.children) == 1
126+
and isinstance(
127+
(child_group := item.children[0].resolve(doc)), InlineGroup
128+
)
129+
):
130+
# case of heading with inline
131+
ser_res = doc_serializer.serialize(item=child_group)
132+
text = ser_res.text
133+
for span in ser_res.spans:
134+
my_visited.add(span.item.self_ref)
135+
else:
136+
text = doc_serializer.post_process(
137+
text=text,
138+
escape_html=escape_html,
139+
escape_underscores=escape_underscores,
140+
formatting=item.formatting,
141+
hyperlink=item.hyperlink,
142+
)
143+
num_hashes = 1 if isinstance(item, TitleItem) else item.level + 1
144+
text_part = f"{num_hashes * '#'} {text}"
120145
elif isinstance(item, CodeItem):
121-
text_part = (
122-
f"`{item.text}`" if is_inline_scope else f"```\n{item.text}\n```"
123-
)
146+
text_part = f"`{text}`" if is_inline_scope else f"```\n{text}\n```"
124147
escape_html = False
125148
escape_underscores = False
126149
elif isinstance(item, FormulaItem):
127-
if item.text:
128-
text_part = f"${item.text}$" if is_inline_scope else f"$${item.text}$$"
150+
if text:
151+
text_part = f"${text}$" if is_inline_scope else f"$${text}$$"
129152
elif item.orig:
130153
text_part = "<!-- formula-not-decoded -->"
131154
else:
132155
text_part = ""
133156
escape_html = False
134157
escape_underscores = False
135158
elif params.wrap_width:
136-
text_part = textwrap.fill(item.text, width=params.wrap_width)
159+
# although wrapping is not guaranteed if post-processing makes changes
160+
text_part = textwrap.fill(text, width=params.wrap_width)
137161
else:
138-
text_part = item.text
162+
text_part = text
139163

140164
if text_part:
141165
text_res = create_ser_result(text=text_part, span_source=item)
@@ -147,13 +171,14 @@ def serialize(
147171
res_parts.append(cap_res)
148172

149173
text = (" " if is_inline_scope else "\n\n").join([r.text for r in res_parts])
150-
text = doc_serializer.post_process(
151-
text=text,
152-
escape_html=escape_html,
153-
escape_underscores=escape_underscores,
154-
formatting=item.formatting,
155-
hyperlink=item.hyperlink,
156-
)
174+
if processing_pending:
175+
text = doc_serializer.post_process(
176+
text=text,
177+
escape_html=escape_html,
178+
escape_underscores=escape_underscores,
179+
formatting=item.formatting,
180+
hyperlink=item.hyperlink,
181+
)
157182
return create_ser_result(text=text, span_source=res_parts)
158183

159184

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# Contribution guideline example
2+
3+
This is simple.
4+
5+
Foo *emphasis* **strong emphasis** ***both*** .
6+
7+
Create your feature branch: `git checkout -b feature/AmazingFeature` .
8+
9+
1. Pull the [**repository**](https://github.com/docling-project/docling) .
10+
2. Create your feature branch ( `git checkout -b feature/AmazingFeature` )
11+
3. Commit your changes ( `git commit -m 'Add some AmazingFeature'` )
12+
4. Push to the branch ( `git push origin feature/AmazingFeature` )
13+
5. Open a Pull Request
14+
6. **Whole list item has same formatting**
15+
7. List item has *mixed or partial* formatting
16+
17+
# *Whole heading is italic*
18+
19+
Some *`formatted_code`*
20+
21+
## *Partially formatted* heading to\_escape `not_to_escape`
22+
23+
[$$E=mc^2$$](https://en.wikipedia.org/wiki/Albert_Einstein)

0 commit comments

Comments
 (0)