Skip to content

Commit 807d972

Browse files
authored
fix(HTML): fix nested list serialization edge cases (#367)
* fix(HTML): (initial state without fix) fix nested list serialization edge cases

  Signed-off-by: Panos Vagenas <[email protected]>

* actual fix

  Signed-off-by: Panos Vagenas <[email protected]>

---------

Signed-off-by: Panos Vagenas <[email protected]>
1 parent 8433d28 commit 807d972

11 files changed

+4036
-114
lines changed

docling_core/transforms/serializer/html.py

Lines changed: 32 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -130,11 +130,14 @@ def serialize(
130130
doc_serializer: BaseDocSerializer,
131131
doc: DoclingDocument,
132132
is_inline_scope: bool = False,
133+
visited: Optional[set[str]] = None,
133134
**kwargs: Any,
134135
) -> SerializationResult:
135136
"""Serializes the passed text item to HTML."""
136137
params = HTMLParams(**kwargs)
138+
my_visited: set[str] = visited if visited is not None else set()
137139
res_parts: list[SerializationResult] = []
140+
post_processed = False
138141

139142
# Prepare the HTML based on item type
140143
if isinstance(item, TitleItem):
@@ -162,7 +165,28 @@ def serialize(
162165

163166
elif isinstance(item, ListItem):
164167
# List items are handled by list serializer
165-
text_inner = self._prepare_content(item.text)
168+
text_parts: list[str] = []
169+
if item_text := self._prepare_content(item.text):
170+
item_text = doc_serializer.post_process(
171+
text=item_text,
172+
formatting=item.formatting,
173+
hyperlink=item.hyperlink,
174+
)
175+
post_processed = True
176+
text_parts.append(item_text)
177+
nested_parts = [
178+
r.text
179+
for r in doc_serializer.get_parts(
180+
item=item,
181+
is_inline_scope=is_inline_scope,
182+
visited=my_visited,
183+
**kwargs,
184+
)
185+
]
186+
text_parts.extend(nested_parts)
187+
text_inner = "\n".join(text_parts)
188+
if nested_parts:
189+
text_inner = f"\n{text_inner}\n"
166190
text = (
167191
get_html_tag_with_text_direction(
168192
html_tag="li",
@@ -185,11 +209,12 @@ def serialize(
185209
text = get_html_tag_with_text_direction(html_tag="p", text=text_inner)
186210

187211
# Apply formatting and hyperlinks
188-
text = doc_serializer.post_process(
189-
text=text,
190-
formatting=item.formatting,
191-
hyperlink=item.hyperlink,
192-
)
212+
if not post_processed:
213+
text = doc_serializer.post_process(
214+
text=text,
215+
formatting=item.formatting,
216+
hyperlink=item.hyperlink,
217+
)
193218

194219
if text:
195220
text_res = create_ser_result(text=text, span_source=item)
@@ -703,7 +728,6 @@ def serialize(
703728
) -> SerializationResult:
704729
"""Serializes a list to HTML."""
705730
my_visited: set[str] = visited if visited is not None else set()
706-
params = HTMLParams(**kwargs)
707731
# Get all child parts
708732
parts = doc_serializer.get_parts(
709733
item=item,
@@ -713,72 +737,8 @@ def serialize(
713737
**kwargs,
714738
)
715739

716-
# Append nested list to parent list item:
717-
i = 0
718-
while i < len(parts):
719-
prt = parts[i]
720-
if prt.text.startswith(("<ul>", "<ol>")):
721-
for j in range(i - 1, -1, -1):
722-
if parts[j].text.startswith(("<li>", "<li ")) and parts[
723-
j
724-
].text.endswith("</li>"):
725-
before, _, _ = parts[j].text.rpartition("</li>")
726-
parts[j].text = f"{before}\n{prt.text}\n</li>"
727-
break
728-
if j > -1:
729-
parts.pop(i)
730-
else:
731-
i += 1
732-
733740
# Add all child parts
734-
text_res = "\n".join(
735-
[
736-
(
737-
p.text
738-
if (
739-
(
740-
p.text.startswith(("<li>", "<li "))
741-
and p.text.endswith("</li>")
742-
)
743-
or (
744-
p.text.startswith(("<ol>", "<ol "))
745-
and p.text.endswith("</ol>")
746-
)
747-
or (
748-
p.text.startswith(("<ul>", "<ul "))
749-
and p.text.endswith("</ul>")
750-
)
751-
)
752-
else (
753-
get_html_tag_with_text_direction(
754-
html_tag="li",
755-
text=p.text,
756-
attrs=(
757-
{
758-
"style": f"list-style-type: '{grandparent_item.marker} ';"
759-
}
760-
if params.show_original_list_item_marker
761-
and grandparent_item.marker
762-
else {}
763-
),
764-
)
765-
if p.spans
766-
and p.spans[0].item.parent
767-
and isinstance(
768-
(parent_item := p.spans[0].item.parent.resolve(doc)),
769-
InlineGroup,
770-
)
771-
and parent_item.parent
772-
and isinstance(
773-
(grandparent_item := parent_item.parent.resolve(doc)),
774-
ListItem,
775-
)
776-
else f"<li>{p.text}</li>"
777-
)
778-
)
779-
for p in parts
780-
]
781-
)
741+
text_res = "\n".join(p.text for p in parts if p.text)
782742
if text_res:
783743
tag = "ol" if item.first_item_is_enumerated(doc) else "ul"
784744
text_res = f"<{tag}>\n{text_res}\n</{tag}>"

test/data/doc/constructed_doc.embedded.html.gt

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,11 +135,13 @@
135135
<ul>
136136
<li style="list-style-type: '■ ';">list item 1</li>
137137
<li style="list-style-type: '■ ';">list item 2</li>
138-
<li style="list-style-type: '■ ';">list item 3
138+
<li style="list-style-type: '■ ';">
139+
list item 3
139140
<ol>
140141
<li>list item 3.a</li>
141142
<li>list item 3.b</li>
142-
<li>list item 3.c
143+
<li>
144+
list item 3.c
143145
<ol>
144146
<li>list item 3.c.i</li>
145147
</ol>
@@ -160,11 +162,16 @@
160162
</ul>
161163
<ul>
162164
<li style="list-style-type: '■ ';">item 1 of neighboring list</li>
163-
<li style="list-style-type: '■ ';">item 2 of neighboring list
165+
<li style="list-style-type: '■ ';">
166+
item 2 of neighboring list
164167
<ul>
165168
<li style="list-style-type: '□ ';">item 1 of sub list</li>
166-
<li style="list-style-type: '□ ';"><span class='inline-group'>Here a code snippet: <code>print("Hello world")</code> (to be displayed inline)</span></li>
167-
<li style="list-style-type: '□ ';"><span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span></li>
169+
<li style="list-style-type: '□ ';">
170+
<span class='inline-group'>Here a code snippet: <code>print("Hello world")</code> (to be displayed inline)</span>
171+
</li>
172+
<li style="list-style-type: '□ ';">
173+
<span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span>
174+
</li>
168175
</ul>
169176
</li>
170177
</ul>
@@ -188,10 +195,12 @@
188195
<ol>
189196
<li style="list-style-type: '(i) ';">Item 1 in A</li>
190197
<li style="list-style-type: '(ii) ';">Item 2 in A</li>
191-
<li style="list-style-type: '(iii) ';">Item 3 in A
198+
<li style="list-style-type: '(iii) ';">
199+
Item 3 in A
192200
<ol>
193201
<li>Item 1 in B</li>
194-
<li style="list-style-type: '42. ';">Item 2 in B
202+
<li style="list-style-type: '42. ';">
203+
Item 2 in B
195204
<ol>
196205
<li>Item 1 in C</li>
197206
<li>Item 2 in C</li>

test/data/doc/constructed_doc.placeholder.html.gt

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,11 +135,13 @@
135135
<ul>
136136
<li style="list-style-type: '■ ';">list item 1</li>
137137
<li style="list-style-type: '■ ';">list item 2</li>
138-
<li style="list-style-type: '■ ';">list item 3
138+
<li style="list-style-type: '■ ';">
139+
list item 3
139140
<ol>
140141
<li>list item 3.a</li>
141142
<li>list item 3.b</li>
142-
<li>list item 3.c
143+
<li>
144+
list item 3.c
143145
<ol>
144146
<li>list item 3.c.i</li>
145147
</ol>
@@ -160,11 +162,16 @@
160162
</ul>
161163
<ul>
162164
<li style="list-style-type: '■ ';">item 1 of neighboring list</li>
163-
<li style="list-style-type: '■ ';">item 2 of neighboring list
165+
<li style="list-style-type: '■ ';">
166+
item 2 of neighboring list
164167
<ul>
165168
<li style="list-style-type: '□ ';">item 1 of sub list</li>
166-
<li style="list-style-type: '□ ';"><span class='inline-group'>Here a code snippet: <code>print("Hello world")</code> (to be displayed inline)</span></li>
167-
<li style="list-style-type: '□ ';"><span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span></li>
169+
<li style="list-style-type: '□ ';">
170+
<span class='inline-group'>Here a code snippet: <code>print("Hello world")</code> (to be displayed inline)</span>
171+
</li>
172+
<li style="list-style-type: '□ ';">
173+
<span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span>
174+
</li>
168175
</ul>
169176
</li>
170177
</ul>
@@ -188,10 +195,12 @@
188195
<ol>
189196
<li style="list-style-type: '(i) ';">Item 1 in A</li>
190197
<li style="list-style-type: '(ii) ';">Item 2 in A</li>
191-
<li style="list-style-type: '(iii) ';">Item 3 in A
198+
<li style="list-style-type: '(iii) ';">
199+
Item 3 in A
192200
<ol>
193201
<li>Item 1 in B</li>
194-
<li style="list-style-type: '42. ';">Item 2 in B
202+
<li style="list-style-type: '42. ';">
203+
Item 2 in B
195204
<ol>
196205
<li>Item 1 in C</li>
197206
<li>Item 2 in C</li>

test/data/doc/constructed_doc.referenced.html.gt

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,11 +135,13 @@
135135
<ul>
136136
<li style="list-style-type: '■ ';">list item 1</li>
137137
<li style="list-style-type: '■ ';">list item 2</li>
138-
<li style="list-style-type: '■ ';">list item 3
138+
<li style="list-style-type: '■ ';">
139+
list item 3
139140
<ol>
140141
<li>list item 3.a</li>
141142
<li>list item 3.b</li>
142-
<li>list item 3.c
143+
<li>
144+
list item 3.c
143145
<ol>
144146
<li>list item 3.c.i</li>
145147
</ol>
@@ -160,11 +162,16 @@
160162
</ul>
161163
<ul>
162164
<li style="list-style-type: '■ ';">item 1 of neighboring list</li>
163-
<li style="list-style-type: '■ ';">item 2 of neighboring list
165+
<li style="list-style-type: '■ ';">
166+
item 2 of neighboring list
164167
<ul>
165168
<li style="list-style-type: '□ ';">item 1 of sub list</li>
166-
<li style="list-style-type: '□ ';"><span class='inline-group'>Here a code snippet: <code>print("Hello world")</code> (to be displayed inline)</span></li>
167-
<li style="list-style-type: '□ ';"><span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span></li>
169+
<li style="list-style-type: '□ ';">
170+
<span class='inline-group'>Here a code snippet: <code>print("Hello world")</code> (to be displayed inline)</span>
171+
</li>
172+
<li style="list-style-type: '□ ';">
173+
<span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span>
174+
</li>
168175
</ul>
169176
</li>
170177
</ul>
@@ -188,10 +195,12 @@
188195
<ol>
189196
<li style="list-style-type: '(i) ';">Item 1 in A</li>
190197
<li style="list-style-type: '(ii) ';">Item 2 in A</li>
191-
<li style="list-style-type: '(iii) ';">Item 3 in A
198+
<li style="list-style-type: '(iii) ';">
199+
Item 3 in A
192200
<ol>
193201
<li>Item 1 in B</li>
194-
<li style="list-style-type: '42. ';">Item 2 in B
202+
<li style="list-style-type: '42. ';">
203+
Item 2 in B
195204
<ol>
196205
<li>Item 1 in C</li>
197206
<li>Item 2 in C</li>

test/data/doc/constructed_document.yaml.html

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,11 +135,13 @@ <h2>1. Introduction</h2>
135135
<ul>
136136
<li style="list-style-type: '■ ';">list item 1</li>
137137
<li style="list-style-type: '■ ';">list item 2</li>
138-
<li style="list-style-type: '■ ';">list item 3
138+
<li style="list-style-type: '■ ';">
139+
list item 3
139140
<ol>
140141
<li>list item 3.a</li>
141142
<li>list item 3.b</li>
142-
<li>list item 3.c
143+
<li>
144+
list item 3.c
143145
<ol>
144146
<li>list item 3.c.i</li>
145147
</ol>
@@ -160,11 +162,16 @@ <h2>1. Introduction</h2>
160162
</ul>
161163
<ul>
162164
<li style="list-style-type: '■ ';">item 1 of neighboring list</li>
163-
<li style="list-style-type: '■ ';">item 2 of neighboring list
165+
<li style="list-style-type: '■ ';">
166+
item 2 of neighboring list
164167
<ul>
165168
<li style="list-style-type: '□ ';">item 1 of sub list</li>
166-
<li style="list-style-type: '□ ';"><span class='inline-group'>Here a code snippet: <code>print("Hello world")</code> (to be displayed inline)</span></li>
167-
<li style="list-style-type: '□ ';"><span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span></li>
169+
<li style="list-style-type: '□ ';">
170+
<span class='inline-group'>Here a code snippet: <code>print("Hello world")</code> (to be displayed inline)</span>
171+
</li>
172+
<li style="list-style-type: '□ ';">
173+
<span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span>
174+
</li>
168175
</ul>
169176
</li>
170177
</ul>
@@ -188,10 +195,12 @@ <h2>1. Introduction</h2>
188195
<ol>
189196
<li style="list-style-type: '(i) ';">Item 1 in A</li>
190197
<li style="list-style-type: '(ii) ';">Item 2 in A</li>
191-
<li style="list-style-type: '(iii) ';">Item 3 in A
198+
<li style="list-style-type: '(iii) ';">
199+
Item 3 in A
192200
<ol>
193201
<li>Item 1 in B</li>
194-
<li style="list-style-type: '42. ';">Item 2 in B
202+
<li style="list-style-type: '42. ';">
203+
Item 2 in B
195204
<ol>
196205
<li>Item 1 in C</li>
197206
<li>Item 2 in C</li>

test/data/doc/constructed_orig_false.gt.html

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -135,11 +135,13 @@ <h2>1. Introduction</h2>
135135
<ul>
136136
<li>list item 1</li>
137137
<li>list item 2</li>
138-
<li>list item 3
138+
<li>
139+
list item 3
139140
<ol>
140141
<li>list item 3.a</li>
141142
<li>list item 3.b</li>
142-
<li>list item 3.c
143+
<li>
144+
list item 3.c
143145
<ol>
144146
<li>list item 3.c.i</li>
145147
</ol>
@@ -160,11 +162,16 @@ <h2>1. Introduction</h2>
160162
</ul>
161163
<ul>
162164
<li>item 1 of neighboring list</li>
163-
<li>item 2 of neighboring list
165+
<li>
166+
item 2 of neighboring list
164167
<ul>
165168
<li>item 1 of sub list</li>
166-
<li><span class='inline-group'>Here a code snippet: <code>print("Hello world")</code> (to be displayed inline)</span></li>
167-
<li><span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span></li>
169+
<li>
170+
<span class='inline-group'>Here a code snippet: <code>print("Hello world")</code> (to be displayed inline)</span>
171+
</li>
172+
<li>
173+
<span class='inline-group'>Here a formula: <math xmlns="http://www.w3.org/1998/Math/MathML" display="inline"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math> (to be displayed inline)</span>
174+
</li>
168175
</ul>
169176
</li>
170177
</ul>
@@ -188,10 +195,12 @@ <h2>1. Introduction</h2>
188195
<ol>
189196
<li>Item 1 in A</li>
190197
<li>Item 2 in A</li>
191-
<li>Item 3 in A
198+
<li>
199+
Item 3 in A
192200
<ol>
193201
<li>Item 1 in B</li>
194-
<li>Item 2 in B
202+
<li>
203+
Item 2 in B
195204
<ol>
196205
<li>Item 1 in C</li>
197206
<li>Item 2 in C</li>

0 commit comments

Comments
 (0)