Skip to content

Commit c77c59b

Browse files
authored
fix(markdown): fix case of leading list (#174)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent c560309 commit c77c59b

18 files changed

+735
-549
lines changed

docling_core/types/doc/document.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2488,20 +2488,11 @@ def _ingest_text(text: str, do_escape_html=True, do_escape_underscores=True):
24882488
visited=visited,
24892489
)
24902490
# NOTE: assumes unordered (flag & marker currently in ListItem)
2491-
indent_str = f"{(list_level if components else 0) * indent * ' '}"
2491+
indent_str = list_level * indent * " "
24922492
text = "\n".join(
24932493
[
2494-
(
2495-
# if starting with sublist, promote to top-level
2496-
cpt.lstrip()
2497-
if not components
2498-
else (
2499-
# avoid additional marker on already evaled sublists
2500-
cpt
2501-
if cpt and cpt[0] == " "
2502-
else f"{indent_str}- {cpt}"
2503-
)
2504-
)
2494+
# avoid additional marker on already evaled sublists
2495+
cpt if cpt and cpt[0] == " " else f"{indent_str}- {cpt}"
25052496
for cpt in comps
25062497
]
25072498
)

test/data/doc/constructed_doc.dt

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
<doctag><title>Title of the Document</title>
1+
<doctag><unordered_list><list_item>item of leading list</list_item>
2+
</unordered_list>
3+
<title>Title of the Document</title>
24
<text>Author 1
35
Affiliation 1</text>
46
<text>Author 2
@@ -19,10 +21,10 @@ Affiliation 2</text>
1921
<otsl><fcel>Product<fcel>Years<lcel><nl><ucel><fcel>2016<fcel>2017<nl><fcel>Apple<fcel>49823<fcel>695944<nl><caption>This is the caption of table 1.</caption></otsl>
2022
<picture><caption>This is the caption of figure 1.</caption></picture>
2123
<picture><caption>This is the caption of figure 2.</caption></picture>
22-
<unordered_list><unordered_list><list_item>subitem of list</list_item>
24+
<unordered_list><list_item>item 1 of list</list_item>
2325
</unordered_list>
24-
<list_item>item 1 of list</list_item>
25-
<list_item>item 2 of list</list_item>
26+
<unordered_list><unordered_list><list_item>item 1 of list after empty list</list_item>
27+
<list_item>item 2 of list after empty list</list_item>
2628
</unordered_list>
2729
<unordered_list><list_item>item 1 of neighboring list</list_item>
2830
<list_item>item 2 of neighboring list</list_item>
@@ -35,6 +37,7 @@ Affiliation 2</text>
3537
<formula>E=mc^2</formula>
3638
<paragraph>(to be displayed inline)</paragraph>
3739
</unordered_list>
40+
</unordered_list>
3841
<paragraph>Here a code block:</paragraph>
3942
<code<_unknown_>print("Hello world")</code
4043
<paragraph>Here a formula block:</paragraph>

test/data/doc/constructed_doc.dt.gt

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
<doctag><title>Title of the Document</title>
1+
<doctag><unordered_list><list_item>item of leading list</list_item>
2+
</unordered_list>
3+
<title>Title of the Document</title>
24
<text>Author 1
35
Affiliation 1</text>
46
<text>Author 2
@@ -19,10 +21,10 @@ Affiliation 2</text>
1921
<otsl><fcel>Product<fcel>Years<lcel><nl><ucel><fcel>2016<fcel>2017<nl><fcel>Apple<fcel>49823<fcel>695944<nl><caption>This is the caption of table 1.</caption></otsl>
2022
<picture><caption>This is the caption of figure 1.</caption></picture>
2123
<picture><caption>This is the caption of figure 2.</caption></picture>
22-
<unordered_list><unordered_list><list_item>subitem of list</list_item>
24+
<unordered_list><list_item>item 1 of list</list_item>
2325
</unordered_list>
24-
<list_item>item 1 of list</list_item>
25-
<list_item>item 2 of list</list_item>
26+
<unordered_list><unordered_list><list_item>item 1 of list after empty list</list_item>
27+
<list_item>item 2 of list after empty list</list_item>
2628
</unordered_list>
2729
<unordered_list><list_item>item 1 of neighboring list</list_item>
2830
<list_item>item 2 of neighboring list</list_item>
@@ -35,6 +37,7 @@ Affiliation 2</text>
3537
<formula>E=mc^2</formula>
3638
<paragraph>(to be displayed inline)</paragraph>
3739
</unordered_list>
40+
</unordered_list>
3841
<paragraph>Here a code block:</paragraph>
3942
<code<_unknown_>print("Hello world")</code
4043
<paragraph>Here a formula block:</paragraph>

test/data/doc/constructed_doc.embedded.html.gt

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@
6969
}
7070
</style>
7171
</head>
72+
<ul>
73+
<li>item of leading list</li>
74+
</ul>
7275
<h1>Title of the Document</h1>
7376
<p>Author 1<br>Affiliation 1</p>
7477
<p>Author 2<br>Affiliation 2</p>
@@ -92,11 +95,12 @@
9295
<figure><figcaption>This is the caption of figure 1.</figcaption></figure>
9396
<figure><figcaption>This is the caption of figure 2.</figcaption><img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAIklEQVR4nO3BAQ0AAADCoPdPbQ8HFAAAAAAAAAAAAAAA8G4wQAABiwCo9wAAAABJRU5ErkJggg=="></figure>
9497
<ul>
95-
<ul>
96-
<li>subitem of list</li>
97-
</ul>
9898
<li>item 1 of list</li>
99-
<li>item 2 of list</li>
99+
</ul>
100+
<ul>
101+
<ul>
102+
<li>item 1 of list after empty list</li>
103+
<li>item 2 of list after empty list</li>
100104
</ul>
101105
<ul>
102106
<li>item 1 of neighboring list</li>
@@ -111,6 +115,7 @@
111115
<div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mi>E</mi><mo>&#x0003D;</mo><mi>m</mi><msup><mi>c</mi><mn>2</mn></msup></mrow><annotation encoding="TeX">E=mc^2</annotation></math></div>
112116
<p>(to be displayed inline)</p>
113117
</ul>
118+
</ul>
114119
<p>Here a code block:</p>
115120
<pre><code>print("Hello world")</code></pre>
116121
<p>Here a formula block:</p>

0 commit comments

Comments
 (0)