Skip to content

Commit 7ed4d22

Browse files
authored
fix(markdown): fix ordered list numbering (#200)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 9a3c8c1 commit 7ed4d22

18 files changed

+847
-12
lines changed

docling_core/experimental/serializer/markdown.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -310,17 +310,25 @@ def serialize(
310310
is_inline_scope=is_inline_scope,
311311
visited=my_visited,
312312
)
313+
sep = "\n"
314+
my_parts: list[SerializationResult] = []
315+
for p in parts:
316+
if p.text and p.text[0] == " " and my_parts:
317+
my_parts[-1].text = sep.join([my_parts[-1].text, p.text]) # update last
318+
else:
319+
my_parts.append(p)
320+
313321
indent_str = list_level * self.indent * " "
314322
is_ol = isinstance(item, OrderedList)
315-
text_res = "\n".join(
323+
text_res = sep.join(
316324
[
317325
# avoid additional marker on already evaled sublists
318326
(
319327
c.text
320328
if c.text and c.text[0] == " "
321329
else f"{indent_str}{f'{i + 1}.' if is_ol else '-'} {c.text}"
322330
)
323-
for i, c in enumerate(parts)
331+
for i, c in enumerate(my_parts)
324332
]
325333
)
326334
return SerializationResult(text=text_res)

test/data/doc/constructed_doc.dt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,17 @@ Affiliation 2</text>
5050
<text>hyperlink</text>
5151
<text>&</text>
5252
<text>everything at the same time.</text>
53+
<ordered_list><list_item>Item 1 in A</list_item>
54+
<list_item>Item 2 in A</list_item>
55+
<list_item>Item 3 in A</list_item>
56+
<ordered_list><list_item>Item 1 in B</list_item>
57+
<list_item>Item 2 in B</list_item>
58+
<ordered_list><list_item>Item 1 in C</list_item>
59+
<list_item>Item 2 in C</list_item>
60+
</ordered_list>
61+
</ordered_list>
62+
<list_item>Item 3 in B</list_item>
63+
</ordered_list>
64+
<list_item>Item 4 in A</list_item>
5365
<text>The end.</text>
5466
</doctag>

test/data/doc/constructed_doc.dt.gt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,5 +50,17 @@ Affiliation 2</text>
5050
<text>hyperlink</text>
5151
<text>&</text>
5252
<text>everything at the same time.</text>
53+
<ordered_list><list_item>Item 1 in A</list_item>
54+
<list_item>Item 2 in A</list_item>
55+
<list_item>Item 3 in A</list_item>
56+
<ordered_list><list_item>Item 1 in B</list_item>
57+
<list_item>Item 2 in B</list_item>
58+
<ordered_list><list_item>Item 1 in C</list_item>
59+
<list_item>Item 2 in C</list_item>
60+
</ordered_list>
61+
</ordered_list>
62+
<list_item>Item 3 in B</list_item>
63+
</ordered_list>
64+
<list_item>Item 4 in A</list_item>
5365
<text>The end.</text>
5466
</doctag>

test/data/doc/constructed_doc.embedded.html.gt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,5 +128,20 @@
128128
<p>hyperlink</p>
129129
<p>&amp;</p>
130130
<p>everything at the same time.</p>
131+
<ol>
132+
<li>Item 1 in A</li>
133+
<li>Item 2 in A</li>
134+
<li>Item 3 in A</li>
135+
<ol>
136+
<li>Item 1 in B</li>
137+
<li>Item 2 in B</li>
138+
<ol>
139+
<li>Item 1 in C</li>
140+
<li>Item 2 in C</li>
141+
</ol>
142+
</ol>
143+
<li>Item 3 in B</li>
144+
</ol>
145+
<li>Item 4 in A</li>
131146
<p>The end.</p>
132147
</html>

test/data/doc/constructed_doc.embedded.json.gt

Lines changed: 198 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,10 @@
7373
"$ref": "#/groups/12"
7474
},
7575
{
76-
"$ref": "#/texts/41"
76+
"$ref": "#/groups/13"
77+
},
78+
{
79+
"$ref": "#/texts/50"
7780
}
7881
],
7982
"content_layer": "body",
@@ -324,6 +327,66 @@
324327
"content_layer": "body",
325328
"name": "group",
326329
"label": "inline"
330+
},
331+
{
332+
"self_ref": "#/groups/13",
333+
"parent": {
334+
"$ref": "#/body"
335+
},
336+
"children": [
337+
{
338+
"$ref": "#/texts/41"
339+
},
340+
{
341+
"$ref": "#/texts/42"
342+
},
343+
{
344+
"$ref": "#/texts/43"
345+
},
346+
{
347+
"$ref": "#/texts/49"
348+
}
349+
],
350+
"content_layer": "body",
351+
"name": "list A",
352+
"label": "ordered_list"
353+
},
354+
{
355+
"self_ref": "#/groups/14",
356+
"parent": {
357+
"$ref": "#/texts/43"
358+
},
359+
"children": [
360+
{
361+
"$ref": "#/texts/44"
362+
},
363+
{
364+
"$ref": "#/texts/45"
365+
},
366+
{
367+
"$ref": "#/texts/48"
368+
}
369+
],
370+
"content_layer": "body",
371+
"name": "list B",
372+
"label": "ordered_list"
373+
},
374+
{
375+
"self_ref": "#/groups/15",
376+
"parent": {
377+
"$ref": "#/texts/45"
378+
},
379+
"children": [
380+
{
381+
"$ref": "#/texts/46"
382+
},
383+
{
384+
"$ref": "#/texts/47"
385+
}
386+
],
387+
"content_layer": "body",
388+
"name": "list C",
389+
"label": "ordered_list"
327390
}
328391
],
329392
"texts": [
@@ -911,6 +974,140 @@
911974
},
912975
{
913976
"self_ref": "#/texts/41",
977+
"parent": {
978+
"$ref": "#/groups/13"
979+
},
980+
"children": [],
981+
"content_layer": "body",
982+
"label": "list_item",
983+
"prov": [],
984+
"orig": "Item 1 in A",
985+
"text": "Item 1 in A",
986+
"enumerated": true,
987+
"marker": "-"
988+
},
989+
{
990+
"self_ref": "#/texts/42",
991+
"parent": {
992+
"$ref": "#/groups/13"
993+
},
994+
"children": [],
995+
"content_layer": "body",
996+
"label": "list_item",
997+
"prov": [],
998+
"orig": "Item 2 in A",
999+
"text": "Item 2 in A",
1000+
"enumerated": true,
1001+
"marker": "-"
1002+
},
1003+
{
1004+
"self_ref": "#/texts/43",
1005+
"parent": {
1006+
"$ref": "#/groups/13"
1007+
},
1008+
"children": [
1009+
{
1010+
"$ref": "#/groups/14"
1011+
}
1012+
],
1013+
"content_layer": "body",
1014+
"label": "list_item",
1015+
"prov": [],
1016+
"orig": "Item 3 in A",
1017+
"text": "Item 3 in A",
1018+
"enumerated": true,
1019+
"marker": "-"
1020+
},
1021+
{
1022+
"self_ref": "#/texts/44",
1023+
"parent": {
1024+
"$ref": "#/groups/14"
1025+
},
1026+
"children": [],
1027+
"content_layer": "body",
1028+
"label": "list_item",
1029+
"prov": [],
1030+
"orig": "Item 1 in B",
1031+
"text": "Item 1 in B",
1032+
"enumerated": true,
1033+
"marker": "-"
1034+
},
1035+
{
1036+
"self_ref": "#/texts/45",
1037+
"parent": {
1038+
"$ref": "#/groups/14"
1039+
},
1040+
"children": [
1041+
{
1042+
"$ref": "#/groups/15"
1043+
}
1044+
],
1045+
"content_layer": "body",
1046+
"label": "list_item",
1047+
"prov": [],
1048+
"orig": "Item 2 in B",
1049+
"text": "Item 2 in B",
1050+
"enumerated": true,
1051+
"marker": "-"
1052+
},
1053+
{
1054+
"self_ref": "#/texts/46",
1055+
"parent": {
1056+
"$ref": "#/groups/15"
1057+
},
1058+
"children": [],
1059+
"content_layer": "body",
1060+
"label": "list_item",
1061+
"prov": [],
1062+
"orig": "Item 1 in C",
1063+
"text": "Item 1 in C",
1064+
"enumerated": true,
1065+
"marker": "-"
1066+
},
1067+
{
1068+
"self_ref": "#/texts/47",
1069+
"parent": {
1070+
"$ref": "#/groups/15"
1071+
},
1072+
"children": [],
1073+
"content_layer": "body",
1074+
"label": "list_item",
1075+
"prov": [],
1076+
"orig": "Item 2 in C",
1077+
"text": "Item 2 in C",
1078+
"enumerated": true,
1079+
"marker": "-"
1080+
},
1081+
{
1082+
"self_ref": "#/texts/48",
1083+
"parent": {
1084+
"$ref": "#/groups/14"
1085+
},
1086+
"children": [],
1087+
"content_layer": "body",
1088+
"label": "list_item",
1089+
"prov": [],
1090+
"orig": "Item 3 in B",
1091+
"text": "Item 3 in B",
1092+
"enumerated": true,
1093+
"marker": "-"
1094+
},
1095+
{
1096+
"self_ref": "#/texts/49",
1097+
"parent": {
1098+
"$ref": "#/groups/13"
1099+
},
1100+
"children": [],
1101+
"content_layer": "body",
1102+
"label": "list_item",
1103+
"prov": [],
1104+
"orig": "Item 4 in A",
1105+
"text": "Item 4 in A",
1106+
"enumerated": true,
1107+
"marker": "-"
1108+
},
1109+
{
1110+
"self_ref": "#/texts/50",
9141111
"parent": {
9151112
"$ref": "#/body"
9161113
},

test/data/doc/constructed_doc.embedded.md.gt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,4 +63,14 @@ $$E=mc^2$$
6363

6464
Some formatting chops: **bold** *italic* underline ~~strikethrough~~ [hyperlink](.) &amp; [~~***everything at the same time.***~~](https://github.com/DS4SD/docling)
6565

66+
1. Item 1 in A
67+
2. Item 2 in A
68+
3. Item 3 in A
69+
1. Item 1 in B
70+
2. Item 2 in B
71+
1. Item 1 in C
72+
2. Item 2 in C
73+
3. Item 3 in B
74+
4. Item 4 in A
75+
6676
The end.

0 commit comments

Comments
 (0)