Skip to content

Commit c49a50e

Browse files
authored
fix(markdown): fix case of empty page break string (#298)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 81760f5 commit c49a50e

File tree

5 files changed

+155
-2
lines changed

5 files changed

+155
-2
lines changed

docling_core/transforms/serializer/markdown.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -535,14 +535,14 @@ def serialize_doc(
535535
) -> SerializationResult:
536536
"""Serialize a document out of its parts."""
537537
text_res = "\n\n".join([p.text for p in parts if p.text])
538-
if self.params.page_break_placeholder:
538+
if self.requires_page_break():
539539
page_sep = self.params.page_break_placeholder or ""
540540
for full_match, _, _ in self._get_page_breaks(text=text_res):
541541
text_res = text_res.replace(full_match, page_sep)
542542

543543
return create_ser_result(text=text_res, span_source=parts)
544544

545545
@override
546-
def requires_page_break(self):
546+
def requires_page_break(self) -> bool:
547547
"""Whether to add page breaks."""
548548
return self.params.page_break_placeholder is not None

test/data/doc/activities_pb_empty.gt.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
## Summer activities
2+
3+
## Swimming in the lake
4+
5+
Duck
6+
7+
Figure 1: This is a cute duckling
8+
9+
## Let's swim!
10+
11+
To get started with swimming, first lay down in a water and try not to drown:
12+
13+
- ∞ You can relax and look around
14+
- ∞ Paddle about
15+
- ∞ Enjoy summer warmth
16+
17+
Also, don't forget:
18+
19+
- 1. Wear sunglasses
20+
- 2. Don't forget to drink water
21+
- 3. Use sun cream
22+
23+
Hmm, what else…
24+
25+
- -Another activity item
26+
27+
- -Yet another one
28+
- -Stopping it here
29+
30+
Some text.
31+
32+
33+
34+
- -Starting the next page with a list item.
35+
- -Second item.

test/data/doc/activities_pb_non_empty.gt.md

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
## Summer activities
2+
3+
## Swimming in the lake
4+
5+
Duck
6+
7+
Figure 1: This is a cute duckling
8+
9+
## Let's swim!
10+
11+
To get started with swimming, first lay down in a water and try not to drown:
12+
13+
- ∞ You can relax and look around
14+
- ∞ Paddle about
15+
- ∞ Enjoy summer warmth
16+
17+
Also, don't forget:
18+
19+
- 1. Wear sunglasses
20+
- 2. Don't forget to drink water
21+
- 3. Use sun cream
22+
23+
Hmm, what else…
24+
25+
- -Another activity item
26+
<!-- page-break -->
27+
- -Yet another one
28+
- -Stopping it here
29+
30+
Some text.
31+
32+
<!-- page-break -->
33+
34+
- -Starting the next page with a list item.
35+
- -Second item.

test/data/doc/activities_pb_none.gt.md

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
## Summer activities
2+
3+
## Swimming in the lake
4+
5+
Duck
6+
7+
Figure 1: This is a cute duckling
8+
9+
## Let's swim!
10+
11+
To get started with swimming, first lay down in a water and try not to drown:
12+
13+
- ∞ You can relax and look around
14+
- ∞ Paddle about
15+
- ∞ Enjoy summer warmth
16+
17+
Also, don't forget:
18+
19+
- 1. Wear sunglasses
20+
- 2. Don't forget to drink water
21+
- 3. Use sun cream
22+
23+
Hmm, what else…
24+
25+
- -Another activity item
26+
- -Yet another one
27+
- -Stopping it here
28+
29+
Some text.
30+
31+
- -Starting the next page with a list item.
32+
- -Second item.

test/test_serialization.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,57 @@ def test_md_cross_page_list_page_break():
4848
verify(exp_file=src.parent / f"{src.stem}.gt.md", actual=actual)
4949

5050

51+
def test_md_cross_page_list_page_break_none():
52+
src = Path("./test/data/doc/activities.json")
53+
doc = DoclingDocument.load_from_json(src)
54+
55+
ser = MarkdownDocSerializer(
56+
doc=doc,
57+
params=MarkdownParams(
58+
image_mode=ImageRefMode.PLACEHOLDER,
59+
image_placeholder="<!-- image -->",
60+
page_break_placeholder=None,
61+
labels=_DEFAULT_LABELS - {DocItemLabel.PICTURE},
62+
),
63+
)
64+
actual = ser.serialize().text
65+
verify(exp_file=src.parent / f"{src.stem}_pb_none.gt.md", actual=actual)
66+
67+
68+
def test_md_cross_page_list_page_break_empty():
69+
src = Path("./test/data/doc/activities.json")
70+
doc = DoclingDocument.load_from_json(src)
71+
72+
ser = MarkdownDocSerializer(
73+
doc=doc,
74+
params=MarkdownParams(
75+
image_mode=ImageRefMode.PLACEHOLDER,
76+
image_placeholder="<!-- image -->",
77+
page_break_placeholder="",
78+
labels=_DEFAULT_LABELS - {DocItemLabel.PICTURE},
79+
),
80+
)
81+
actual = ser.serialize().text
82+
verify(exp_file=src.parent / f"{src.stem}_pb_empty.gt.md", actual=actual)
83+
84+
85+
def test_md_cross_page_list_page_break_non_empty():
86+
src = Path("./test/data/doc/activities.json")
87+
doc = DoclingDocument.load_from_json(src)
88+
89+
ser = MarkdownDocSerializer(
90+
doc=doc,
91+
params=MarkdownParams(
92+
image_mode=ImageRefMode.PLACEHOLDER,
93+
image_placeholder="<!-- image -->",
94+
page_break_placeholder="<!-- page-break -->",
95+
labels=_DEFAULT_LABELS - {DocItemLabel.PICTURE},
96+
),
97+
)
98+
actual = ser.serialize().text
99+
verify(exp_file=src.parent / f"{src.stem}_pb_non_empty.gt.md", actual=actual)
100+
101+
51102
def test_md_cross_page_list_page_break_p2():
52103
src = Path("./test/data/doc/activities.json")
53104
doc = DoclingDocument.load_from_json(src)

0 commit comments

Comments
 (0)