Skip to content

Commit 928e5c5

Browse files
authored
fix: fix page breaking when a page starts with a group (#253)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 91677c9 commit 928e5c5

File tree

6 files changed

+305
-81
lines changed

6 files changed

+305
-81
lines changed

docling_core/experimental/serializer/common.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,9 @@ def _iterate_items(
6969
node: Optional[NodeItem] = None,
7070
traverse_pictures: bool = False,
7171
add_page_breaks: bool = False,
72+
visited: Optional[set[str]] = None,
7273
):
74+
my_visited: set[str] = visited if visited is not None else set()
7375
prev_page_nr: Optional[int] = None
7476
page_break_i = 0
7577
for item, _ in doc.iterate_items(
@@ -78,10 +80,33 @@ def _iterate_items(
7880
included_content_layers=layers,
7981
traverse_pictures=traverse_pictures,
8082
):
81-
if isinstance(item, DocItem):
82-
if item.prov:
83+
if add_page_breaks:
84+
if (
85+
isinstance(item, (UnorderedList, OrderedList, InlineGroup))
86+
and item.self_ref not in my_visited
87+
):
88+
# if group starts with new page, yield page break before group node
89+
my_visited.add(item.self_ref)
90+
for it in _iterate_items(
91+
doc=doc,
92+
layers=layers,
93+
node=item,
94+
traverse_pictures=traverse_pictures,
95+
add_page_breaks=add_page_breaks,
96+
visited=my_visited,
97+
):
98+
if isinstance(it, DocItem) and it.prov:
99+
page_no = it.prov[0].page_no
100+
if prev_page_nr is not None and page_no > prev_page_nr:
101+
yield _PageBreakNode(
102+
self_ref=f"#/pb/{page_break_i}",
103+
prev_page=prev_page_nr,
104+
next_page=page_no,
105+
)
106+
break
107+
elif isinstance(item, DocItem) and item.prov:
83108
page_no = item.prov[0].page_no
84-
if add_page_breaks and (prev_page_nr is None or page_no > prev_page_nr):
109+
if prev_page_nr is None or page_no > prev_page_nr:
85110
if prev_page_nr is not None: # close previous range
86111
yield _PageBreakNode(
87112
self_ref=f"#/pb/{page_break_i}",

test/data/doc/activities.gt.html

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,11 @@ <h2>Let's swim!</h2>
147147
<li>-Yet another one</li>
148148
<li>-Stopping it here</li>
149149
</ul>
150+
<p>Some text.</p>
151+
<ul>
152+
<li>-Starting the next page with a list item.</li>
153+
<li>-Second item.</li>
154+
</ul>
150155
</div>
151156
</body>
152157
</html>

test/data/doc/activities.gt.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,10 @@ Hmm, what else…
2626
<!-- page break -->
2727
- -Yet another one
2828
- -Stopping it here
29+
30+
Some text.
31+
32+
<!-- page break -->
33+
34+
- -Starting the next page with a list item.
35+
- -Second item.

0 commit comments

Comments
 (0)