Commit e4766e5
fix: use pages dict for page break generation to support failed pages
Only generate page breaks for pages present in the DoclingDocument.pages dict.
This enables proper page break markers for failed pages (added by docling)
while maintaining compatibility with the filter() method (which removes pages).

Changes:
- Add page_numbers parameter to _yield_page_breaks() function
- Extract page_numbers from doc.pages.keys() in _iterate_items()
- Update test data to include failed pages in the pages dict
- Update test expectations for the new behavior

Signed-off-by: jhchoi1182 <jhchoi1182@gmail.com>
1 parent: a93d0ec

5 files changed (+70 -53 lines)

docling_core/transforms/serializer/common.py

17 additions & 3 deletions
```diff
@@ -84,23 +84,34 @@ def _yield_page_breaks(
     next_page: int,
     lvl: int,
     start_index: int,
+    page_numbers: Optional[set[int]] = None,
 ) -> Iterable[tuple[_PageBreakNode, int, int]]:
     """Yield page break nodes for each page in range (prev_page, next_page].
 
     Generates one PageBreakNode per page transition. For example, if prev_page=1
-    and next_page=4, yields 3 page breaks for pages 2, 3, and 4.
+    and next_page=4, and page_numbers contains pages 1, 2, 3, 4, yields 3 page
+    breaks for pages 2, 3, and 4.
+
+    If page_numbers is provided, only generates page breaks for pages that exist
+    in page_numbers. This ensures filtered documents (via filter()) don't generate
+    spurious page breaks for excluded pages.
 
     Args:
         prev_page: The last seen page number (1-based physical index).
         next_page: The current page number (1-based physical index).
         lvl: The nesting level for the yielded nodes.
         start_index: The starting index for page break node IDs.
+        page_numbers: Optional set of valid page numbers. If provided, only pages
+            in this set will generate page breaks.
 
     Yields:
         Tuples of (PageBreakNode, level, next_index) for each page transition.
     """
     idx = start_index
     for page in range(prev_page + 1, next_page + 1):
+        # Skip pages that are not in the document's pages dict
+        if page_numbers is not None and page not in page_numbers:
+            continue
         yield (
             _PageBreakNode(
                 self_ref=f"#/pb/{idx}",
@@ -124,6 +135,9 @@ def _iterate_items(
     my_visited: set[str] = visited if visited is not None else set()
     prev_page_nr: Optional[int] = None
     page_break_i = 0
+    # Get the set of valid page numbers from the document's pages dict
+    # This ensures filtered documents don't generate spurious page breaks
+    page_numbers: set[int] = set(doc.pages.keys())
     for item, lvl in doc.iterate_items(
         root=node,
         with_groups=True,
@@ -146,7 +160,7 @@ def _iterate_items(
             page_no = it.prov[0].page_no
             if prev_page_nr is not None and page_no > prev_page_nr:
                 for pb_node, pb_lvl, page_break_i in _yield_page_breaks(
-                    prev_page_nr, page_no, lvl, page_break_i
+                    prev_page_nr, page_no, lvl, page_break_i, page_numbers
                 ):
                     yield pb_node, pb_lvl
         # update previous page number to avoid duplicate page breaks
@@ -157,7 +171,7 @@ def _iterate_items(
         if prev_page_nr is None or page_no > prev_page_nr:
             if prev_page_nr is not None:  # close previous range
                 for pb_node, pb_lvl, page_break_i in _yield_page_breaks(
-                    prev_page_nr, page_no, lvl, page_break_i
+                    prev_page_nr, page_no, lvl, page_break_i, page_numbers
                 ):
                     yield pb_node, pb_lvl
         prev_page_nr = page_no
```
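
For intuition, here is a minimal standalone sketch of the new skip logic. It is simplified to yield bare page numbers; the real `_yield_page_breaks` yields `_PageBreakNode` tuples, and the name `yield_page_breaks` below is purely illustrative:

```python
from typing import Iterable, Optional


def yield_page_breaks(
    prev_page: int,
    next_page: int,
    page_numbers: Optional[set[int]] = None,
) -> Iterable[int]:
    """Yield one break per page in (prev_page, next_page], skipping
    pages absent from page_numbers (e.g. pages removed by filter())."""
    for page in range(prev_page + 1, next_page + 1):
        if page_numbers is not None and page not in page_numbers:
            continue
        yield page


# All pages present (failed pages kept in doc.pages): breaks for 2, 3, 4.
assert list(yield_page_breaks(1, 4, {1, 2, 3, 4})) == [2, 3, 4]

# Pages 2 and 3 removed via filter(): no spurious breaks for them.
assert list(yield_page_breaks(1, 4, {1, 4})) == [4]
```

Because `page_numbers` defaults to `None` (meaning "no filtering"), existing callers that do not pass it keep the old behavior.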

test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.html

10 additions & 0 deletions (large diff not rendered by default)

test/data/doc/skipped_1page.json

13 additions & 21 deletions
```diff
@@ -72,10 +72,7 @@
           "b": 672.1780283203125,
           "coord_origin": "BOTTOMLEFT"
         },
-        "charspan": [
-          0,
-          30
-        ]
+        "charspan": [0, 30]
       }
     ],
     "orig": "78 ICT\u00a0\uae30\uc220\ubcc0\ud654\uc5d0\u00a0\ub530\ub978\u00a0\ubbf8\ub798\u00a0\ubcf4\uc548\uae30\uc220\u00a0\uc804\ub9dd\u00a0\ubcf4\uace0\uc11c",
@@ -99,10 +96,7 @@
           "b": 615.8270283203125,
           "coord_origin": "BOTTOMLEFT"
         },
-        "charspan": [
-          0,
-          3
-        ]
+        "charspan": [0, 3]
       }
     ],
     "orig": "\ubcf4\uc548\ubd84",
@@ -127,10 +121,7 @@
           "b": 672.1780283203125,
           "coord_origin": "BOTTOMLEFT"
         },
-        "charspan": [
-          0,
-          30
-        ]
+        "charspan": [0, 30]
       }
     ],
     "orig": "84 ICT\u00a0\uae30\uc220\ubcc0\ud654\uc5d0\u00a0\ub530\ub978\u00a0\ubbf8\ub798\u00a0\ubcf4\uc548\uae30\uc220\u00a0\uc804\ub9dd\u00a0\ubcf4\uace0\uc11c",
@@ -154,10 +145,7 @@
           "b": 359.95302832031246,
           "coord_origin": "BOTTOMLEFT"
         },
-        "charspan": [
-          0,
-          164
-        ]
+        "charspan": [0, 164]
       }
     ],
     "orig": "- \u00a0 AI\u00a0\uae30\uc220\uc744\u00a0\ud65c\uc6a9\ud574\u00a0\uc624\ud0d0\uc728\uc744\u00a0\uc904\uc774\uace0,\u00a0\ube60\ub978\u00a0\ubd84\uc11d\u00a0\ud0d0\uc9c0\ub97c\u00a0\ub118\uc5b4\u00a0\uc790\ub3d9\uc801\uc778\u00a0\uaca9\ub9ac\uae4c\uc9c0\u00a0\uc81c\uacf5\ud558\ub294\u00a0\ubc29\ud5a5\uc73c\ub85c\u00a0\uc9c4\ud654 - \u00a0 \uac01\uc885\u00a0\uc0ac\uc774\ubc84\uc704\ud611\uc5d0\u00a0\ub300\ud55c\u00a0\ud558\ub098\uc758\u00a0\uae30\ubc95\u00a0\ub610\ub294\u00a0\uc54c\uace0\ub9ac\uc998\uc774\u00a0\uc544\ub2cc,\u00a0\uce68\ud574\u00a0\ub300\uc751\uc758\u00a0\uac01\u00a0\uc694\uc18c\ubcc4\ub85c\u00a0\ucd5c\uc801\ud654\ub41c\u00a0\uc54c\uace0\ub9ac\uc998\uacfc \uae30\ubc95,\u00a0\uc774\uc804\uc5d0\u00a0\uc124\uba85\ud55c\u00a0\ud559\uc2b5\u00a0\ubc29\ubc95\uc744\u00a0\uc9c0\uc6d0\ud558\uace0\u00a0\uc774\ub97c\u00a0\ud1b5\ud569\ud560\u00a0\uc218\u00a0\uc788\ub294\u00a0\uae30\uc220\u00a0\ud544\uc694",
@@ -185,10 +173,7 @@
           "b": 620.6187973022461,
           "coord_origin": "BOTTOMLEFT"
         },
-        "charspan": [
-          0,
-          0
-        ]
+        "charspan": [0, 0]
       }
     ],
     "captions": [],
@@ -208,6 +193,13 @@
       },
       "page_no": 1
     },
+    "2": {
+      "size": {
+        "width": 515.906005859375,
+        "height": 728.5040283203125
+      },
+      "page_no": 2
+    },
     "3": {
       "size": {
         "width": 515.906005859375,
@@ -216,4 +208,4 @@
       "page_no": 3
     }
   }
-}
+}
```

test/data/doc/skipped_2pages.json

20 additions & 21 deletions
```diff
@@ -72,10 +72,7 @@
           "b": 672.1780283203125,
           "coord_origin": "BOTTOMLEFT"
         },
-        "charspan": [
-          0,
-          30
-        ]
+        "charspan": [0, 30]
       }
     ],
     "orig": "78 ICT\u00a0\uae30\uc220\ubcc0\ud654\uc5d0\u00a0\ub530\ub978\u00a0\ubbf8\ub798\u00a0\ubcf4\uc548\uae30\uc220\u00a0\uc804\ub9dd\u00a0\ubcf4\uace0\uc11c",
@@ -99,10 +96,7 @@
           "b": 615.8270283203125,
           "coord_origin": "BOTTOMLEFT"
         },
-        "charspan": [
-          0,
-          3
-        ]
+        "charspan": [0, 3]
       }
     ],
     "orig": "\ubcf4\uc548\ubd84",
@@ -127,10 +121,7 @@
           "b": 672.1780283203125,
           "coord_origin": "BOTTOMLEFT"
         },
-        "charspan": [
-          0,
-          30
-        ]
+        "charspan": [0, 30]
       }
     ],
     "orig": "84 ICT\u00a0\uae30\uc220\ubcc0\ud654\uc5d0\u00a0\ub530\ub978\u00a0\ubbf8\ub798\u00a0\ubcf4\uc548\uae30\uc220\u00a0\uc804\ub9dd\u00a0\ubcf4\uace0\uc11c",
@@ -154,10 +145,7 @@
           "b": 359.95302832031246,
           "coord_origin": "BOTTOMLEFT"
         },
-        "charspan": [
-          0,
-          164
-        ]
+        "charspan": [0, 164]
       }
     ],
     "orig": "- \u00a0 AI\u00a0\uae30\uc220\uc744\u00a0\ud65c\uc6a9\ud574\u00a0\uc624\ud0d0\uc728\uc744\u00a0\uc904\uc774\uace0,\u00a0\ube60\ub978\u00a0\ubd84\uc11d\u00a0\ud0d0\uc9c0\ub97c\u00a0\ub118\uc5b4\u00a0\uc790\ub3d9\uc801\uc778\u00a0\uaca9\ub9ac\uae4c\uc9c0\u00a0\uc81c\uacf5\ud558\ub294\u00a0\ubc29\ud5a5\uc73c\ub85c\u00a0\uc9c4\ud654 - \u00a0 \uac01\uc885\u00a0\uc0ac\uc774\ubc84\uc704\ud611\uc5d0\u00a0\ub300\ud55c\u00a0\ud558\ub098\uc758\u00a0\uae30\ubc95\u00a0\ub610\ub294\u00a0\uc54c\uace0\ub9ac\uc998\uc774\u00a0\uc544\ub2cc,\u00a0\uce68\ud574\u00a0\ub300\uc751\uc758\u00a0\uac01\u00a0\uc694\uc18c\ubcc4\ub85c\u00a0\ucd5c\uc801\ud654\ub41c\u00a0\uc54c\uace0\ub9ac\uc998\uacfc \uae30\ubc95,\u00a0\uc774\uc804\uc5d0\u00a0\uc124\uba85\ud55c\u00a0\ud559\uc2b5\u00a0\ubc29\ubc95\uc744\u00a0\uc9c0\uc6d0\ud558\uace0\u00a0\uc774\ub97c\u00a0\ud1b5\ud569\ud560\u00a0\uc218\u00a0\uc788\ub294\u00a0\uae30\uc220\u00a0\ud544\uc694",
@@ -185,10 +173,7 @@
           "b": 620.6191101074219,
           "coord_origin": "BOTTOMLEFT"
         },
-        "charspan": [
-          0,
-          0
-        ]
+        "charspan": [0, 0]
       }
     ],
     "captions": [],
@@ -208,6 +193,20 @@
       },
       "page_no": 1
     },
+    "2": {
+      "size": {
+        "width": 515.906005859375,
+        "height": 728.5040283203125
+      },
+      "page_no": 2
+    },
+    "3": {
+      "size": {
+        "width": 515.906005859375,
+        "height": 728.5040283203125
+      },
+      "page_no": 3
+    },
     "4": {
       "size": {
         "width": 515.906005859375,
@@ -216,4 +215,4 @@
       "page_no": 4
     }
   }
-}
+}
```

test/test_page_break_skipped_pages.py

10 additions & 8 deletions
```diff
@@ -35,26 +35,28 @@ def test_normal_4pages_has_all_pages():
     ], f"Expected pages [1, 2, 3, 4], got {page_numbers}"
 
 
-def test_skipped_2pages_has_only_two_pages():
-    """Test that skipped_2pages.json has only 2 pages (pages 2 and 3 failed to parse)."""
+def test_skipped_2pages_has_all_pages_including_failed():
+    """Test that skipped_2pages.json has all 4 pages (pages 2 and 3 failed to parse but are still present)."""
     src = Path("./test/data/doc/skipped_2pages.json")
     doc = DoclingDocument.load_from_json(src)
 
     page_numbers = list(doc.pages.keys())
 
-    assert len(page_numbers) == 2, f"Expected 2 pages in skipped_2pages.json, got {len(page_numbers)}"
-    assert page_numbers == [1, 4], f"Expected pages [1, 4], got {page_numbers}"
+    # After fix: all pages including failed ones should be present in pages dict
+    assert len(page_numbers) == 4, f"Expected 4 pages in skipped_2pages.json, got {len(page_numbers)}"
+    assert page_numbers == [1, 2, 3, 4], f"Expected pages [1, 2, 3, 4], got {page_numbers}"
 
 
-def test_skipped_1page_has_two_pages():
-    """Test that skipped_1page.json has 2 pages (page 2 failed to parse)."""
+def test_skipped_1page_has_all_pages_including_failed():
+    """Test that skipped_1page.json has all 3 pages (page 2 failed to parse but is still present)."""
     src = Path("./test/data/doc/skipped_1page.json")
     doc = DoclingDocument.load_from_json(src)
 
     page_numbers = list(doc.pages.keys())
 
-    assert len(page_numbers) == 2, f"Expected 2 pages in skipped_1page.json, got {len(page_numbers)}"
-    assert page_numbers == [1, 3], f"Expected pages [1, 3], got {page_numbers}"
+    # After fix: all pages including failed ones should be present in pages dict
+    assert len(page_numbers) == 3, f"Expected 3 pages in skipped_1page.json, got {len(page_numbers)}"
+    assert page_numbers == [1, 2, 3], f"Expected pages [1, 2, 3], got {page_numbers}"
 
 
 # =============================================================================
```
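
To see the effect end to end, a quick check along the lines of the updated tests (the DoclingDocument import path is assumed here, not shown in this commit):

```python
from pathlib import Path

from docling_core.types.doc import DoclingDocument  # import path assumed

doc = DoclingDocument.load_from_json(Path("./test/data/doc/skipped_1page.json"))

# With the fix, the failed page 2 stays in the pages dict, so
# _iterate_items() derives page_numbers = {1, 2, 3} and a page break
# is emitted for page 2 as well during serialization.
assert sorted(doc.pages.keys()) == [1, 2, 3]
```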
