Skip to content

Commit 73b9941

Browse files
authored
feat: expose page number in Serialization API (#238)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 0482bac commit 73b9941

File tree

3 files changed

+45
-29
lines changed

3 files changed

+45
-29
lines changed

docling_core/experimental/serializer/common.py

Lines changed: 29 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -150,20 +150,26 @@ def get_excluded_refs(self, **kwargs) -> list[str]:
150150
return refs
151151

152152
@abstractmethod
153-
def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
153+
def serialize_page(
154+
self, *, parts: list[SerializationResult], **kwargs
155+
) -> SerializationResult:
154156
"""Serialize a page out of its parts."""
155157
...
156158

157159
@abstractmethod
158-
def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
160+
def serialize_doc(
161+
self, *, pages: dict[Optional[int], SerializationResult], **kwargs
162+
) -> SerializationResult:
159163
"""Serialize a document out of its pages."""
160164
...
161165

162166
def _serialize_body(self) -> SerializationResult:
163167
"""Serialize the document body."""
164168
# find page ranges if available; otherwise regard whole doc as a single page
165-
last_page: Optional[int] = None
166-
starts: list[int] = []
169+
prev_start: int = 0
170+
prev_page_nr: Optional[int] = None
171+
range_by_page_nr: dict[Optional[int], tuple[int, int]] = {}
172+
167173
for ix, (item, _) in enumerate(
168174
self.doc.iterate_items(
169175
with_groups=True,
@@ -173,28 +179,30 @@ def _serialize_body(self) -> SerializationResult:
173179
):
174180
if isinstance(item, DocItem):
175181
if item.prov:
176-
if last_page is None or item.prov[0].page_no > last_page:
177-
starts.append(ix)
178-
last_page = item.prov[0].page_no
179-
page_ranges = [
180-
(
181-
(starts[i] if i > 0 else 0),
182-
(starts[i + 1] if i < len(starts) - 1 else sys.maxsize),
183-
)
184-
for i, _ in enumerate(starts)
185-
] or [
186-
(0, sys.maxsize)
187-
] # use whole range if no pages detected
182+
page_no = item.prov[0].page_no
183+
if prev_page_nr is None or page_no > prev_page_nr:
184+
if prev_page_nr is not None: # close previous range
185+
range_by_page_nr[prev_page_nr] = (prev_start, ix)
186+
187+
prev_start = ix
188+
# could alternatively always start 1st page from 0:
189+
# prev_start = ix if prev_page_nr is not None else 0
190+
191+
prev_page_nr = page_no
192+
193+
# close last (and single if no pages) range
194+
range_by_page_nr[prev_page_nr] = (prev_start, sys.maxsize)
188195

189-
page_results: list[SerializationResult] = []
190-
for page_range in page_ranges:
196+
page_results: dict[Optional[int], SerializationResult] = {}
197+
for page_nr in range_by_page_nr:
198+
page_range = range_by_page_nr[page_nr]
191199
params_to_pass = deepcopy(self.params)
192200
params_to_pass.start_idx = page_range[0]
193201
params_to_pass.stop_idx = page_range[1]
194202
subparts = self.get_parts(**params_to_pass.model_dump())
195-
page_res = self.serialize_page(subparts)
196-
page_results.append(page_res)
197-
res = self.serialize_doc(page_results)
203+
page_res = self.serialize_page(parts=subparts)
204+
page_results[page_nr] = page_res
205+
res = self.serialize_doc(pages=page_results)
198206
return res
199207

200208
@override

docling_core/experimental/serializer/doctags.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -476,21 +476,25 @@ class DocTagsDocSerializer(DocSerializer):
476476
params: DocTagsParams = DocTagsParams()
477477

478478
@override
479-
def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
479+
def serialize_page(
480+
self, *, parts: list[SerializationResult], **kwargs
481+
) -> SerializationResult:
480482
"""Serialize a page out of its parts."""
481483
delim = _get_delim(params=self.params)
482484
text_res = delim.join([p.text for p in parts])
483485
return SerializationResult(text=text_res)
484486

485487
@override
486-
def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
488+
def serialize_doc(
489+
self, *, pages: dict[Optional[int], SerializationResult], **kwargs
490+
) -> SerializationResult:
487491
"""Serialize a document out of its pages."""
488492
delim = _get_delim(params=self.params)
489493
if self.params.add_page_break:
490494
page_sep = f"{delim}<{DocumentToken.PAGE_BREAK.value}>{delim}"
491-
content = page_sep.join([p.text for p in pages if p.text])
495+
content = page_sep.join([text for k in pages if (text := pages[k].text)])
492496
else:
493-
content = self.serialize_page(parts=pages).text
497+
content = self.serialize_page(parts=list(pages.values())).text
494498
wrap_tag = DocumentToken.DOCUMENT.value
495499
text_res = f"<{wrap_tag}>{content}{delim}</{wrap_tag}>"
496500
return SerializationResult(text=text_res)

docling_core/experimental/serializer/markdown.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -499,17 +499,21 @@ def post_process(
499499
return res
500500

501501
@override
502-
def serialize_page(self, parts: list[SerializationResult]) -> SerializationResult:
502+
def serialize_page(
503+
self, *, parts: list[SerializationResult], **kwargs
504+
) -> SerializationResult:
503505
"""Serialize a page out of its parts."""
504506
text_res = "\n\n".join([p.text for p in parts])
505507
return SerializationResult(text=text_res)
506508

507509
@override
508-
def serialize_doc(self, pages: list[SerializationResult]) -> SerializationResult:
510+
def serialize_doc(
511+
self, *, pages: dict[Optional[int], SerializationResult], **kwargs
512+
) -> SerializationResult:
509513
"""Serialize a document out of its pages."""
510514
if self.params.page_break_placeholder is not None:
511515
sep = f"\n\n{self.params.page_break_placeholder}\n\n"
512-
text_res = sep.join([p.text for p in pages if p.text])
516+
text_res = sep.join([text for k in pages if (text := pages[k].text)])
513517
return SerializationResult(text=text_res)
514518
else:
515-
return self.serialize_page(parts=pages)
519+
return self.serialize_page(parts=list(pages.values()))

0 commit comments

Comments
 (0)