Skip to content

Commit 9dc526d

Browse files
authored
feat: add page filtering to DoclingDocument (#378)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent: d12621d; commit: 9dc526d

File tree

4 files changed

+3367
-13
lines changed

4 files changed

+3367
-13
lines changed

docling_core/types/doc/document.py

Lines changed: 25 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -4045,7 +4045,7 @@ def iterate_items(
40454045
root=root,
40464046
with_groups=with_groups,
40474047
traverse_pictures=traverse_pictures,
4048-
page_no=page_no,
4048+
page_nrs={page_no} if page_no is not None else None,
40494049
included_content_layers=included_content_layers,
40504050
):
40514051
yield item, len(stack)
@@ -4055,7 +4055,7 @@ def _iterate_items_with_stack(
40554055
root: Optional[NodeItem] = None,
40564056
with_groups: bool = False,
40574057
traverse_pictures: bool = False,
4058-
page_no: Optional[int] = None,
4058+
page_nrs: Optional[set[int]] = None,
40594059
included_content_layers: Optional[set[ContentLayer]] = None,
40604060
_stack: Optional[list[int]] = None,
40614061
) -> typing.Iterable[Tuple[NodeItem, list[int]]]: # tuple of node and level
@@ -4078,8 +4078,8 @@ def _iterate_items_with_stack(
40784078
and (
40794079
not isinstance(root, DocItem)
40804080
or (
4081-
page_no is None
4082-
or any(prov.page_no == page_no for prov in root.prov)
4081+
page_nrs is None
4082+
or any(prov.page_no in page_nrs for prov in root.prov)
40834083
)
40844084
)
40854085
and root.content_layer in my_layers
@@ -4113,7 +4113,7 @@ def _iterate_items_with_stack(
41134113
child,
41144114
with_groups=with_groups,
41154115
traverse_pictures=traverse_pictures,
4116-
page_no=page_no,
4116+
page_nrs=page_nrs,
41174117
_stack=my_stack,
41184118
included_content_layers=my_layers,
41194119
)
@@ -5603,7 +5603,9 @@ class _DocIndex(BaseModel):
56035603
def get_item_list(self, key: str) -> list[NodeItem]:
56045604
return getattr(self, key)
56055605

5606-
def index(self, doc: "DoclingDocument") -> None:
5606+
def index(
5607+
self, doc: "DoclingDocument", page_nrs: Optional[set[int]] = None
5608+
) -> None:
56075609

56085610
orig_ref_to_new_ref: dict[str, str] = {}
56095611
page_delta = self._max_page - min(doc.pages.keys()) + 1 if doc.pages else 0
@@ -5614,10 +5616,11 @@ def index(self, doc: "DoclingDocument") -> None:
56145616
self._names.append(doc.name)
56155617

56165618
# collect items in traversal order
5617-
for item, _ in doc.iterate_items(
5619+
for item, _ in doc._iterate_items_with_stack(
56185620
with_groups=True,
56195621
traverse_pictures=True,
56205622
included_content_layers={c for c in ContentLayer},
5623+
page_nrs=page_nrs,
56215624
):
56225625
key = item.self_ref.split("/")[1]
56235626
is_body = key == "body"
@@ -5686,12 +5689,13 @@ def index(self, doc: "DoclingDocument") -> None:
56865689
# update pages
56875690
new_max_page = None
56885691
for page_nr in doc.pages:
5689-
new_page = copy.deepcopy(doc.pages[page_nr])
5690-
new_page_nr = page_nr + page_delta
5691-
new_page.page_no = new_page_nr
5692-
self.pages[new_page_nr] = new_page
5693-
if new_max_page is None or new_page_nr > new_max_page:
5694-
new_max_page = new_page_nr
5692+
if page_nrs is None or page_nr in page_nrs:
5693+
new_page = copy.deepcopy(doc.pages[page_nr])
5694+
new_page_nr = page_nr + page_delta
5695+
new_page.page_no = new_page_nr
5696+
self.pages[new_page_nr] = new_page
5697+
if new_max_page is None or new_page_nr > new_max_page:
5698+
new_max_page = new_page_nr
56955699
if new_max_page is not None:
56965700
self._max_page = new_max_page
56975701

@@ -5715,6 +5719,14 @@ def _normalize_references(self) -> None:
57155719
doc_index.index(doc=self)
57165720
self._update_from_index(doc_index)
57175721

5722+
def filter(self, page_nrs: Optional[set[int]] = None) -> "DoclingDocument":
5723+
"""Create a new document based on the provided filter parameters."""
5724+
doc_index = DoclingDocument._DocIndex()
5725+
doc_index.index(doc=self, page_nrs=page_nrs)
5726+
res_doc = DoclingDocument(name=self.name)
5727+
res_doc._update_from_index(doc_index)
5728+
return res_doc
5729+
57185730
@classmethod
57195731
def concatenate(cls, docs: Sequence["DoclingDocument"]) -> "DoclingDocument":
57205732
"""Concatenate multiple documents into a single document."""

test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.html

Lines changed: 168 additions & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)