@@ -4045,7 +4045,7 @@ def iterate_items(
40454045 root = root ,
40464046 with_groups = with_groups ,
40474047 traverse_pictures = traverse_pictures ,
4048- page_no = page_no ,
4048+ page_nrs = { page_no } if page_no is not None else None ,
40494049 included_content_layers = included_content_layers ,
40504050 ):
40514051 yield item , len (stack )
@@ -4055,7 +4055,7 @@ def _iterate_items_with_stack(
40554055 root : Optional [NodeItem ] = None ,
40564056 with_groups : bool = False ,
40574057 traverse_pictures : bool = False ,
4058- page_no : Optional [int ] = None ,
4058+ page_nrs : Optional [set [ int ] ] = None ,
40594059 included_content_layers : Optional [set [ContentLayer ]] = None ,
40604060 _stack : Optional [list [int ]] = None ,
40614061 ) -> typing .Iterable [Tuple [NodeItem , list [int ]]]: # tuple of node and level
@@ -4078,8 +4078,8 @@ def _iterate_items_with_stack(
40784078 and (
40794079 not isinstance (root , DocItem )
40804080 or (
4081- page_no is None
4082- or any (prov .page_no == page_no for prov in root .prov )
4081+ page_nrs is None
4082+ or any (prov .page_no in page_nrs for prov in root .prov )
40834083 )
40844084 )
40854085 and root .content_layer in my_layers
@@ -4113,7 +4113,7 @@ def _iterate_items_with_stack(
41134113 child ,
41144114 with_groups = with_groups ,
41154115 traverse_pictures = traverse_pictures ,
4116- page_no = page_no ,
4116+ page_nrs = page_nrs ,
41174117 _stack = my_stack ,
41184118 included_content_layers = my_layers ,
41194119 )
@@ -5603,7 +5603,9 @@ class _DocIndex(BaseModel):
56035603 def get_item_list (self , key : str ) -> list [NodeItem ]:
56045604 return getattr (self , key )
56055605
5606- def index (self , doc : "DoclingDocument" ) -> None :
5606+ def index (
5607+ self , doc : "DoclingDocument" , page_nrs : Optional [set [int ]] = None
5608+ ) -> None :
56075609
56085610 orig_ref_to_new_ref : dict [str , str ] = {}
56095611 page_delta = self ._max_page - min (doc .pages .keys ()) + 1 if doc .pages else 0
@@ -5614,10 +5616,11 @@ def index(self, doc: "DoclingDocument") -> None:
56145616 self ._names .append (doc .name )
56155617
56165618 # collect items in traversal order
5617- for item , _ in doc .iterate_items (
5619+ for item , _ in doc ._iterate_items_with_stack (
56185620 with_groups = True ,
56195621 traverse_pictures = True ,
56205622 included_content_layers = {c for c in ContentLayer },
5623+ page_nrs = page_nrs ,
56215624 ):
56225625 key = item .self_ref .split ("/" )[1 ]
56235626 is_body = key == "body"
@@ -5686,12 +5689,13 @@ def index(self, doc: "DoclingDocument") -> None:
56865689 # update pages
56875690 new_max_page = None
56885691 for page_nr in doc .pages :
5689- new_page = copy .deepcopy (doc .pages [page_nr ])
5690- new_page_nr = page_nr + page_delta
5691- new_page .page_no = new_page_nr
5692- self .pages [new_page_nr ] = new_page
5693- if new_max_page is None or new_page_nr > new_max_page :
5694- new_max_page = new_page_nr
5692+ if page_nrs is None or page_nr in page_nrs :
5693+ new_page = copy .deepcopy (doc .pages [page_nr ])
5694+ new_page_nr = page_nr + page_delta
5695+ new_page .page_no = new_page_nr
5696+ self .pages [new_page_nr ] = new_page
5697+ if new_max_page is None or new_page_nr > new_max_page :
5698+ new_max_page = new_page_nr
56955699 if new_max_page is not None :
56965700 self ._max_page = new_max_page
56975701
@@ -5715,6 +5719,14 @@ def _normalize_references(self) -> None:
57155719 doc_index .index (doc = self )
57165720 self ._update_from_index (doc_index )
57175721
5722+ def filter (self , page_nrs : Optional [set [int ]] = None ) -> "DoclingDocument" :
5723+ """Create a new document based on the provided filter parameters."""
5724+ doc_index = DoclingDocument ._DocIndex ()
5725+ doc_index .index (doc = self , page_nrs = page_nrs )
5726+ res_doc = DoclingDocument (name = self .name )
5727+ res_doc ._update_from_index (doc_index )
5728+ return res_doc
5729+
57185730 @classmethod
57195731 def concatenate (cls , docs : Sequence ["DoclingDocument" ]) -> "DoclingDocument" :
57205732 """Concatenate multiple documents into a single document."""
0 commit comments