@@ -150,20 +150,26 @@ def get_excluded_refs(self, **kwargs) -> list[str]:
150150 return refs
151151
152152 @abstractmethod
153- def serialize_page (self , parts : list [SerializationResult ]) -> SerializationResult :
153+ def serialize_page (
154+ self , * , parts : list [SerializationResult ], ** kwargs
155+ ) -> SerializationResult :
154156 """Serialize a page out of its parts."""
155157 ...
156158
157159 @abstractmethod
158- def serialize_doc (self , pages : list [SerializationResult ]) -> SerializationResult :
160+ def serialize_doc (
161+ self , * , pages : dict [Optional [int ], SerializationResult ], ** kwargs
162+ ) -> SerializationResult :
159163 """Serialize a document out of its pages."""
160164 ...
161165
162166 def _serialize_body (self ) -> SerializationResult :
163167 """Serialize the document body."""
164168 # find page ranges if available; otherwise regard whole doc as a single page
165- last_page : Optional [int ] = None
166- starts : list [int ] = []
169+ prev_start : int = 0
170+ prev_page_nr : Optional [int ] = None
171+ range_by_page_nr : dict [Optional [int ], tuple [int , int ]] = {}
172+
167173 for ix , (item , _ ) in enumerate (
168174 self .doc .iterate_items (
169175 with_groups = True ,
@@ -173,28 +179,30 @@ def _serialize_body(self) -> SerializationResult:
173179 ):
174180 if isinstance (item , DocItem ):
175181 if item .prov :
176- if last_page is None or item .prov [0 ].page_no > last_page :
177- starts .append (ix )
178- last_page = item .prov [0 ].page_no
179- page_ranges = [
180- (
181- (starts [i ] if i > 0 else 0 ),
182- (starts [i + 1 ] if i < len (starts ) - 1 else sys .maxsize ),
183- )
184- for i , _ in enumerate (starts )
185- ] or [
186- (0 , sys .maxsize )
187- ] # use whole range if no pages detected
182+ page_no = item .prov [0 ].page_no
183+ if prev_page_nr is None or page_no > prev_page_nr :
184+ if prev_page_nr is not None : # close previous range
185+ range_by_page_nr [prev_page_nr ] = (prev_start , ix )
186+
187+ prev_start = ix
188+ # the first range starts at the first item that has provenance; could alternatively always start the 1st page from 0:
189+ # prev_start = ix if prev_page_nr is not None else 0
190+
191+ prev_page_nr = page_no
192+
193+ # close the last range (which is the only range if no page numbers were found)
194+ range_by_page_nr [prev_page_nr ] = (prev_start , sys .maxsize )
188195
189- page_results : list [SerializationResult ] = []
190- for page_range in page_ranges :
196+ page_results : dict [Optional [int ], SerializationResult ] = {}
197+ for page_nr in range_by_page_nr :
198+ page_range = range_by_page_nr [page_nr ]
191199 params_to_pass = deepcopy (self .params )
192200 params_to_pass .start_idx = page_range [0 ]
193201 params_to_pass .stop_idx = page_range [1 ]
194202 subparts = self .get_parts (** params_to_pass .model_dump ())
195- page_res = self .serialize_page (subparts )
196- page_results . append ( page_res )
197- res = self .serialize_doc (page_results )
203+ page_res = self .serialize_page (parts = subparts )
204+ page_results [ page_nr ] = page_res
205+ res = self .serialize_doc (pages = page_results )
198206 return res
199207
200208 @override
0 commit comments