2525 BaseTableSerializer ,
2626 BaseTextSerializer ,
2727 SerializationResult ,
28+ Span ,
2829)
2930from docling_core .types .doc .document import (
3031 DOCUMENT_TOKENS_EXPORT_LABELS ,
4950_DEFAULT_LAYERS = {cl for cl in ContentLayer }
5051
5152
def create_ser_result(
    *,
    text: str = "",
    span_source: Union[DocItem, list[SerializationResult], None] = None,
) -> SerializationResult:
    """Create a `SerializationResult` from a text and a span source.

    Args:
        text: the text to use. Defaults to "".
        span_source: the item or list of results to use as span source.
            A `DocItem` produces a single span wrapping that item. A list of
            results contributes the spans of each result, in first-seen
            order, with repeated spans dropped. Defaults to None, which is
            treated like an empty list (no spans).

    Returns:
        The created `SerializationResult`.
    """
    spans: list[Span]
    if isinstance(span_source, DocItem):
        spans = [Span(item=span_source)]
    else:
        # `None` stands in for the former mutable `[]` default so that no list
        # object is shared across calls; an explicit empty list behaves the same.
        results: list[SerializationResult] = span_source or []
        spans = []
        for ser_res in results:
            for span in ser_res.spans:
                # List membership compares by equality, so this keeps the first
                # occurrence of each span and preserves insertion order.
                if span not in spans:
                    spans.append(span)
    return SerializationResult(
        text=text,
        spans=spans,
    )
81+
82+
5283class CommonParams (BaseModel ):
5384 """Common serialization parameters."""
5485
@@ -217,14 +248,17 @@ def serialize(
217248 ) -> SerializationResult :
218249 """Serialize a given node."""
219250 my_visited : set [str ] = visited if visited is not None else set ()
220- empty_res = SerializationResult (text = "" )
251+ my_kwargs = self .params .merge_with_patch (patch = kwargs ).model_dump ()
252+ empty_res = create_ser_result ()
221253 if item is None or item == self .doc .body :
222254 if self .doc .body .self_ref not in my_visited :
223255 my_visited .add (self .doc .body .self_ref )
224256 return self ._serialize_body ()
225257 else :
226258 return empty_res
227259
260+ my_visited .add (item .self_ref )
261+
228262 ########
229263 # groups
230264 ########
@@ -236,7 +270,7 @@ def serialize(
236270 list_level = list_level ,
237271 is_inline_scope = is_inline_scope ,
238272 visited = my_visited ,
239- ** kwargs ,
273+ ** my_kwargs ,
240274 )
241275 elif isinstance (item , InlineGroup ):
242276 part = self .inline_serializer .serialize (
@@ -245,7 +279,7 @@ def serialize(
245279 doc = self .doc ,
246280 list_level = list_level ,
247281 visited = my_visited ,
248- ** kwargs ,
282+ ** my_kwargs ,
249283 )
250284 ###########
251285 # doc items
@@ -261,7 +295,7 @@ def serialize(
261295 doc_serializer = self ,
262296 doc = self .doc ,
263297 is_inline_scope = is_inline_scope ,
264- ** kwargs ,
298+ ** my_kwargs ,
265299 )
266300 if item .self_ref not in self .get_excluded_refs (** kwargs )
267301 else empty_res
@@ -271,36 +305,36 @@ def serialize(
271305 item = item ,
272306 doc_serializer = self ,
273307 doc = self .doc ,
274- ** kwargs ,
308+ ** my_kwargs ,
275309 )
276310 elif isinstance (item , PictureItem ):
277311 part = self .picture_serializer .serialize (
278312 item = item ,
279313 doc_serializer = self ,
280314 doc = self .doc ,
281315 visited = my_visited ,
282- ** kwargs ,
316+ ** my_kwargs ,
283317 )
284318 elif isinstance (item , KeyValueItem ):
285319 part = self .key_value_serializer .serialize (
286320 item = item ,
287321 doc_serializer = self ,
288322 doc = self .doc ,
289- ** kwargs ,
323+ ** my_kwargs ,
290324 )
291325 elif isinstance (item , FormItem ):
292326 part = self .form_serializer .serialize (
293327 item = item ,
294328 doc_serializer = self ,
295329 doc = self .doc ,
296- ** kwargs ,
330+ ** my_kwargs ,
297331 )
298332 else :
299333 part = self .fallback_serializer .serialize (
300334 item = item ,
301335 doc_serializer = self ,
302336 doc = self .doc ,
303- ** kwargs ,
337+ ** my_kwargs ,
304338 )
305339 return part
306340
@@ -401,15 +435,16 @@ def serialize_captions(
401435 ) -> SerializationResult :
402436 """Serialize the item's captions."""
403437 params = self .params .merge_with_patch (patch = kwargs )
438+ results : list [SerializationResult ] = []
404439 if DocItemLabel .CAPTION in params .labels :
405- text_parts : list [ str ] = [
406- it .text
440+ results = [
441+ create_ser_result ( text = it .text , span_source = it )
407442 for cap in item .captions
408443 if isinstance (it := cap .resolve (self .doc ), TextItem )
409444 and it .self_ref not in self .get_excluded_refs (** kwargs )
410445 ]
411- text_res = params .caption_delim .join (text_parts )
446+ text_res = params .caption_delim .join ([ r . text for r in results ] )
412447 text_res = self .post_process (text = text_res )
413448 else :
414449 text_res = ""
415- return SerializationResult (text = text_res )
450+ return create_ser_result (text = text_res , span_source = results )
0 commit comments