33import html
44from enum import Enum
55from pathlib import Path
6- from typing import Optional , Union
6+ from typing import Dict , List , Optional , Union
77
88from pydantic import AnyUrl , BaseModel
99from typing_extensions import override
2323from docling_core .experimental .serializer .common import CommonParams , DocSerializer
2424from docling_core .types .doc .document import (
2525 CodeItem ,
26+ DocItem ,
2627 DoclingDocument ,
2728 Formatting ,
2829 FormItem ,
@@ -54,7 +55,6 @@ class Mode(str, Enum):
5455 MINIFIED = "minified"
5556 HUMAN_FRIENDLY = "human_friendly"
5657
57- new_line : str = ""
5858 xsize : int = 500
5959 ysize : int = 500
6060 add_location : bool = True
@@ -67,13 +67,13 @@ class Mode(str, Enum):
6767 mode : Mode = Mode .HUMAN_FRIENDLY
6868
6969
70- def _get_delim (mode : DocTagsParams . Mode ) -> str :
71- if mode == DocTagsParams .Mode .HUMAN_FRIENDLY :
70+ def _get_delim (params : DocTagsParams ) -> str :
71+ if params . mode == DocTagsParams .Mode .HUMAN_FRIENDLY :
7272 delim = "\n "
73- elif mode == DocTagsParams .Mode .MINIFIED :
73+ elif params . mode == DocTagsParams .Mode .MINIFIED :
7474 delim = ""
7575 else :
76- raise RuntimeError (f"Unknown DocTags mode: { mode } " )
76+ raise RuntimeError (f"Unknown DocTags mode: { params . mode } " )
7777 return delim
7878
7979
@@ -102,7 +102,6 @@ def serialize(
102102 if params .add_location :
103103 location = item .get_location_tokens (
104104 doc = doc ,
105- new_line = params .new_line ,
106105 xsize = params .xsize ,
107106 ysize = params .ysize ,
108107 )
@@ -158,7 +157,6 @@ def serialize(
158157 if params .add_location :
159158 body += item .get_location_tokens (
160159 doc = doc ,
161- new_line = params .new_line ,
162160 xsize = params .xsize ,
163161 ysize = params .ysize ,
164162 )
@@ -178,15 +176,14 @@ def serialize(
178176 body += f"<{ DocumentToken .CAPTION .value } >"
179177 for caption in item .captions :
180178 if caption .cref not in doc_serializer .get_excluded_refs (** kwargs ):
181- body + = caption .resolve (doc ). get_location_tokens (
182- doc = doc ,
183- new_line = params . new_line ,
184- xsize = params .xsize ,
185- ysize = params .ysize ,
186- )
179+ if isinstance ( cap : = caption .resolve (doc ), DocItem ):
180+ body += cap . get_location_tokens (
181+ doc = doc ,
182+ xsize = params .xsize ,
183+ ysize = params .ysize ,
184+ )
187185 body += f"{ text .strip ()} "
188186 body += f"</{ DocumentToken .CAPTION .value } >"
189- body += f"{ params .new_line } "
190187
191188 if body :
192189 body = _wrap (text = body , wrap_tag = DocumentToken .OTSL .value )
@@ -208,15 +205,13 @@ def serialize(
208205 ) -> SerializationResult :
209206 """Serializes the passed item."""
210207 params = DocTagsParams (** kwargs )
211-
212208 parts : list [str ] = []
213209
214210 if item .self_ref not in doc_serializer .get_excluded_refs (** kwargs ):
215211 body = ""
216212 if params .add_location :
217213 body += item .get_location_tokens (
218214 doc = doc ,
219- new_line = params .new_line ,
220215 xsize = params .xsize ,
221216 ysize = params .ysize ,
222217 )
@@ -246,13 +241,13 @@ def serialize(
246241 body = ""
247242 for caption in item .captions :
248243 if caption .cref not in doc_serializer .get_excluded_refs (** kwargs ):
249- body + = caption .resolve (doc ). get_location_tokens (
250- doc = doc ,
251- new_line = params . new_line ,
252- xsize = params .xsize ,
253- ysize = params .ysize ,
254- )
255- body += f"{ text .strip ()} "
244+ if isinstance ( cap : = caption .resolve (doc ), DocItem ):
245+ body += cap . get_location_tokens (
246+ doc = doc ,
247+ xsize = params .xsize ,
248+ ysize = params .ysize ,
249+ )
250+ body += f"{ text .strip ()} "
256251 if body :
257252 body = _wrap (text = body , wrap_tag = DocumentToken .CAPTION .value )
258253 parts .append (body )
@@ -279,9 +274,56 @@ def serialize(
279274 ** kwargs ,
280275 ) -> SerializationResult :
281276 """Serializes the passed item."""
282- # TODO add actual implementation
283- text_res = ""
284- return SerializationResult (text = text_res )
277+ params = DocTagsParams (** kwargs )
278+
279+ body = ""
280+
281+ page_no = 1
282+ if len (item .prov ) > 0 :
283+ page_no = item .prov [0 ].page_no
284+
285+ if params .add_location :
286+ body += item .get_location_tokens (
287+ doc = doc ,
288+ xsize = params .xsize ,
289+ ysize = params .ysize ,
290+ )
291+
292+ # mapping from source_cell_id to a list of target_cell_ids
293+ source_to_targets : Dict [int , List [int ]] = {}
294+ for link in item .graph .links :
295+ source_to_targets .setdefault (link .source_cell_id , []).append (
296+ link .target_cell_id
297+ )
298+
299+ for cell in item .graph .cells :
300+ cell_txt = ""
301+ if cell .prov is not None :
302+ if len (doc .pages .keys ()):
303+ page_w , page_h = doc .pages [page_no ].size .as_tuple ()
304+ cell_txt += DocumentToken .get_location (
305+ bbox = cell .prov .bbox .to_top_left_origin (page_h ).as_tuple (),
306+ page_w = page_w ,
307+ page_h = page_h ,
308+ xsize = params .xsize ,
309+ ysize = params .ysize ,
310+ )
311+ if params .add_content :
312+ cell_txt += cell .text .strip ()
313+
314+ if cell .cell_id in source_to_targets :
315+ targets = source_to_targets [cell .cell_id ]
316+ for target in targets :
317+ # TODO centralize token creation
318+ cell_txt += f"<link_{ target } >"
319+
320+ # TODO centralize token creation
321+ tok = f"{ cell .label .value } _{ cell .cell_id } "
322+ cell_txt = _wrap (text = cell_txt , wrap_tag = tok )
323+ body += cell_txt
324+
325+ body = _wrap (body , DocumentToken .KEY_VALUE_REGION .value )
326+ return SerializationResult (text = body )
285327
286328
287329class DocTagsFormSerializer (BaseFormSerializer ):
@@ -329,7 +371,7 @@ def serialize(
329371 visited = my_visited ,
330372 ** kwargs ,
331373 )
332- delim = _get_delim (mode = params . mode )
374+ delim = _get_delim (params = params )
333375 if parts :
334376 text_res = delim .join (
335377 [
@@ -374,7 +416,7 @@ def serialize(
374416 ** kwargs ,
375417 )
376418 wrap_tag = DocumentToken .INLINE .value
377- delim = _get_delim (mode = params . mode )
419+ delim = _get_delim (params = params )
378420 text_res = delim .join ([p .text for p in parts if p .text ])
379421 if text_res :
380422 text_res = f"{ text_res } { delim } "
@@ -437,14 +479,14 @@ def post_process(
437479 @override
438480 def serialize_page (self , parts : list [SerializationResult ]) -> SerializationResult :
439481 """Serialize a page out of its parts."""
440- delim = _get_delim (mode = self .params . mode )
482+ delim = _get_delim (params = self .params )
441483 text_res = delim .join ([p .text for p in parts ])
442484 return SerializationResult (text = text_res )
443485
444486 @override
445487 def serialize_doc (self , pages : list [SerializationResult ]) -> SerializationResult :
446488 """Serialize a document out of its pages."""
447- delim = _get_delim (mode = self .params . mode )
489+ delim = _get_delim (params = self .params )
448490 if self .params .add_page_break :
449491 page_sep = f"{ delim } <{ DocumentToken .PAGE_BREAK .value } >{ delim } "
450492 content = page_sep .join ([p .text for p in pages if p .text ])
0 commit comments