1010from pydantic import BaseModel , Field , PositiveInt , StrictStr
1111
1212from docling_core .search .mapping import es_field
13+ from docling_core .types .doc .tokens import DocumentToken
1314from docling_core .utils .alias import AliasModel
1415
1516CellData = tuple [float , float , float , float , str , str ]
@@ -132,10 +133,6 @@ class GlmTableCell(TableCell):
132133class BaseCell (AliasModel ):
133134 """Base cell."""
134135
135- # FIXME: we need to check why we have bounding_box (this should be in prov)
136- bounding_box : Optional [BoundingBoxContainer ] = Field (
137- default = None , alias = "bounding-box" , json_schema_extra = es_field (suppress = True )
138- )
139136 prov : Optional [list [Prov ]] = None
140137 text : Optional [str ] = Field (
141138 default = None , json_schema_extra = es_field (term_vector = "with_positions_offsets" )
@@ -144,6 +141,38 @@ class BaseCell(AliasModel):
144141 alias = "type" , json_schema_extra = es_field (type = "keyword" , ignore_above = 8191 )
145142 )
146143
def get_location_tokens(
    self,
    new_line: str,
    page_w: float,
    page_h: float,
    xsize: int = 100,
    ysize: int = 100,
    add_page_index: bool = True,
) -> str:
    """Concatenate one location token per provenance entry of this cell.

    Args:
        new_line: Text placed after each location token.
        page_w: Passed through to ``DocumentToken.get_location`` as ``page_w``.
        page_h: Passed through to ``DocumentToken.get_location`` as ``page_h``.
        xsize: Passed through to ``DocumentToken.get_location`` as ``xsize``.
        ysize: Passed through to ``DocumentToken.get_location`` as ``ysize``.
        add_page_index: If true, each entry's ``page`` is passed as
            ``page_i``; otherwise ``-1`` is passed instead.

    Returns:
        The formatted chunks for all entries of ``self.prov`` in order, or an
        empty string when ``self.prov`` is ``None`` or has no entries.
    """
    if self.prov is None:
        return ""

    chunks = []
    for entry in self.prov:
        # -1 is what gets handed over when no page index is requested.
        page_i = entry.page if add_page_index else -1
        loc_str = DocumentToken.get_location(
            bbox=entry.bbox,
            page_w=page_w,
            page_h=page_h,
            xsize=xsize,
            ysize=ysize,
            page_i=page_i,
        )
        # NOTE(review): the literal spaces in this f-string are reproduced
        # exactly as seen; they may be extraction artifacts - confirm.
        chunks.append(f"{loc_str} {new_line} ")

    return "".join(chunks)
175+
147176
148177class Table (BaseCell ):
149178 """Table."""
@@ -153,6 +182,11 @@ class Table(BaseCell):
153182 data : Optional [list [list [Union [GlmTableCell , TableCell ]]]] = None
154183 model : Optional [str ] = None
155184
185+ # FIXME: we need to check why we have bounding_box (this should be in prov)
186+ bounding_box : Optional [BoundingBoxContainer ] = Field (
187+ default = None , alias = "bounding-box" , json_schema_extra = es_field (suppress = True )
188+ )
189+
156190 def _get_tablecell_span (self , cell : TableCell , ix : int ):
157191 if cell .spans is None :
158192 span = set ()
@@ -249,26 +283,185 @@ def export_to_html(self) -> str:
249283
250284 return body
251285
def export_to_document_tokens(
    self,
    new_line: str = "\n ",
    page_w: float = 0.0,
    page_h: float = 0.0,
    xsize: int = 100,
    ysize: int = 100,
    add_location: bool = True,
    add_caption: bool = True,
    add_content: bool = True,
    add_cell_location: bool = True,
    add_cell_label: bool = True,
    add_cell_text: bool = True,
    add_page_index: bool = True,
):
    """Serialize the table into the document-token string format.

    The result starts with the ``BEG_TABLE`` token and ends with the
    ``END_TABLE`` token. In between, depending on the flags, come the
    table's own location tokens, its caption (``self.text``) and one
    ``<row_i>`` group per row holding a ``<col_j>`` group per cell.

    Args:
        new_line: Separator text emitted after each top-level piece.
        page_w: Passed through to the location helpers as ``page_w``.
        page_h: Passed through to the location helpers as ``page_h``.
        xsize: Passed through to the location helpers as ``xsize``.
        ysize: Passed through to the location helpers as ``ysize``.
        add_location: Emit the table's own location tokens.
        add_caption: Emit the caption when ``self.text`` is non-empty.
        add_content: Emit the rows and cells when ``self.data`` is non-empty.
        add_cell_location: Emit a location token for cells that have a bbox.
        add_cell_label: Emit ``<obj_type>`` for cells with a non-empty type.
        add_cell_text: Emit the stripped cell text.
        add_page_index: Use the page of the first provenance entry for cell
            locations. Without it ``-1`` is passed. When it is set but the
            table has no provenance, no cell location is emitted.

    Returns:
        The assembled token string.
    """
    # NOTE(review): literal spaces inside the f-strings below (and in the
    # ``new_line`` default) are reproduced exactly as seen; they may be
    # extraction artifacts - confirm.

    def _locate(bbox, page_i):
        # Single place for the cell-level location call.
        return DocumentToken.get_location(
            bbox=bbox,
            page_w=page_w,
            page_h=page_h,
            xsize=xsize,
            ysize=ysize,
            page_i=page_i,
        )

    parts = [f"{DocumentToken.BEG_TABLE.value} {new_line} "]

    if add_location:
        parts.append(
            self.get_location_tokens(
                new_line=new_line,
                page_w=page_w,
                page_h=page_h,
                xsize=xsize,
                ysize=ysize,
                add_page_index=add_page_index,
            )
        )

    if add_caption and self.text:
        parts.append(f"{DocumentToken.BEG_CAPTION.value} ")
        parts.append(f"{self.text.strip()} ")
        parts.append(f"{DocumentToken.END_CAPTION.value} ")
        parts.append(f"{new_line} ")

    if add_content and self.data:
        for row_idx, row in enumerate(self.data):
            parts.append(f"<row_{row_idx} >")

            for col_idx, cell in enumerate(row):
                text = cell.text.strip() if add_cell_text else ""

                cell_loc = ""
                if cell.bbox is not None and add_cell_location:
                    if not add_page_index:
                        cell_loc = _locate(cell.bbox, -1)
                    elif self.prov:
                        # Page index comes from the table's first prov entry.
                        cell_loc = _locate(cell.bbox, self.prov[0].page)

                cell_label = ""
                if add_cell_label and cell.obj_type:
                    cell_label = f"<{cell.obj_type} >"

                parts.append(
                    f"<col_{col_idx} >{cell_loc} {cell_label} {text} </col_{col_idx} >"
                )

            parts.append(f"</row_{row_idx} >{new_line} ")

    parts.append(f"{DocumentToken.END_TABLE.value} {new_line} ")

    return "".join(parts)
374+
252375
253376# FIXME: let's add some figure specific data-types later
class Figure(BaseCell):
    """Figure element, exportable as a BEG_FIGURE/END_FIGURE token block."""

    # FIXME: we need to check why we have bounding_box (this should be in prov)
    bounding_box: Optional[BoundingBoxContainer] = Field(
        default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
    )

    def export_to_document_tokens(
        self,
        new_line: str = "\n ",
        page_w: float = 0.0,
        page_h: float = 0.0,
        xsize: int = 100,
        ysize: int = 100,
        add_location: bool = True,
        add_caption: bool = True,
        add_content: bool = True,  # not used at the moment
        add_page_index: bool = True,
    ):
        """Serialize the figure into the document-token string format.

        Args:
            new_line: Separator text emitted after each piece.
            page_w: Passed through to ``get_location_tokens`` as ``page_w``.
            page_h: Passed through to ``get_location_tokens`` as ``page_h``.
            xsize: Passed through to ``get_location_tokens`` as ``xsize``.
            ysize: Passed through to ``get_location_tokens`` as ``ysize``.
            add_location: Emit the figure's location tokens.
            add_caption: Emit the caption when ``self.text`` is non-empty.
            add_content: Accepted but not read by this method.
            add_page_index: Passed through to ``get_location_tokens``.

        Returns:
            The ``BEG_FIGURE`` token, the optional location and caption
            pieces, then the ``END_FIGURE`` token, as one string.
        """
        # NOTE(review): literal spaces inside the f-strings below (and in the
        # ``new_line`` default) are reproduced exactly as seen; they may be
        # extraction artifacts - confirm.
        pieces = [f"{DocumentToken.BEG_FIGURE.value} {new_line} "]

        if add_location:
            location = self.get_location_tokens(
                new_line=new_line,
                page_w=page_w,
                page_h=page_h,
                xsize=xsize,
                ysize=ysize,
                add_page_index=add_page_index,
            )
            pieces.append(location)

        if add_caption and self.text:
            pieces.extend(
                (
                    f"{DocumentToken.BEG_CAPTION.value} ",
                    f"{self.text.strip()} ",
                    f"{DocumentToken.END_CAPTION.value} ",
                    f"{new_line} ",
                )
            )

        pieces.append(f"{DocumentToken.END_FIGURE.value} {new_line} ")

        return "".join(pieces)
419+
257420
258- class BaseText (AliasModel ):
421+ class BaseText (BaseCell ):
259422 """Base model for text objects."""
260423
261- text : StrictStr = Field (
262- json_schema_extra = es_field (term_vector = "with_positions_offsets" )
263- )
264- obj_type : StrictStr = Field (
265- alias = "type" , json_schema_extra = es_field (type = "keyword" , ignore_above = 8191 )
266- )
424+ # FIXME: do we need these ???
267425 name : Optional [StrictStr ] = Field (
268426 default = None , json_schema_extra = es_field (type = "keyword" , ignore_above = 8191 )
269427 )
270428 font : Optional [str ] = None
271- prov : Optional [list [Prov ]] = None
429+
def export_to_document_tokens(
    self,
    new_line: str = "\n ",
    page_w: float = 0.0,
    page_h: float = 0.0,
    xsize: int = 100,
    ysize: int = 100,
    add_location: bool = True,
    add_content: bool = True,
    add_page_index: bool = True,
):
    """Serialize the text element into the document-token string format.

    The element is wrapped in an opening and closing tag derived from
    ``self.obj_type``. The opening tag must be accepted by
    ``DocumentToken.is_known_token``.

    Args:
        new_line: Text emitted after the closing tag.
        page_w: Passed through to ``get_location_tokens`` as ``page_w``.
        page_h: Passed through to ``get_location_tokens`` as ``page_h``.
        xsize: Passed through to ``get_location_tokens`` as ``xsize``.
        ysize: Passed through to ``get_location_tokens`` as ``ysize``.
        add_location: Emit location tokens right after the opening tag
            (requested with an empty separator).
        add_content: Emit the stripped ``self.text`` when it is not ``None``.
        add_page_index: Passed through to ``get_location_tokens``.

    Returns:
        The opening tag, optional location tokens, optional text, the
        closing tag and ``new_line``, as one string.

    Raises:
        AssertionError: If the opening tag is not a known document token.
    """
    # NOTE(review): literal spaces inside the f-strings below (and in the
    # ``new_line`` default) are reproduced exactly as seen; they may be
    # extraction artifacts - confirm.
    open_tag = f"<{self.obj_type} >"

    assert DocumentToken.is_known_token(
        open_tag
    ), f"failed DocumentToken.is_known_token({open_tag} )"

    pieces = [open_tag]

    if add_location:
        pieces.append(
            self.get_location_tokens(
                new_line="",
                page_w=page_w,
                page_h=page_h,
                xsize=xsize,
                ysize=ysize,
                add_page_index=add_page_index,
            )
        )

    if add_content and self.text is not None:
        pieces.append(self.text.strip())

    pieces.append(f"</{self.obj_type} >{new_line} ")

    return "".join(pieces)
272465
273466
274467class ListItem (BaseText ):
0 commit comments