66"""Models for the Docling Document data type."""
77
88from datetime import datetime
9- from typing import Generic , Optional , Union
9+ from enum import Enum
10+ from typing import Generic , Optional , Tuple , Union
1011
1112from pydantic import (
1213 AnyHttpUrl ,
3536 BaseCell ,
3637 BaseText ,
3738 BitmapObject ,
39+ Figure ,
3840 PageDimensions ,
3941 PageReference ,
4042 Ref ,
@@ -275,7 +277,7 @@ class MinimalDocument(
275277 main_text : Optional [list [Union [Ref , BaseText ]]] = Field (
276278 default = None , alias = "main-text"
277279 )
278- figures : Optional [list [BaseCell ]] = None
280+ figures : Optional [list [Figure ]] = None
279281 tables : Optional [list [Table ]] = None
280282
281283
@@ -345,6 +347,107 @@ def from_dict(cls, data):
345347 return data
346348
347349
class DocumentToken(Enum):
    """Special tokens for an LLM-friendly, tag-based rendering of a document.

    Each member's value is the literal opening (``BEG_*``) or closing
    (``END_*``) tag. Indexed tokens (rows, columns, numbered section headers,
    pages and locations) are not enum members; they are produced by the helper
    methods of this class.
    """

    BEG_DOCUMENT = "<document>"
    END_DOCUMENT = "</document>"

    BEG_TITLE = "<title>"
    END_TITLE = "</title>"

    BEG_ABSTRACT = "<abstract>"
    END_ABSTRACT = "</abstract>"

    BEG_DOI = "<doi>"
    END_DOI = "</doi>"
    BEG_DATE = "<date>"
    END_DATE = "</date>"

    BEG_AUTHORS = "<authors>"
    END_AUTHORS = "</authors>"
    BEG_AUTHOR = "<author>"
    END_AUTHOR = "</author>"

    BEG_AFFILIATIONS = "<affiliations>"
    END_AFFILIATIONS = "</affiliations>"
    BEG_AFFILIATION = "<affiliation>"
    END_AFFILIATION = "</affiliation>"

    BEG_HEADER = "<section-header>"
    END_HEADER = "</section-header>"
    BEG_TEXT = "<text>"
    END_TEXT = "</text>"
    BEG_PARAGRAPH = "<paragraph>"
    END_PARAGRAPH = "</paragraph>"
    BEG_TABLE = "<table>"
    END_TABLE = "</table>"
    BEG_FIGURE = "<figure>"
    END_FIGURE = "</figure>"
    BEG_CAPTION = "<caption>"
    END_CAPTION = "</caption>"
    BEG_EQUATION = "<equation>"
    END_EQUATION = "</equation>"
    BEG_LIST = "<list>"
    END_LIST = "</list>"
    BEG_LISTITEM = "<list-item>"
    END_LISTITEM = "</list-item>"

    BEG_LOCATION = "<location>"
    END_LOCATION = "</location>"
    BEG_GROUP = "<group>"
    END_GROUP = "</group>"

    @classmethod
    def get_special_tokens(
        cls,
        max_rows: int = 100,
        max_cols: int = 100,
        max_pages: int = 1000,
        page_dimension: Tuple[int, int] = (100, 100),
    ) -> list[str]:
        """Return every special token: the enum values plus the indexed ones.

        Args:
            max_rows: Number of row indices; emits ``<row_i>``/``</row_i>`` for
                ``i`` in ``[0, max_rows)``.
            max_cols: Number of column indices; emits ``<col_i>``/``</col_i>``
                for ``i`` in ``[0, max_cols)``.
            max_pages: Number of page indices; emits ``<page_i>`` for ``i`` in
                ``[0, max_pages)``.
            page_dimension: Normalization grid (width, height). Location tokens
                ``<loc_i>`` are emitted for ``i`` in ``[0, max(width, height)]``.

        Returns:
            The tokens in a fixed order: enum values, rows, columns, numbered
            section headers, pages, locations.
        """
        special_tokens = [token.value for token in cls]

        # Dynamically generated row and column tokens.
        for i in range(max_rows):
            special_tokens += [f"<row_{i}>", f"</row_{i}>"]

        for i in range(max_cols):
            special_tokens += [f"<col_{i}>", f"</col_{i}>"]

        # Numbered section-header tokens (six indices, 0 through 5).
        for i in range(6):
            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]

        # Dynamically generated page tokens.
        special_tokens.extend(f"<page_{i}>" for i in range(max_pages))

        # Dynamically generated location tokens. The upper bound is inclusive
        # because get_location_token() clamps to `rnorm` itself, so a value at
        # the far edge yields e.g. "<loc_100>" and must be part of this list.
        max_loc = max(page_dimension[0], page_dimension[1])
        special_tokens.extend(f"<loc_{i}>" for i in range(max_loc + 1))

        return special_tokens

    @staticmethod
    def get_page_token(page: int) -> str:
        """Return the page token for the given page number."""
        return f"<page_{page}>"

    @staticmethod
    def get_location_token(val: float, rnorm: int = 100) -> str:
        """Return the location token for a normalized coordinate.

        Args:
            val: Coordinate as a fraction of the page extent.
            rnorm: Grid resolution that ``val`` is scaled by before rounding.

        Returns:
            ``<loc_k>`` where ``k`` is ``round(rnorm * val)`` clamped to
            ``[0, rnorm]``.
        """
        val_ = round(rnorm * val)

        if val_ < 0:
            return "<loc_0>"

        if val_ > rnorm:
            return f"<loc_{rnorm}>"

        return f"<loc_{val_}>"
450+
348451class ExportedCCSDocument (
349452 MinimalDocument ,
350453 Generic [
@@ -427,6 +530,14 @@ def export_to_markdown(
427530 delim : str = "\n \n " ,
428531 main_text_start : int = 0 ,
429532 main_text_stop : Optional [int ] = None ,
533+ main_text_labels : list [str ] = [
534+ "title" ,
535+ "subtitle-level-1" ,
536+ "paragraph" ,
537+ "caption" ,
538+ "table" ,
539+ ],
540+ strict_text : bool = False ,
430541 ) -> str :
431542 r"""Serialize to Markdown.
432543
@@ -461,12 +572,7 @@ def export_to_markdown(
461572 continue
462573
463574 item_type = item .obj_type
464- if isinstance (item , BaseText ) and item_type in {
465- "title" ,
466- "subtitle-level-1" ,
467- "paragraph" ,
468- "caption" ,
469- }:
575+ if isinstance (item , BaseText ) and item_type in main_text_labels :
470576 text = item .text
471577
472578 # ignore repeated text
@@ -477,20 +583,31 @@ def export_to_markdown(
477583
478584 # first title match
479585 if item_type == "title" and not has_title :
480- markdown_text = f"# { text } "
586+ if strict_text :
587+ markdown_text = f"{ text } "
588+ else :
589+ markdown_text = f"# { text } "
481590 has_title = True
482591
483592 # secondary titles
484593 elif item_type in {"title" , "subtitle-level-1" } or (
485594 has_title and item_type == "title"
486595 ):
487- markdown_text = f"## { text } "
596+ if strict_text :
597+ markdown_text = f"{ text } "
598+ else :
599+ markdown_text = f"## { text } "
488600
489601 # normal text
490602 else :
491603 markdown_text = text
492604
493- elif isinstance (item , Table ) and item .data :
605+ elif (
606+ isinstance (item , Table )
607+ and item .data
608+ and item_type in main_text_labels
609+ and not strict_text
610+ ):
494611 table = []
495612 for row in item .data :
496613 tmp = []
@@ -518,3 +635,157 @@ def export_to_markdown(
518635
519636 result = delim .join (md_texts )
520637 return result
638+
639+ def export_to_document_tokens (
640+ self ,
641+ delim : str = "\n \n " ,
642+ main_text_start : int = 0 ,
643+ main_text_stop : Optional [int ] = None ,
644+ main_text_labels : list [str ] = [
645+ "title" ,
646+ "subtitle-level-1" ,
647+ "paragraph" ,
648+ "caption" ,
649+ "table" ,
650+ "figure" ,
651+ ],
652+ page_tagging : bool = True ,
653+ location_tagging : bool = True ,
654+ location_dimensions : Tuple [int , int ] = (100 , 100 ),
655+ add_new_line : bool = True ,
656+ ) -> str :
657+ r"""Exports the document content to an DocumentToken format.
658+
659+ Operates on a slice of the document's main_text as defined through arguments
660+ main_text_start and main_text_stop; defaulting to the whole main_text.
661+
662+ Args:
663+ delim (str, optional): The delimiter used to separate text blocks in the
664+ exported XML. Default is two newline characters ("\n\n").
665+ main_text_start (int, optional): The starting index of the main text to
666+ be included in the XML. Default is 0 (the beginning of the text).
667+ main_text_stop (Optional[int], optional): The stopping index of the main
668+ text. If set to None, the export includes text up to the end.
669+ Default is None.
670+ main_text_labels (list[str], optional): A list of text labels that
671+ categorize the different sections of the document (e.g., "title",
672+ "subtitle-level-1", "paragraph", "caption"). Default labels are
673+ "title", "subtitle-level-1", "paragraph", and "caption".
674+ location_tagging (bool, optional): Determines whether to include
675+ location-based tagging in the XML. If True, the exported XML will
676+ contain information about the locations of the text elements.
677+ Default is True.
678+ location_dimensions (Tuple[int, int], optional): Specifies the dimensions
679+ (width and height) for the location tagging, if enabled.
680+ Default is [100, 100].
681+ add_new_line (bool, optional): Whether to add new line characters after
682+ each text block. If True, a new line is added after each block of
683+ text in the XML. Default is True.
684+
685+ Returns:
686+ str: The content of the document formatted as an XML string.
687+ """
688+ xml_str = DocumentToken .BEG_DOCUMENT .value
689+
690+ new_line = ""
691+ if add_new_line :
692+ new_line = "\n "
693+
694+ if self .main_text is not None :
695+ for orig_item in self .main_text [main_text_start :main_text_stop ]:
696+
697+ item = (
698+ self ._resolve_ref (orig_item )
699+ if isinstance (orig_item , Ref )
700+ else orig_item
701+ )
702+
703+ if item is None :
704+ continue
705+
706+ prov = item .prov
707+
708+ loc_str = "" # default is zero
709+ if (
710+ location_tagging
711+ and self .page_dimensions is not None
712+ and prov is not None
713+ and len (prov ) > 0
714+ ):
715+
716+ page = prov [0 ].page
717+ page_dim = self .page_dimensions [page - 1 ]
718+
719+ page_w = float (page_dim .width )
720+ page_h = float (page_dim .height )
721+
722+ x0 = float (prov [0 ].bbox [0 ]) / float (page_w )
723+ y0 = float (prov [0 ].bbox [1 ]) / float (page_h )
724+ x1 = float (prov [0 ].bbox [2 ]) / float (page_w )
725+ y1 = float (prov [0 ].bbox [3 ]) / float (page_h )
726+
727+ page_tok = ""
728+ if page_tagging :
729+ page_tok = DocumentToken .get_page_token (page = page )
730+
731+ x0_tok = DocumentToken .get_location_token (
732+ val = min (x0 , x1 ), rnorm = location_dimensions [0 ]
733+ )
734+ y0_tok = DocumentToken .get_location_token (
735+ val = min (y0 , y1 ), rnorm = location_dimensions [1 ]
736+ )
737+ x1_tok = DocumentToken .get_location_token (
738+ val = max (x0 , x1 ), rnorm = location_dimensions [0 ]
739+ )
740+ y1_tok = DocumentToken .get_location_token (
741+ val = max (y0 , y1 ), rnorm = location_dimensions [1 ]
742+ )
743+
744+ # update
745+ loc_str = f"{ DocumentToken .BEG_LOCATION .value } "
746+ loc_str += f"{ page_tok } "
747+ loc_str += f"{ x0_tok } { y0_tok } { x1_tok } { y1_tok } "
748+ loc_str += f"{ DocumentToken .END_LOCATION .value } "
749+
750+ item_type = item .obj_type
751+ if isinstance (item , BaseText ) and (item_type in main_text_labels ):
752+ text = item .text
753+
754+ xml_str += f"<{ item_type } >{ loc_str } { text } </{ item_type } >{ new_line } "
755+
756+ elif isinstance (item , Table ) and (item_type in main_text_labels ):
757+
758+ xml_str += f"<{ item_type } >{ loc_str } "
759+
760+ if item .text is not None and len (item .text ) > 0 :
761+ xml_str += f"{ DocumentToken .BEG_CAPTION .value } "
762+ xml_str += (
763+ f"{ item .text } { DocumentToken .END_CAPTION .value } { new_line } "
764+ )
765+
766+ if item .data is not None and len (item .data ) > 0 :
767+ for i , row in enumerate (item .data ):
768+ xml_str += f"<row_{ i } >"
769+ for j , col in enumerate (row ):
770+ text = col .text
771+ xml_str += f"<col_{ j } >{ text } </col_{ j } >"
772+
773+ xml_str += f"</row_{ i } >{ new_line } "
774+
775+ xml_str += f"</{ item_type } >{ new_line } "
776+
777+ elif isinstance (item , Figure ) and (item_type in main_text_labels ):
778+
779+ xml_str += f"<{ item_type } >{ loc_str } "
780+
781+ if item .text is not None and len (item .text ) > 0 :
782+ xml_str += f"{ DocumentToken .BEG_CAPTION .value } "
783+ xml_str += (
784+ f"{ item .text } { DocumentToken .END_CAPTION .value } { new_line } "
785+ )
786+
787+ xml_str += f"</{ item_type } >{ new_line } "
788+
789+ xml_str += DocumentToken .END_DOCUMENT .value
790+
791+ return xml_str
0 commit comments