fix: Document Tokens (doc tags) clean up, fix iterate_items for content_layer (#161)

nassarofficial · Ahmed Nassar · cau-git · web-flow · commit 58ed6c8ab75b · 2025-02-17T11:49:34.000+01:00
* Clean up: remove tags that are currently unused; rely mainly on DocItemLabel, falling back to DocumentToken only when unavoidable.

Signed-off-by: Ahmed Nassar <ahn@zurich.ibm.com>

* fix: Fix inheritance of CodeItem for backward compatibility

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update docs

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Ahmed Nassar <ahn@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Ahmed Nassar <ahn@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -794,7 +794,7 @@ def export_to_document_tokens(
         :param add_content: bool:  (Default value = True)
 
         """
-        body = f"{DocumentToken.BEG_CODE.value}{new_line}"
+        body = f"<{self.label.value}{new_line}"
 
         if add_location:
             body += self.get_location_tokens(
@@ -807,7 +807,7 @@ def export_to_document_tokens(
         if add_content and self.text is not None:
             body += f"<_{self.code_language.value}_>{self.text}{new_line}"
 
-        body += f"{DocumentToken.END_CODE.value}\n"
+        body += f"</{self.label.value}\n"
 
         return body
 
@@ -977,8 +977,7 @@ def export_to_document_tokens(
         :param # not used at the moment
 
         """
-        body = f"{DocumentToken.BEG_PICTURE.value}{new_line}"
-
+        body = f"<{self.label.value}>{new_line}"
         if add_location:
             body += self.get_location_tokens(
                 doc=doc,
@@ -1002,7 +1001,7 @@ def export_to_document_tokens(
             text = self.caption_text(doc)
 
             if len(text):
-                body += f"{DocumentToken.BEG_CAPTION.value}"
+                body += f"<{DocItemLabel.CAPTION.value}>"
                 for caption in self.captions:
                     body += caption.resolve(doc).get_location_tokens(
                         doc=doc,
@@ -1011,10 +1010,10 @@ def export_to_document_tokens(
                         ysize=ysize,
                     )
                 body += f"{text.strip()}"
-                body += f"{DocumentToken.END_CAPTION.value}"
+                body += f"</{DocItemLabel.CAPTION.value}>"
                 body += f"{new_line}"
 
-        body += f"{DocumentToken.END_PICTURE.value}\n"
+        body += f"</{self.label.value}>\n"
 
         return body
 
@@ -1294,8 +1293,11 @@ def export_to_document_tokens(
         :param add_cell_location: bool:  (Default value = True)
         :param add_cell_text: bool:  (Default value = True)
         :param add_caption: bool:  (Default value = True)
+
         """
-        body = f"{DocumentToken.BEG_OTSL.value}{new_line}"
+        otsl_tag = DocumentToken.OTSL.value
+
+        body = f"<{otsl_tag}>{new_line}"
 
         if add_location:
             body += self.get_location_tokens(
@@ -1311,7 +1313,7 @@ def export_to_document_tokens(
             text = self.caption_text(doc)
 
             if len(text):
-                body += f"{DocumentToken.BEG_CAPTION.value}"
+                body += f"<{DocItemLabel.CAPTION.value}>"
                 for caption in self.captions:
                     body += caption.resolve(doc).get_location_tokens(
                         doc=doc,
@@ -1320,10 +1322,10 @@ def export_to_document_tokens(
                         ysize=ysize,
                     )
                 body += f"{text.strip()}"
-                body += f"{DocumentToken.END_CAPTION.value}"
+                body += f"</{DocItemLabel.CAPTION.value}>"
                 body += f"{new_line}"
 
-        body += f"{DocumentToken.END_OTSL.value}\n"
+        body += f"</{otsl_tag}>\n"
 
         return body
 
@@ -2777,9 +2779,9 @@ def _close_lists(
             while current_level < previous_level and ordered_list_stack:
                 last_is_ordered = ordered_list_stack.pop()
                 if last_is_ordered:
-                    output_parts.append("</ordered_list>\n")
+                    output_parts.append(f"</{DocumentToken.ORDERED_LIST.value}>\n")
                 else:
-                    output_parts.append("</unordered_list>\n")
+                    output_parts.append(f"</{DocumentToken.UNORDERED_LIST.value}>\n")
                 previous_level -= 1
             return ordered_list_stack
 
@@ -2806,7 +2808,7 @@ def _add_page_break_if_needed(
                 return output_parts, current_page_no
 
             if current_page_no != prev_page_no:
-                output_parts.append(f"{DocumentToken.PAGE_BREAK.value}\n")
+                output_parts.append(f"<{DocumentToken.PAGE_BREAK.value}>\n")
 
             return output_parts, current_page_no
 
@@ -2832,7 +2834,7 @@ def _get_standalone_captions(document_body):
         standalone_captions = _get_standalone_captions(self.body)
 
         # Begin document
-        output_parts.append(f"{DocumentToken.BEG_DOCUMENT.value}{delim}")
+        output_parts.append(f"<{DocumentToken.DOCUMENT.value}>{delim}")
 
         for ix, (item, current_level) in enumerate(
             self.iterate_items(
@@ -2868,10 +2870,12 @@ def _get_standalone_captions(document_body):
             # Handle list groups
             if isinstance(item, GroupItem):
                 if item.label == GroupLabel.ORDERED_LIST:
-                    output_parts.append(f"<ordered_list>{delim}")
+                    output_parts.append(f"<{DocumentToken.ORDERED_LIST.value}>{delim}")
                     ordered_list_stack.append(True)
                 elif item.label == GroupLabel.LIST:
-                    output_parts.append(f"<unordered_list>{delim}")
+                    output_parts.append(
+                        f"<{DocumentToken.UNORDERED_LIST.value}>{delim}"
+                    )
                     ordered_list_stack.append(False)
                 continue
 
@@ -2945,7 +2949,7 @@ def _get_standalone_captions(document_body):
         )
 
         # End document
-        output_parts.append(DocumentToken.END_DOCUMENT.value)
+        output_parts.append(f"</{DocumentToken.DOCUMENT.value}>")
 
         return "".join(output_parts)
 
diff --git a/docling_core/types/doc/tokens.py b/docling_core/types/doc/tokens.py
@@ -44,76 +44,21 @@ def is_known_token(label):
 class DocumentToken(Enum):
     """Class to represent an LLM friendly representation of a Document."""
 
-    BEG_DOCUMENT = "<doctag>"
-    END_DOCUMENT = "</doctag>"
-
-    BEG_TITLE = "<title>"
-    END_TITLE = "</title>"
-
-    BEG_ABSTRACT = "<abstract>"
-    END_ABSTRACT = "</abstract>"
-
-    BEG_DOI = "<doi>"
-    END_DOI = "</doi>"
-    BEG_DATE = "<date>"
-    END_DATE = "</date>"
-
-    BEG_AUTHORS = "<authors>"
-    END_AUTHORS = "</authors>"
-    BEG_AUTHOR = "<author>"
-    END_AUTHOR = "</author>"
-
-    BEG_AFFILIATIONS = "<affiliations>"
-    END_AFFILIATIONS = "</affiliations>"
-    BEG_AFFILIATION = "<affiliation>"
-    END_AFFILIATION = "</affiliation>"
-    BEG_TEXT = "<text>"
-    END_TEXT = "</text>"
-    BEG_PARAGRAPH = "<paragraph>"
-    END_PARAGRAPH = "</paragraph>"
-    BEG_TABLE = "<table>"
-    END_TABLE = "</table>"
-    BEG_OTSL = "<otsl>"
-    END_OTSL = "</otsl>"
-    BEG_PICTURE = "<picture>"
-    END_PICTURE = "</picture>"
-    BEG_CAPTION = "<caption>"
-    END_CAPTION = "</caption>"
-    BEG_EQUATION = "<formula>"
-    END_EQUATION = "</formula>"
-    BEG_CODE = "<code>"
-    END_CODE = "</code>"
-    BEG_LIST = "<list>"
-    END_LIST = "</list>"
-    BEG_LISTITEM = "<list-item>"
-    END_LISTITEM = "</list-item>"
-    BEG_LINE_NUMBER = "<line_number>"
-    END_LINE_NUMBER = "</line_number>"
-    BEG_LOCATION = "<location>"
-    END_LOCATION = "</location>"
-    BEG_GROUP = "<group>"
-    END_GROUP = "</group>"
-
-    PAGE_BREAK = "<page_break>"
+    DOCUMENT = "doctag"
+    OTSL = "otsl"
+    ORDERED_LIST = "ordered_list"
+    UNORDERED_LIST = "unordered_list"
+    LOC = "loc_"
+    PAGE_BREAK = "page_break"
 
     @classmethod
     def get_special_tokens(
         cls,
-        max_rows: int = 100,
-        max_cols: int = 100,
-        max_pages: int = 1000,
         page_dimension: Tuple[int, int] = (100, 100),
     ):
         """Function to get all special document tokens."""
         special_tokens = [token.value for token in cls]
 
-        # Adding dynamically generated row and col tokens
-        for i in range(0, max_rows + 1):
-            special_tokens += [f"<row_{i}>", f"</row_{i}>"]
-
-        for i in range(0, max_cols + 1):
-            special_tokens += [f"<col_{i}>", f"</col_{i}>"]
-
         for i in range(6):
             special_tokens += [
                 f"<section_header_level_{i}>",
@@ -135,22 +80,6 @@ def is_known_token(label):
         """Function to check if label is in tokens."""
         return label in DocumentToken.get_special_tokens()
 
-    @staticmethod
-    def get_row_token(row: int, beg=bool) -> str:
-        """Function to get page tokens."""
-        if beg:
-            return f"<row_{row}>"
-        else:
-            return f"</row_{row}>"
-
-    @staticmethod
-    def get_col_token(col: int, beg=bool) -> str:
-        """Function to get page tokens."""
-        if beg:
-            return f"<col_{col}>"
-        else:
-            return f"</col_{col}>"
-
     @staticmethod
     def get_picture_classification_token(classification: str) -> str:
         """Function to get picture classification tokens."""