Skip to content

Commit 58ed6c8

Browse files
nassarofficial (Ahmed Nassar), cau-git
authored
fix: Document Tokens (doc tags) clean up, fix iterate_items for content_layer (#161)
* Clean up by removing tags that are unused at the moment; rely mainly on DocItemLabel, and on DocumentTokens only when unavoidable.
  Signed-off-by: Ahmed Nassar <[email protected]>
* fix: Fix inheritance of CodeItem for backward compatibility
  Signed-off-by: Christoph Auer <[email protected]>
* Update docs
  Signed-off-by: Christoph Auer <[email protected]>
---------
Signed-off-by: Ahmed Nassar <[email protected]>
Signed-off-by: Christoph Auer <[email protected]>
Co-authored-by: Ahmed Nassar <[email protected]>
Co-authored-by: Christoph Auer <[email protected]>
1 parent 7267c3f commit 58ed6c8

File tree

2 files changed

+28
-95
lines changed

2 files changed

+28
-95
lines changed

docling_core/types/doc/document.py

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -794,7 +794,7 @@ def export_to_document_tokens(
794794
:param add_content: bool: (Default value = True)
795795
796796
"""
797-
body = f"{DocumentToken.BEG_CODE.value}{new_line}"
797+
body = f"<{self.label.value}{new_line}"
798798

799799
if add_location:
800800
body += self.get_location_tokens(
@@ -807,7 +807,7 @@ def export_to_document_tokens(
807807
if add_content and self.text is not None:
808808
body += f"<_{self.code_language.value}_>{self.text}{new_line}"
809809

810-
body += f"{DocumentToken.END_CODE.value}\n"
810+
body += f"</{self.label.value}\n"
811811

812812
return body
813813

@@ -977,8 +977,7 @@ def export_to_document_tokens(
977977
:param # not used at the moment
978978
979979
"""
980-
body = f"{DocumentToken.BEG_PICTURE.value}{new_line}"
981-
980+
body = f"<{self.label.value}>{new_line}"
982981
if add_location:
983982
body += self.get_location_tokens(
984983
doc=doc,
@@ -1002,7 +1001,7 @@ def export_to_document_tokens(
10021001
text = self.caption_text(doc)
10031002

10041003
if len(text):
1005-
body += f"{DocumentToken.BEG_CAPTION.value}"
1004+
body += f"<{DocItemLabel.CAPTION.value}>"
10061005
for caption in self.captions:
10071006
body += caption.resolve(doc).get_location_tokens(
10081007
doc=doc,
@@ -1011,10 +1010,10 @@ def export_to_document_tokens(
10111010
ysize=ysize,
10121011
)
10131012
body += f"{text.strip()}"
1014-
body += f"{DocumentToken.END_CAPTION.value}"
1013+
body += f"</{DocItemLabel.CAPTION.value}>"
10151014
body += f"{new_line}"
10161015

1017-
body += f"{DocumentToken.END_PICTURE.value}\n"
1016+
body += f"</{self.label.value}>\n"
10181017

10191018
return body
10201019

@@ -1294,8 +1293,11 @@ def export_to_document_tokens(
12941293
:param add_cell_location: bool: (Default value = True)
12951294
:param add_cell_text: bool: (Default value = True)
12961295
:param add_caption: bool: (Default value = True)
1296+
12971297
"""
1298-
body = f"{DocumentToken.BEG_OTSL.value}{new_line}"
1298+
otsl_tag = DocumentToken.OTSL.value
1299+
1300+
body = f"<{otsl_tag}>{new_line}"
12991301

13001302
if add_location:
13011303
body += self.get_location_tokens(
@@ -1311,7 +1313,7 @@ def export_to_document_tokens(
13111313
text = self.caption_text(doc)
13121314

13131315
if len(text):
1314-
body += f"{DocumentToken.BEG_CAPTION.value}"
1316+
body += f"<{DocItemLabel.CAPTION.value}>"
13151317
for caption in self.captions:
13161318
body += caption.resolve(doc).get_location_tokens(
13171319
doc=doc,
@@ -1320,10 +1322,10 @@ def export_to_document_tokens(
13201322
ysize=ysize,
13211323
)
13221324
body += f"{text.strip()}"
1323-
body += f"{DocumentToken.END_CAPTION.value}"
1325+
body += f"</{DocItemLabel.CAPTION.value}>"
13241326
body += f"{new_line}"
13251327

1326-
body += f"{DocumentToken.END_OTSL.value}\n"
1328+
body += f"</{otsl_tag}>\n"
13271329

13281330
return body
13291331

@@ -2777,9 +2779,9 @@ def _close_lists(
27772779
while current_level < previous_level and ordered_list_stack:
27782780
last_is_ordered = ordered_list_stack.pop()
27792781
if last_is_ordered:
2780-
output_parts.append("</ordered_list>\n")
2782+
output_parts.append(f"</{DocumentToken.ORDERED_LIST.value}>\n")
27812783
else:
2782-
output_parts.append("</unordered_list>\n")
2784+
output_parts.append(f"</{DocumentToken.UNORDERED_LIST.value}>\n")
27832785
previous_level -= 1
27842786
return ordered_list_stack
27852787

@@ -2806,7 +2808,7 @@ def _add_page_break_if_needed(
28062808
return output_parts, current_page_no
28072809

28082810
if current_page_no != prev_page_no:
2809-
output_parts.append(f"{DocumentToken.PAGE_BREAK.value}\n")
2811+
output_parts.append(f"<{DocumentToken.PAGE_BREAK.value}>\n")
28102812

28112813
return output_parts, current_page_no
28122814

@@ -2832,7 +2834,7 @@ def _get_standalone_captions(document_body):
28322834
standalone_captions = _get_standalone_captions(self.body)
28332835

28342836
# Begin document
2835-
output_parts.append(f"{DocumentToken.BEG_DOCUMENT.value}{delim}")
2837+
output_parts.append(f"<{DocumentToken.DOCUMENT.value}>{delim}")
28362838

28372839
for ix, (item, current_level) in enumerate(
28382840
self.iterate_items(
@@ -2868,10 +2870,12 @@ def _get_standalone_captions(document_body):
28682870
# Handle list groups
28692871
if isinstance(item, GroupItem):
28702872
if item.label == GroupLabel.ORDERED_LIST:
2871-
output_parts.append(f"<ordered_list>{delim}")
2873+
output_parts.append(f"<{DocumentToken.ORDERED_LIST.value}>{delim}")
28722874
ordered_list_stack.append(True)
28732875
elif item.label == GroupLabel.LIST:
2874-
output_parts.append(f"<unordered_list>{delim}")
2876+
output_parts.append(
2877+
f"<{DocumentToken.UNORDERED_LIST.value}>{delim}"
2878+
)
28752879
ordered_list_stack.append(False)
28762880
continue
28772881

@@ -2945,7 +2949,7 @@ def _get_standalone_captions(document_body):
29452949
)
29462950

29472951
# End document
2948-
output_parts.append(DocumentToken.END_DOCUMENT.value)
2952+
output_parts.append(f"</{DocumentToken.DOCUMENT.value}>")
29492953

29502954
return "".join(output_parts)
29512955

docling_core/types/doc/tokens.py

Lines changed: 6 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -44,76 +44,21 @@ def is_known_token(label):
4444
class DocumentToken(Enum):
4545
"""Class to represent an LLM friendly representation of a Document."""
4646

47-
BEG_DOCUMENT = "<doctag>"
48-
END_DOCUMENT = "</doctag>"
49-
50-
BEG_TITLE = "<title>"
51-
END_TITLE = "</title>"
52-
53-
BEG_ABSTRACT = "<abstract>"
54-
END_ABSTRACT = "</abstract>"
55-
56-
BEG_DOI = "<doi>"
57-
END_DOI = "</doi>"
58-
BEG_DATE = "<date>"
59-
END_DATE = "</date>"
60-
61-
BEG_AUTHORS = "<authors>"
62-
END_AUTHORS = "</authors>"
63-
BEG_AUTHOR = "<author>"
64-
END_AUTHOR = "</author>"
65-
66-
BEG_AFFILIATIONS = "<affiliations>"
67-
END_AFFILIATIONS = "</affiliations>"
68-
BEG_AFFILIATION = "<affiliation>"
69-
END_AFFILIATION = "</affiliation>"
70-
BEG_TEXT = "<text>"
71-
END_TEXT = "</text>"
72-
BEG_PARAGRAPH = "<paragraph>"
73-
END_PARAGRAPH = "</paragraph>"
74-
BEG_TABLE = "<table>"
75-
END_TABLE = "</table>"
76-
BEG_OTSL = "<otsl>"
77-
END_OTSL = "</otsl>"
78-
BEG_PICTURE = "<picture>"
79-
END_PICTURE = "</picture>"
80-
BEG_CAPTION = "<caption>"
81-
END_CAPTION = "</caption>"
82-
BEG_EQUATION = "<formula>"
83-
END_EQUATION = "</formula>"
84-
BEG_CODE = "<code>"
85-
END_CODE = "</code>"
86-
BEG_LIST = "<list>"
87-
END_LIST = "</list>"
88-
BEG_LISTITEM = "<list-item>"
89-
END_LISTITEM = "</list-item>"
90-
BEG_LINE_NUMBER = "<line_number>"
91-
END_LINE_NUMBER = "</line_number>"
92-
BEG_LOCATION = "<location>"
93-
END_LOCATION = "</location>"
94-
BEG_GROUP = "<group>"
95-
END_GROUP = "</group>"
96-
97-
PAGE_BREAK = "<page_break>"
47+
DOCUMENT = "doctag"
48+
OTSL = "otsl"
49+
ORDERED_LIST = "ordered_list"
50+
UNORDERED_LIST = "unordered_list"
51+
LOC = "loc_"
52+
PAGE_BREAK = "page_break"
9853

9954
@classmethod
10055
def get_special_tokens(
10156
cls,
102-
max_rows: int = 100,
103-
max_cols: int = 100,
104-
max_pages: int = 1000,
10557
page_dimension: Tuple[int, int] = (100, 100),
10658
):
10759
"""Function to get all special document tokens."""
10860
special_tokens = [token.value for token in cls]
10961

110-
# Adding dynamically generated row and col tokens
111-
for i in range(0, max_rows + 1):
112-
special_tokens += [f"<row_{i}>", f"</row_{i}>"]
113-
114-
for i in range(0, max_cols + 1):
115-
special_tokens += [f"<col_{i}>", f"</col_{i}>"]
116-
11762
for i in range(6):
11863
special_tokens += [
11964
f"<section_header_level_{i}>",
@@ -135,22 +80,6 @@ def is_known_token(label):
13580
"""Function to check if label is in tokens."""
13681
return label in DocumentToken.get_special_tokens()
13782

138-
@staticmethod
139-
def get_row_token(row: int, beg=bool) -> str:
140-
"""Function to get page tokens."""
141-
if beg:
142-
return f"<row_{row}>"
143-
else:
144-
return f"</row_{row}>"
145-
146-
@staticmethod
147-
def get_col_token(col: int, beg=bool) -> str:
148-
"""Function to get page tokens."""
149-
if beg:
150-
return f"<col_{col}>"
151-
else:
152-
return f"</col_{col}>"
153-
15483
@staticmethod
15584
def get_picture_classification_token(classification: str) -> str:
15685
"""Function to get picture classification tokens."""

0 commit comments

Comments
 (0)