Skip to content

Commit f751b45

Browse files
Matteo-Omenetti, Ahmed Nassar, Matteo-Omenetti
authored
feat: Implementation of doc tags (#138)
* Migration to new DocTags, gt missing Signed-off-by: Ahmed Nassar <[email protected]> * fixed the tests Signed-off-by: Matteo-Omenetti <[email protected]> * overloaded export to markdown in CodeItem to include code language Signed-off-by: Matteo-Omenetti <[email protected]> * minor change Signed-off-by: Matteo-Omenetti <[email protected]> * implemented page break token instead of page number Signed-off-by: Matteo-Omenetti <[email protected]> * added figure classification in doc tags Signed-off-by: Matteo-Omenetti <[email protected]> * changed table repr. to otsl Signed-off-by: Matteo-Omenetti <[email protected]> * changed table repr. to otsl fixed new line Signed-off-by: Matteo-Omenetti <[email protected]> * fixed new line bug Signed-off-by: Matteo-Omenetti <[email protected]> * fixed new line bug Signed-off-by: Matteo-Omenetti <[email protected]> * removed print in test Signed-off-by: Matteo-Omenetti <[email protected]> * new normalization to 500 and top left coord Signed-off-by: Matteo-Omenetti <[email protected]> * export of FOOTER AND HEADER Signed-off-by: Matteo-Omenetti <[email protected]> * export of FOOTER AND HEADER Signed-off-by: Matteo-Omenetti <[email protected]> * fix code bug Signed-off-by: Matteo-Omenetti <[email protected]> * added export of standalone captions Signed-off-by: Matteo-Omenetti <[email protected]> * added export of standalone captions Signed-off-by: Matteo-Omenetti <[email protected]> * changed generate to False Signed-off-by: Matteo-Omenetti <[email protected]> * added export of furniture and fixed bug in iterate_items function Signed-off-by: Matteo-Omenetti <[email protected]> --------- Signed-off-by: Ahmed Nassar <[email protected]> Signed-off-by: Matteo-Omenetti <[email protected]> Co-authored-by: Ahmed Nassar <[email protected]> Co-authored-by: Matteo-Omenetti <[email protected]>
1 parent 021c63e commit f751b45

File tree

10 files changed

+489
-603
lines changed

10 files changed

+489
-603
lines changed

docling_core/types/doc/document.py

Lines changed: 293 additions & 280 deletions
Large diffs are not rendered by default.

docling_core/types/doc/labels.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ class PictureClassificationLabel(str, Enum):
111111
SIGNATURE = "signature"
112112
STAMP = "stamp"
113113
QR_CODE = "qr_code"
114-
BAR_CODE = "bat_code"
114+
BAR_CODE = "bar_code"
115115
SCREENSHOT = "screenshot"
116116

117117
# Geology/Geography

docling_core/types/doc/tokens.py

Lines changed: 28 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@
88
from enum import Enum
99
from typing import Tuple
1010

11+
from docling_core.types.doc.labels import PictureClassificationLabel
12+
1113

1214
class TableToken(Enum):
1315
"""Class to represent an LLM friendly representation of a Table."""
1416

1517
CELL_LABEL_COLUMN_HEADER = "<column_header>"
1618
CELL_LABEL_ROW_HEADER = "<row_header>"
17-
CELL_LABEL_SECTION_HEADERE = "<section_header>"
19+
CELL_LABEL_SECTION_HEADER = "<shed>"
1820
CELL_LABEL_DATA = "<data>"
1921

2022
OTSL_ECEL = "<ecel>" # empty cell
@@ -42,8 +44,8 @@ def is_known_token(label):
4244
class DocumentToken(Enum):
4345
"""Class to represent an LLM friendly representation of a Document."""
4446

45-
BEG_DOCUMENT = "<document>"
46-
END_DOCUMENT = "</document>"
47+
BEG_DOCUMENT = "<doctag>"
48+
END_DOCUMENT = "</doctag>"
4749

4850
BEG_TITLE = "<title>"
4951
END_TITLE = "</title>"
@@ -65,31 +67,35 @@ class DocumentToken(Enum):
6567
END_AFFILIATIONS = "</affiliations>"
6668
BEG_AFFILIATION = "<affiliation>"
6769
END_AFFILIATION = "</affiliation>"
68-
69-
BEG_HEADER = "<section-header>"
70-
END_HEADER = "</section-header>"
7170
BEG_TEXT = "<text>"
7271
END_TEXT = "</text>"
7372
BEG_PARAGRAPH = "<paragraph>"
7473
END_PARAGRAPH = "</paragraph>"
7574
BEG_TABLE = "<table>"
7675
END_TABLE = "</table>"
77-
BEG_FIGURE = "<figure>"
78-
END_FIGURE = "</figure>"
76+
BEG_OTSL = "<otsl>"
77+
END_OTSL = "</otsl>"
78+
BEG_PICTURE = "<picture>"
79+
END_PICTURE = "</picture>"
7980
BEG_CAPTION = "<caption>"
8081
END_CAPTION = "</caption>"
81-
BEG_EQUATION = "<equation>"
82-
END_EQUATION = "</equation>"
82+
BEG_EQUATION = "<formula>"
83+
END_EQUATION = "</formula>"
84+
BEG_CODE = "<code>"
85+
END_CODE = "</code>"
8386
BEG_LIST = "<list>"
8487
END_LIST = "</list>"
8588
BEG_LISTITEM = "<list-item>"
8689
END_LISTITEM = "</list-item>"
87-
90+
BEG_LINE_NUMBER = "<line_number>"
91+
END_LINE_NUMBER = "</line_number>"
8892
BEG_LOCATION = "<location>"
8993
END_LOCATION = "</location>"
9094
BEG_GROUP = "<group>"
9195
END_GROUP = "</group>"
9296

97+
PAGE_BREAK = "<page_break>"
98+
9399
@classmethod
94100
def get_special_tokens(
95101
cls,
@@ -109,16 +115,14 @@ def get_special_tokens(
109115
special_tokens += [f"<col_{i}>", f"</col_{i}>"]
110116

111117
for i in range(6):
112-
special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]
118+
special_tokens += [
119+
f"<section_header_level_{i}>",
120+
f"</section_header_level_{i}>",
121+
]
113122

114-
# FIXME: this is synonym of section header
115-
for i in range(6):
116-
special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]
117-
118-
# Adding dynamically generated page-tokens
119-
for i in range(0, max_pages + 1):
120-
special_tokens.append(f"<page_{i}>")
121-
special_tokens.append(f"</page_{i}>")
123+
# Adding dynamically generated picture classification tokens
124+
for _, member in PictureClassificationLabel.__members__.items():
125+
special_tokens.append(f"<{member}>")
122126

123127
# Adding dynamically generated location-tokens
124128
for i in range(0, max(page_dimension[0] + 1, page_dimension[1] + 1)):
@@ -148,9 +152,9 @@ def get_col_token(col: int, beg=bool) -> str:
148152
return f"</col_{col}>"
149153

150154
@staticmethod
151-
def get_page_token(page: int):
152-
"""Function to get page tokens."""
153-
return f"<page_{page}>"
155+
def get_picture_classification_token(classification: str) -> str:
156+
"""Function to get picture classification tokens."""
157+
return f"<{classification}>"
154158

155159
@staticmethod
156160
def get_location_token(val: float, rnorm: int = 100):
@@ -172,7 +176,6 @@ def get_location(
172176
page_h: float,
173177
xsize: int = 100,
174178
ysize: int = 100,
175-
page_i: int = -1,
176179
):
177180
"""Get the location string give bbox and page-dim."""
178181
assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
@@ -183,17 +186,11 @@ def get_location(
183186
x1 = bbox[2] / page_w
184187
y1 = bbox[3] / page_h
185188

186-
page_tok = ""
187-
if page_i != -1:
188-
page_tok = DocumentToken.get_page_token(page=page_i)
189-
190189
x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
191190
y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
192191
x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
193192
y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)
194193

195-
loc_str = f"{DocumentToken.BEG_LOCATION.value}"
196-
loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
197-
loc_str += f"{DocumentToken.END_LOCATION.value}"
194+
loc_str = f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
198195

199196
return loc_str

test/data/doc/2206.01062.yaml.dt

Lines changed: 137 additions & 217 deletions
Large diffs are not rendered by default.

test/data/doc/bad_doc.yaml.dt

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
<document>
2-
<title>This is the title</title>
1+
<doctag><title>This is the title</title>
32
<section_header_level_1>This is the first section</section_header_level_1>
4-
</document>
3+
</doctag>

test/data/doc/constructed_doc.dt

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,20 @@
1-
<document>
2-
<title>Title of the Document</title>
1+
<doctag><title>Title of the Document</title>
32
<text>Author 1
43
Affiliation 1</text>
54
<text>Author 2
65
Affiliation 2</text>
76
<section_header_level_1>1. Introduction</section_header_level_1>
87
<text>This paper introduces the biggest invention ever made. ...</text>
9-
<unordered_list>
10-
<list_item>list item 1</list_item>
8+
<unordered_list><list_item>list item 1</list_item>
119
<list_item>list item 2</list_item>
1210
<list_item>list item 3</list_item>
13-
<ordered_list>
14-
<list_item>list item 3.a</list_item>
11+
<ordered_list><list_item>list item 3.a</list_item>
1512
<list_item>list item 3.b</list_item>
1613
<list_item>list item 3.c</list_item>
1714
</ordered_list>
1815
<list_item>list item 4</list_item>
1916
</unordered_list>
20-
<table>
21-
<caption>This is the caption of table 1.</caption>
22-
<row_0><col_0><body>Product</col_0><col_1><body>Years</col_1><col_2><body>Years</col_2></row_0>
23-
<row_1><col_0><body>Product</col_0><col_1><body>2016</col_1><col_2><body>2017</col_2></row_1>
24-
<row_2><col_0><body>Apple</col_0><col_1><body>49823</col_1><col_2><body>695944</col_2></row_2>
25-
</table>
26-
<figure>
27-
<caption>This is the caption of figure 1.</caption>
28-
</figure>
29-
<figure>
30-
<caption>This is the caption of figure 2.</caption>
31-
</figure>
32-
</document>
17+
<otsl><fcel>Product<fcel>Years<lcel><nl><ucel><fcel>2016<fcel>2017<nl><fcel>Apple<fcel>49823<fcel>695944<nl><caption>This is the caption of table 1.</caption></otsl>
18+
<picture><caption>This is the caption of figure 1.</caption></picture>
19+
<picture><caption>This is the caption of figure 2.</caption></picture>
20+
</doctag>
Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,20 @@
1-
<document>
2-
<title>Title of the Document</title>
1+
<doctag><title>Title of the Document</title>
32
<text>Author 1
43
Affiliation 1</text>
54
<text>Author 2
65
Affiliation 2</text>
76
<section_header_level_1>1. Introduction</section_header_level_1>
87
<text>This paper introduces the biggest invention ever made. ...</text>
9-
<unordered_list>
10-
<list_item>list item 1</list_item>
8+
<unordered_list><list_item>list item 1</list_item>
119
<list_item>list item 2</list_item>
1210
<list_item>list item 3</list_item>
13-
<ordered_list>
14-
<list_item>list item 3.a</list_item>
11+
<ordered_list><list_item>list item 3.a</list_item>
1512
<list_item>list item 3.b</list_item>
1613
<list_item>list item 3.c</list_item>
1714
</ordered_list>
1815
<list_item>list item 4</list_item>
1916
</unordered_list>
20-
<table>
21-
<caption>This is the caption of table 1.</caption>
22-
<row_0><col_0><body>Product</col_0><col_1><body>Years</col_1><col_2><body>Years</col_2></row_0>
23-
<row_1><col_0><body>Product</col_0><col_1><body>2016</col_1><col_2><body>2017</col_2></row_1>
24-
<row_2><col_0><body>Apple</col_0><col_1><body>49823</col_1><col_2><body>695944</col_2></row_2>
25-
</table>
26-
<figure>
27-
<caption>This is the caption of figure 1.</caption>
28-
</figure>
29-
<figure>
30-
<caption>This is the caption of figure 2.</caption>
31-
</figure>
32-
</document>
17+
<otsl><fcel>Product<fcel>Years<lcel><nl><ucel><fcel>2016<fcel>2017<nl><fcel>Apple<fcel>49823<fcel>695944<nl><caption>This is the caption of table 1.</caption></otsl>
18+
<picture><caption>This is the caption of figure 1.</caption></picture>
19+
<picture><caption>This is the caption of figure 2.</caption></picture>
20+
</doctag>
Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,20 @@
1-
<document>
2-
<title>Title of the Document</title>
1+
<doctag><title>Title of the Document</title>
32
<text>Author 1
43
Affiliation 1</text>
54
<text>Author 2
65
Affiliation 2</text>
76
<section_header_level_1>1. Introduction</section_header_level_1>
87
<text>This paper introduces the biggest invention ever made. ...</text>
9-
<unordered_list>
10-
<list_item>list item 1</list_item>
8+
<unordered_list><list_item>list item 1</list_item>
119
<list_item>list item 2</list_item>
1210
<list_item>list item 3</list_item>
13-
<ordered_list>
14-
<list_item>list item 3.a</list_item>
11+
<ordered_list><list_item>list item 3.a</list_item>
1512
<list_item>list item 3.b</list_item>
1613
<list_item>list item 3.c</list_item>
1714
</ordered_list>
1815
<list_item>list item 4</list_item>
1916
</unordered_list>
20-
<table>
21-
<caption>This is the caption of table 1.</caption>
22-
<row_0><col_0><body>Product</col_0><col_1><body>Years</col_1><col_2><body>Years</col_2></row_0>
23-
<row_1><col_0><body>Product</col_0><col_1><body>2016</col_1><col_2><body>2017</col_2></row_1>
24-
<row_2><col_0><body>Apple</col_0><col_1><body>49823</col_1><col_2><body>695944</col_2></row_2>
25-
</table>
26-
<figure>
27-
<caption>This is the caption of figure 1.</caption>
28-
</figure>
29-
<figure>
30-
<caption>This is the caption of figure 2.</caption>
31-
</figure>
32-
</document>
17+
<otsl><fcel>Product<fcel>Years<lcel><nl><ucel><fcel>2016<fcel>2017<nl><fcel>Apple<fcel>49823<fcel>695944<nl><caption>This is the caption of table 1.</caption></otsl>
18+
<picture><caption>This is the caption of figure 1.</caption></picture>
19+
<picture><caption>This is the caption of figure 2.</caption></picture>
20+
</doctag>

test/data/doc/dummy_doc.yaml.dt

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,4 @@
1-
<document>
2-
<title><location><page_1><loc_8><loc_91><loc_81><loc_95></location>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</title>
3-
<figure>
4-
<location><page_1><loc_59><loc_0><loc_91><loc_75></location>
5-
<caption>Figure 1: Four examples of complex page layouts across different document categories</caption>
6-
</figure>
7-
<table>
8-
<location><page_1><loc_42><loc_57><loc_49><loc_61></location>
9-
</table>
10-
</document>
1+
<doctag><title><loc_42><loc_26><loc_406><loc_46>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</title>
2+
<picture><loc_297><loc_125><loc_457><loc_500><illustration><caption><loc_210><loc_196><loc_245><loc_213>Figure 1: Four examples of complex page layouts across different document categories</caption></picture>
3+
<otsl><loc_210><loc_196><loc_245><loc_213></otsl>
4+
</doctag>

test/test_docling_doc.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,6 @@ def _test_serialize_and_reload(doc):
377377

378378

379379
def _verify_regression_test(pred: str, filename: str, ext: str):
380-
381380
if os.path.exists(filename + f".{ext}") and not GENERATE:
382381
with open(filename + f".{ext}", "r", encoding="utf-8") as fr:
383382
gt_true = fr.read()
@@ -389,15 +388,15 @@ def _verify_regression_test(pred: str, filename: str, ext: str):
389388

390389

391390
def _test_export_methods(doc: DoclingDocument, filename: str):
392-
### Iterate all elements
391+
# Iterate all elements
393392
et_pred = doc.export_to_element_tree()
394393
_verify_regression_test(et_pred, filename=filename, ext="et")
395394

396-
## Export stuff
395+
# Export stuff
397396
md_pred = doc.export_to_markdown()
398397
_verify_regression_test(md_pred, filename=filename, ext="md")
399398

400-
# Test HTML export ...
399+
# Test sHTML export ...
401400
html_pred = doc.export_to_html()
402401
_verify_regression_test(html_pred, filename=filename, ext="html")
403402

0 commit comments

Comments
 (0)