Skip to content

Commit 180e294

Browse files
maxmnemonic and Maksym Lysak authored
feat: Export to OTSL method for docling doc tables (#86)
Signed-off-by: Maksym Lysak <[email protected]> Co-authored-by: Maksym Lysak <[email protected]>
1 parent 4e0cb3f commit 180e294

File tree

4 files changed

+580
-3
lines changed

4 files changed

+580
-3
lines changed

docling_core/types/doc/document.py

Lines changed: 94 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@
3737
from docling_core.types.doc import BoundingBox, Size
3838
from docling_core.types.doc.base import ImageRefMode
3939
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
40+
from docling_core.types.doc.tokens import DocumentToken, TableToken
4041
from docling_core.types.doc.utils import relative_path
41-
from docling_core.types.legacy_doc.tokens import DocumentToken
4242

4343
Uint64 = typing.Annotated[int, Field(ge=0, le=(2**64 - 1))]
4444
LevelNumber = typing.Annotated[int, Field(ge=1, le=100)]
@@ -1008,7 +1008,6 @@ def export_to_html(
10081008
DeprecationWarning,
10091009
)
10101010

1011-
body = ""
10121011
nrows = self.data.num_rows
10131012
ncols = self.data.num_cols
10141013

@@ -1065,6 +1064,99 @@ def export_to_html(
10651064

10661065
return body
10671066

1067+
def export_to_otsl(
    self,
    doc: "DoclingDocument",
    add_cell_location: bool = True,
    add_cell_text: bool = True,
    xsize: int = 100,
    ysize: int = 100,
) -> str:
    """Export the table as a flat OTSL token string.

    The table grid is walked row by row. Every grid position emits one
    structural token and every row is terminated by a new-line token.
    Non-empty anchor cells (the top-left position of a cell) are optionally
    followed by a location string and by the stripped cell text.

    Args:
        doc: Document owning this table; its ``pages`` mapping provides the
            page size used to normalise cell bounding boxes.
        add_cell_location: Emit a location string after each non-empty
            anchor cell that has a bounding box.
        add_cell_text: Emit the stripped cell text after each non-empty
            anchor cell.
        xsize: Horizontal normalisation range passed to
            ``DocumentToken.get_location``.
        ysize: Vertical normalisation range passed to
            ``DocumentToken.get_location``.

    Returns:
        The concatenated OTSL tokens, or ``""`` when the table has no cells.
    """
    # Possible OTSL tokens...
    #
    # Empty and full cells:
    # "ecel", "fcel"
    #
    # Cell spans (horizontal, vertical, 2d):
    # "lcel", "ucel", "xcel"
    #
    # New line:
    # "nl"
    #
    # Headers (column, row, section row):
    # "ched", "rhed", "srow"

    if len(self.data.table_cells) == 0:
        return ""

    nrows = self.data.num_rows
    ncols = self.data.num_cols
    # Read the grid once instead of re-evaluating the attribute per cell.
    grid = self.data.grid

    page_no = 0
    if len(self.prov) > 0:
        page_no = self.prov[0].page_no

    # Resolve the page size once. It stays None when locations are not
    # requested or when the page is unknown to the document (no pages at
    # all, or no provenance so page_no is still 0); in that case location
    # strings are simply omitted instead of failing on an unbound name or
    # a missing key.
    page_size = None
    if add_cell_location and page_no in doc.pages:
        page_size = doc.pages[page_no].size.as_tuple()

    body = []
    for i in range(nrows):
        for j in range(ncols):
            cell: TableCell = grid[i][j]
            rowspan, rowstart = cell.row_span, cell.start_row_offset_idx
            colspan, colstart = cell.col_span, cell.start_col_offset_idx

            if rowstart == i and colstart == j:
                # Anchor position of the cell.
                content = cell.text.strip()
                if len(content) == 0:
                    body.append(str(TableToken.OTSL_ECEL.value))
                    continue

                if cell.column_header:
                    body.append(str(TableToken.OTSL_CHED.value))
                elif cell.row_header:
                    body.append(str(TableToken.OTSL_RHED.value))
                elif cell.row_section:
                    body.append(str(TableToken.OTSL_SROW.value))
                else:
                    body.append(str(TableToken.OTSL_FCEL.value))

                # The location is only computed where it is emitted.
                if page_size is not None and cell.bbox is not None:
                    page_w, page_h = page_size
                    body.append(
                        DocumentToken.get_location(
                            bbox=cell.bbox.to_bottom_left_origin(page_h).as_tuple(),
                            page_w=page_w,
                            page_h=page_h,
                            xsize=xsize,
                            ysize=ysize,
                            page_i=page_no,
                        )
                    )
                if add_cell_text:
                    body.append(str(content))
            else:
                # Continuation of a spanning cell.
                add_cross_cell = False
                if rowstart != i:
                    if colspan == 1:
                        body.append(str(TableToken.OTSL_UCEL.value))
                    else:
                        add_cross_cell = True
                if colstart != j:
                    if rowspan == 1:
                        body.append(str(TableToken.OTSL_LCEL.value))
                    else:
                        add_cross_cell = True
                if add_cross_cell:
                    body.append(str(TableToken.OTSL_XCEL.value))
        body.append(str(TableToken.OTSL_NL.value))

    return "".join(body)
1159+
10681160
def export_to_document_tokens(
10691161
self,
10701162
doc: "DoclingDocument",

docling_core/types/doc/tokens.py

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
#
2+
# Copyright IBM Corp. 2024 - 2024
3+
# SPDX-License-Identifier: MIT
4+
#
5+
6+
"""Tokens used in the docling document model."""
7+
8+
from enum import Enum
9+
from typing import Annotated, Tuple
10+
11+
from pydantic import Field
12+
13+
14+
class TableToken(Enum):
    """Special tokens for an LLM-friendly representation of a table.

    Contains the cell-label tokens and the OTSL structure tokens.
    """

    CELL_LABEL_COLUMN_HEADER = "<column_header>"
    CELL_LABEL_ROW_HEADER = "<row_header>"
    # The misspelled name is kept as the canonical member so existing
    # references (and ``.name``) keep working.
    CELL_LABEL_SECTION_HEADERE = "<section_header>"
    # Correctly spelled alias of the member above. Enum aliases share the
    # same member object and are skipped during iteration, so
    # ``get_special_tokens`` is unaffected.
    CELL_LABEL_SECTION_HEADER = "<section_header>"
    CELL_LABEL_DATA = "<data>"

    OTSL_ECEL = "<ecel>"  # empty cell
    OTSL_FCEL = "<fcel>"  # cell with content
    OTSL_LCEL = "<lcel>"  # left looking cell
    OTSL_UCEL = "<ucel>"  # up looking cell
    OTSL_XCEL = "<xcel>"  # 2d extension cell (cross cell)
    OTSL_NL = "<nl>"  # new line
    OTSL_CHED = "<ched>"  # column header cell
    OTSL_RHED = "<rhed>"  # row header cell
    OTSL_SROW = "<srow>"  # section row cell

    @classmethod
    def get_special_tokens(cls) -> list[str]:
        """Return the string values of all table tokens in definition order."""
        return [token.value for token in cls]

    @staticmethod
    def is_known_token(label) -> bool:
        """Return True if ``label`` equals the value of one of the table tokens."""
        return label in TableToken.get_special_tokens()
42+
43+
44+
class DocumentToken(Enum):
    """Special tokens for an LLM-friendly representation of a document.

    The enum members are the fixed begin/end tags. The static helpers build
    the dynamically numbered row, column, page and location tokens.
    """

    BEG_DOCUMENT = "<document>"
    END_DOCUMENT = "</document>"

    BEG_TITLE = "<title>"
    END_TITLE = "</title>"

    BEG_ABSTRACT = "<abstract>"
    END_ABSTRACT = "</abstract>"

    BEG_DOI = "<doi>"
    END_DOI = "</doi>"
    BEG_DATE = "<date>"
    END_DATE = "</date>"

    BEG_AUTHORS = "<authors>"
    END_AUTHORS = "</authors>"
    BEG_AUTHOR = "<author>"
    END_AUTHOR = "</author>"

    BEG_AFFILIATIONS = "<affiliations>"
    END_AFFILIATIONS = "</affiliations>"
    BEG_AFFILIATION = "<affiliation>"
    END_AFFILIATION = "</affiliation>"

    BEG_HEADER = "<section-header>"
    END_HEADER = "</section-header>"
    BEG_TEXT = "<text>"
    END_TEXT = "</text>"
    BEG_PARAGRAPH = "<paragraph>"
    END_PARAGRAPH = "</paragraph>"
    BEG_TABLE = "<table>"
    END_TABLE = "</table>"
    BEG_FIGURE = "<figure>"
    END_FIGURE = "</figure>"
    BEG_CAPTION = "<caption>"
    END_CAPTION = "</caption>"
    BEG_EQUATION = "<equation>"
    END_EQUATION = "</equation>"
    BEG_LIST = "<list>"
    END_LIST = "</list>"
    BEG_LISTITEM = "<list-item>"
    END_LISTITEM = "</list-item>"

    BEG_LOCATION = "<location>"
    END_LOCATION = "</location>"
    BEG_GROUP = "<group>"
    END_GROUP = "</group>"

    @classmethod
    def get_special_tokens(
        cls,
        max_rows: int = 100,
        max_cols: int = 100,
        max_pages: int = 1000,
        page_dimension: Tuple[int, int] = (100, 100),
    ) -> list[str]:
        """Return all special document tokens.

        The list starts with the enum values and is followed, in this order,
        by the generated row, column, section-header, subtitle-level, page
        and location tokens.

        Args:
            max_rows: Highest row index (inclusive) to generate tags for.
            max_cols: Highest column index (inclusive) to generate tags for.
            max_pages: Highest page index (inclusive) to generate tags for.
            page_dimension: Location tokens are generated for indices from 0
                up to and including the larger of the two values.
        """
        special_tokens = [token.value for token in cls]

        # Adding dynamically generated row and col tokens
        for i in range(max_rows + 1):
            special_tokens += [f"<row_{i}>", f"</row_{i}>"]

        for i in range(max_cols + 1):
            special_tokens += [f"<col_{i}>", f"</col_{i}>"]

        for i in range(6):
            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]

        # FIXME: this is synonym of section header
        for i in range(6):
            special_tokens += [f"<subtitle-level-{i}>", f"</subtitle-level-{i}>"]

        # Adding dynamically generated page-tokens
        for i in range(max_pages + 1):
            special_tokens += [f"<page_{i}>", f"</page_{i}>"]

        # Adding dynamically generated location-tokens
        for i in range(max(page_dimension) + 1):
            special_tokens.append(f"<loc_{i}>")

        return special_tokens

    @staticmethod
    def is_known_token(label) -> bool:
        """Return True if ``label`` is among the default special tokens.

        Only the tokens produced by ``get_special_tokens()`` with its default
        limits are considered.
        """
        return label in DocumentToken.get_special_tokens()

    @staticmethod
    def get_row_token(row: int, beg: bool = True) -> str:
        """Return the begin (``beg`` true) or end tag for row ``row``."""
        # ``beg`` was previously declared as ``beg=bool``: the type object was
        # the default value. It is truthy, so ``True`` keeps the old default.
        if beg:
            return f"<row_{row}>"
        else:
            return f"</row_{row}>"

    @staticmethod
    def get_col_token(col: int, beg: bool = True) -> str:
        """Return the begin (``beg`` true) or end tag for column ``col``."""
        # Same ``beg=bool`` fix as in ``get_row_token``.
        if beg:
            return f"<col_{col}>"
        else:
            return f"</col_{col}>"

    @staticmethod
    def get_page_token(page: int) -> str:
        """Return the begin tag for page ``page``."""
        return f"<page_{page}>"

    @staticmethod
    def get_location_token(val: float, rnorm: int = 100) -> str:
        """Return the location token for a normalised coordinate.

        ``val`` is scaled by ``rnorm``, rounded, and clamped to the range
        ``[0, rnorm]``.
        """
        val_ = round(rnorm * val)

        if val_ < 0:
            return "<loc_0>"

        if val_ > rnorm:
            return f"<loc_{rnorm}>"

        return f"<loc_{val_}>"

    @staticmethod
    def get_location(
        bbox: Tuple[float, float, float, float],
        page_w: float,
        page_h: float,
        xsize: int = 100,
        ysize: int = 100,
        page_i: int = -1,
    ) -> str:
        """Get the location string given a bbox and the page dimensions.

        Args:
            bbox: Box as ``(x0, y0, x1, y1)`` with ``x0 <= x1`` and
                ``y0 <= y1``; only indices 0 to 3 are read.
            page_w: Page width used to normalise the x coordinates.
            page_h: Page height used to normalise the y coordinates.
            xsize: Normalisation range of the x location tokens.
            ysize: Normalisation range of the y location tokens.
            page_i: Page index; a page token is included unless it is ``-1``.

        Returns:
            ``<location>`` + optional page token + four location tokens +
            ``</location>``.

        Raises:
            AssertionError: If the bbox coordinates are not ordered.
        """
        # Kept as assertions so the exception type seen by callers is
        # unchanged.
        assert bbox[0] <= bbox[2], f"bbox[0]<=bbox[2] => {bbox[0]}<={bbox[2]}"
        assert bbox[1] <= bbox[3], f"bbox[1]<=bbox[3] => {bbox[1]}<={bbox[3]}"

        x0 = bbox[0] / page_w
        y0 = bbox[1] / page_h
        x1 = bbox[2] / page_w
        y1 = bbox[3] / page_h

        page_tok = ""
        if page_i != -1:
            page_tok = DocumentToken.get_page_token(page=page_i)

        x0_tok = DocumentToken.get_location_token(val=min(x0, x1), rnorm=xsize)
        y0_tok = DocumentToken.get_location_token(val=min(y0, y1), rnorm=ysize)
        x1_tok = DocumentToken.get_location_token(val=max(x0, x1), rnorm=xsize)
        y1_tok = DocumentToken.get_location_token(val=max(y0, y1), rnorm=ysize)

        loc_str = f"{DocumentToken.BEG_LOCATION.value}"
        loc_str += f"{page_tok}{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
        loc_str += f"{DocumentToken.END_LOCATION.value}"

        return loc_str

test/test_docling_doc.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636

3737

3838
def test_doc_origin():
39-
4039
doc_origin = DocumentOrigin(
4140
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
4241
filename="myfile.pdf",

0 commit comments

Comments
 (0)