Skip to content

Commit 891530f

Browse files
feat: add export to doctags for document components (#25)
* feat: refactoring doc-tokens in new file and add new export function to table
  Signed-off-by: Peter Staar <[email protected]>
* reformatted and fixed bugs
  Signed-off-by: Peter Staar <[email protected]>
* working on table exporting in document tokens
  Signed-off-by: Peter Staar <[email protected]>
* updated code and tests, need to decide what to commit as test-cases
  Signed-off-by: Peter Staar <[email protected]>
* updated the test cases
  Signed-off-by: Peter Staar <[email protected]>
* refactored the test-files
  Signed-off-by: Peter Staar <[email protected]>
* refactored duplicate code
  Signed-off-by: Peter Staar <[email protected]>
* refactored duplicate code (boundingbox to figure and table)
  Signed-off-by: Peter Staar <[email protected]>
* refactored duplicate code (making BaseCell the base-class of BaseText)
  Signed-off-by: Peter Staar <[email protected]>
* refactored duplicate code (re-use of function in BaseCell)
  Signed-off-by: Peter Staar <[email protected]>
* fixed issues from review (2)
  Signed-off-by: Peter Staar <[email protected]>
---------
Signed-off-by: Peter Staar <[email protected]>
1 parent 752cbc3 commit 891530f

15 files changed

+1706
-1273
lines changed

docling_core/types/doc/base.py

Lines changed: 205 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from pydantic import BaseModel, Field, PositiveInt, StrictStr
1111

1212
from docling_core.search.mapping import es_field
13+
from docling_core.types.doc.tokens import DocumentToken
1314
from docling_core.utils.alias import AliasModel
1415

1516
CellData = tuple[float, float, float, float, str, str]
@@ -132,10 +133,6 @@ class GlmTableCell(TableCell):
132133
class BaseCell(AliasModel):
133134
"""Base cell."""
134135

135-
# FIXME: we need to check why we have bounding_box (this should be in prov)
136-
bounding_box: Optional[BoundingBoxContainer] = Field(
137-
default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
138-
)
139136
prov: Optional[list[Prov]] = None
140137
text: Optional[str] = Field(
141138
default=None, json_schema_extra=es_field(term_vector="with_positions_offsets")
@@ -144,6 +141,38 @@ class BaseCell(AliasModel):
144141
alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
145142
)
146143

144+
def get_location_tokens(
    self,
    new_line: str,
    page_w: float,
    page_h: float,
    xsize: int = 100,
    ysize: int = 100,
    add_page_index: bool = True,
) -> str:
    """Get the location string for the BaseCell.

    One location string is requested from ``DocumentToken.get_location`` for
    every provenance entry in ``self.prov``; each one is followed by
    ``new_line`` and the results are concatenated in provenance order.

    Args:
        new_line: Text appended after every location string.
        page_w: Forwarded as ``page_w`` to ``DocumentToken.get_location``.
        page_h: Forwarded as ``page_h`` to ``DocumentToken.get_location``.
        xsize: Forwarded as ``xsize`` to ``DocumentToken.get_location``.
        ysize: Forwarded as ``ysize`` to ``DocumentToken.get_location``.
        add_page_index: When True, the ``page`` of each provenance entry is
            passed as ``page_i``; otherwise ``-1`` is passed.

    Returns:
        The concatenated location strings, or ``""`` when the cell has no
        provenance entries.
    """
    # Covers both ``None`` and an empty list; either way nothing is emitted.
    if not self.prov:
        return ""

    pieces = []
    for prov in self.prov:
        loc_str = DocumentToken.get_location(
            bbox=prov.bbox,
            page_w=page_w,
            page_h=page_h,
            xsize=xsize,
            ysize=ysize,
            page_i=prov.page if add_page_index else -1,
        )
        pieces.append(f"{loc_str}{new_line}")

    # Join once instead of growing a string with ``+=`` on every iteration.
    return "".join(pieces)
175+
147176

148177
class Table(BaseCell):
149178
"""Table."""
@@ -153,6 +182,11 @@ class Table(BaseCell):
153182
data: Optional[list[list[Union[GlmTableCell, TableCell]]]] = None
154183
model: Optional[str] = None
155184

185+
# FIXME: we need to check why we have bounding_box (this should be in prov)
186+
bounding_box: Optional[BoundingBoxContainer] = Field(
187+
default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
188+
)
189+
156190
def _get_tablecell_span(self, cell: TableCell, ix: int):
157191
if cell.spans is None:
158192
span = set()
@@ -249,26 +283,185 @@ def export_to_html(self) -> str:
249283

250284
return body
251285

286+
def export_to_document_tokens(
    self,
    new_line: str = "\n",
    page_w: float = 0.0,
    page_h: float = 0.0,
    xsize: int = 100,
    ysize: int = 100,
    add_location: bool = True,
    add_caption: bool = True,
    add_content: bool = True,
    add_cell_location: bool = True,
    add_cell_label: bool = True,
    add_cell_text: bool = True,
    add_page_index: bool = True,
) -> str:
    """Export table to document tokens format.

    The output starts with ``DocumentToken.BEG_TABLE`` and ends with
    ``DocumentToken.END_TABLE``. In between it optionally contains the
    table's location tokens, its caption (``self.text``) and one
    ``<row_i>...</row_i>`` element per row of ``self.data``, each holding
    ``<col_j>...</col_j>`` elements for the cells.

    Args:
        new_line: Text appended after the begin/end tokens, the location
            tokens, the caption and every row.
        page_w: Forwarded as ``page_w`` to the location helpers.
        page_h: Forwarded as ``page_h`` to the location helpers.
        xsize: Forwarded as ``xsize`` to the location helpers.
        ysize: Forwarded as ``ysize`` to the location helpers.
        add_location: Emit the table's own location tokens.
        add_caption: Emit ``self.text`` wrapped in caption tokens when it is
            non-empty.
        add_content: Emit the rows and cells of ``self.data``.
        add_cell_location: Emit a location token for every cell that has a
            ``bbox``.
        add_cell_label: Emit ``<obj_type>`` for cells with a non-empty
            ``obj_type``.
        add_cell_text: Emit the stripped cell text.
        add_page_index: Use the page of the first provenance entry as the
            page index of cell locations; when False ``-1`` is used.

    Returns:
        The serialized table.
    """
    parts = [f"{DocumentToken.BEG_TABLE.value}{new_line}"]

    if add_location:
        parts.append(
            self.get_location_tokens(
                new_line=new_line,
                page_w=page_w,
                page_h=page_h,
                xsize=xsize,
                ysize=ysize,
                add_page_index=add_page_index,
            )
        )

    if add_caption and self.text:
        parts.append(
            f"{DocumentToken.BEG_CAPTION.value}"
            f"{self.text.strip()}"
            f"{DocumentToken.END_CAPTION.value}"
            f"{new_line}"
        )

    # The page index used for cell locations is the same for every cell, so
    # it is resolved once here rather than inside the nested loop.
    emit_cell_location = add_cell_location
    cell_page_i = -1
    if add_page_index:
        if self.prov:
            cell_page_i = self.prov[0].page
        else:
            # A page index was requested but the table has no provenance to
            # take it from: no cell location is emitted in that case.
            emit_cell_location = False

    if add_content and self.data:
        for i, row in enumerate(self.data):
            parts.append(f"<row_{i}>")
            for j, col in enumerate(row):
                text = col.text.strip() if add_cell_text else ""

                cell_loc = ""
                if emit_cell_location and col.bbox is not None:
                    cell_loc = DocumentToken.get_location(
                        bbox=col.bbox,
                        page_w=page_w,
                        page_h=page_h,
                        xsize=xsize,
                        ysize=ysize,
                        page_i=cell_page_i,
                    )

                cell_label = ""
                if add_cell_label and col.obj_type:
                    cell_label = f"<{col.obj_type}>"

                parts.append(f"<col_{j}>{cell_loc}{cell_label}{text}</col_{j}>")

            parts.append(f"</row_{i}>{new_line}")

    parts.append(f"{DocumentToken.END_TABLE.value}{new_line}")

    return "".join(parts)
374+
252375

253376
# FIXME: let's add some figure specific data-types later
254377
class Figure(BaseCell):
    """Figure."""

    # FIXME: we need to check why we have bounding_box (this should be in prov)
    bounding_box: Optional[BoundingBoxContainer] = Field(
        default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
    )

    def export_to_document_tokens(
        self,
        new_line: str = "\n",
        page_w: float = 0.0,
        page_h: float = 0.0,
        xsize: int = 100,
        ysize: int = 100,
        add_location: bool = True,
        add_caption: bool = True,
        add_content: bool = True,  # not used at the moment
        add_page_index: bool = True,
    ):
        """Export figure to document tokens format.

        The result is the figure begin token, optionally the location tokens
        of the figure, optionally the caption (``self.text``) wrapped in
        caption tokens, and finally the figure end token. ``new_line`` is
        appended after each of these pieces.
        """
        chunks = [f"{DocumentToken.BEG_FIGURE.value}{new_line}"]

        if add_location:
            location = self.get_location_tokens(
                new_line=new_line,
                page_w=page_w,
                page_h=page_h,
                xsize=xsize,
                ysize=ysize,
                add_page_index=add_page_index,
            )
            chunks.append(location)

        # Only a present, non-empty caption is written out.
        caption_wanted = add_caption and self.text is not None
        if caption_wanted and len(self.text) > 0:
            chunks.append(f"{DocumentToken.BEG_CAPTION.value}")
            chunks.append(f"{self.text.strip()}")
            chunks.append(f"{DocumentToken.END_CAPTION.value}")
            chunks.append(f"{new_line}")

        chunks.append(f"{DocumentToken.END_FIGURE.value}{new_line}")

        return "".join(chunks)
419+
257420

258-
class BaseText(AliasModel):
421+
class BaseText(BaseCell):
259422
"""Base model for text objects."""
260423

261-
text: StrictStr = Field(
262-
json_schema_extra=es_field(term_vector="with_positions_offsets")
263-
)
264-
obj_type: StrictStr = Field(
265-
alias="type", json_schema_extra=es_field(type="keyword", ignore_above=8191)
266-
)
424+
# FIXME: do we need these ???
267425
name: Optional[StrictStr] = Field(
268426
default=None, json_schema_extra=es_field(type="keyword", ignore_above=8191)
269427
)
270428
font: Optional[str] = None
271-
prov: Optional[list[Prov]] = None
429+
430+
def export_to_document_tokens(
    self,
    new_line: str = "\n",
    page_w: float = 0.0,
    page_h: float = 0.0,
    xsize: int = 100,
    ysize: int = 100,
    add_location: bool = True,
    add_content: bool = True,
    add_page_index: bool = True,
) -> str:
    """Export text element to document tokens format.

    The element is serialized as ``<obj_type>``, optionally followed by its
    location tokens and its stripped text, and closed with ``</obj_type>``
    plus ``new_line``.

    Args:
        new_line: Text appended after the closing tag. The location tokens
            themselves are emitted without a separator.
        page_w: Forwarded as ``page_w`` to ``get_location_tokens``.
        page_h: Forwarded as ``page_h`` to ``get_location_tokens``.
        xsize: Forwarded as ``xsize`` to ``get_location_tokens``.
        ysize: Forwarded as ``ysize`` to ``get_location_tokens``.
        add_location: Emit the location tokens of the element.
        add_content: Emit ``self.text`` (stripped) when it is not None.
        add_page_index: Forwarded to ``get_location_tokens``.

    Returns:
        The serialized text element.

    Raises:
        AssertionError: If ``<obj_type>`` is rejected by
            ``DocumentToken.is_known_token``.
    """
    open_tag = f"<{self.obj_type}>"

    # Raised explicitly rather than via ``assert`` so the validation is not
    # removed when Python runs with ``-O``; the exception type is unchanged.
    if not DocumentToken.is_known_token(open_tag):
        raise AssertionError(f"failed DocumentToken.is_known_token({open_tag})")

    parts = [open_tag]

    if add_location:
        parts.append(
            self.get_location_tokens(
                new_line="",
                page_w=page_w,
                page_h=page_h,
                xsize=xsize,
                ysize=ysize,
                add_page_index=add_page_index,
            )
        )

    if add_content and self.text is not None:
        parts.append(self.text.strip())

    parts.append(f"</{self.obj_type}>{new_line}")

    return "".join(parts)
272465

273466

274467
class ListItem(BaseText):

0 commit comments

Comments
 (0)