Skip to content

Commit acdf816

Browse files
feat: added the XML export (#16)
* added the XML export Signed-off-by: Peter Staar <[email protected]> * reformatted all Signed-off-by: Peter Staar <[email protected]> * fixed tests Signed-off-by: Peter Staar <[email protected]> * added the DocumentTokens class Signed-off-by: Peter Staar <[email protected]> * updating the to-xml method Signed-off-by: Peter Staar <[email protected]> * updating the to-xml method Signed-off-by: Peter Staar <[email protected]> * fixed the to-md method Signed-off-by: Peter Staar <[email protected]> * added the strict-text in the to-md method Signed-off-by: Peter Staar <[email protected]> * added page-tokens Signed-off-by: Peter Staar <[email protected]> * updated the location/page tokens Signed-off-by: Peter Staar <[email protected]> --------- Signed-off-by: Peter Staar <[email protected]>
1 parent 25af125 commit acdf816

File tree

8 files changed

+565
-44
lines changed

8 files changed

+565
-44
lines changed

docling_core/types/doc/base.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ class GlmTableCell(TableCell):
131131
class BaseCell(AliasModel):
132132
"""Base cell."""
133133

134+
# FIXME: we need to check why we have bounding_box (this should be in prov)
134135
bounding_box: Optional[BoundingBoxContainer] = Field(
135136
default=None, alias="bounding-box", json_schema_extra=es_field(suppress=True)
136137
)
@@ -152,6 +153,11 @@ class Table(BaseCell):
152153
model: Optional[str] = None
153154

154155

156+
# FIXME: let's add some figure specific data-types later
157+
class Figure(BaseCell):
    """Figure cell.

    Adds no fields of its own yet; it exists as a distinct subclass of
    ``BaseCell`` so that figures have their own type.
    """
159+
160+
155161
class BaseText(AliasModel):
156162
"""Base model for text objects."""
157163

docling_core/types/doc/document.py

Lines changed: 282 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
"""Models for the Docling Document data type."""
77

88
from datetime import datetime
9-
from typing import Generic, Optional, Union
9+
from enum import Enum
10+
from typing import Generic, Optional, Tuple, Union
1011

1112
from pydantic import (
1213
AnyHttpUrl,
@@ -35,6 +36,7 @@
3536
BaseCell,
3637
BaseText,
3738
BitmapObject,
39+
Figure,
3840
PageDimensions,
3941
PageReference,
4042
Ref,
@@ -275,7 +277,7 @@ class MinimalDocument(
275277
main_text: Optional[list[Union[Ref, BaseText]]] = Field(
276278
default=None, alias="main-text"
277279
)
278-
figures: Optional[list[BaseCell]] = None
280+
figures: Optional[list[Figure]] = None
279281
tables: Optional[list[Table]] = None
280282

281283

@@ -345,6 +347,107 @@ def from_dict(cls, data):
345347
return data
346348

347349

350+
class DocumentToken(Enum):
    """Special tokens for the LLM-friendly serialization of a document.

    Each member's value is the literal opening or closing tag string. Besides
    these static tags, the helper methods below produce the dynamically
    numbered row, column, section-header, page and location tokens.
    """

    BEG_DOCUMENT = "<document>"
    END_DOCUMENT = "</document>"

    BEG_TITLE = "<title>"
    END_TITLE = "</title>"

    BEG_ABSTRACT = "<abstract>"
    END_ABSTRACT = "</abstract>"

    BEG_DOI = "<doi>"
    END_DOI = "</doi>"
    BEG_DATE = "<date>"
    END_DATE = "</date>"

    BEG_AUTHORS = "<authors>"
    END_AUTHORS = "</authors>"
    BEG_AUTHOR = "<author>"
    END_AUTHOR = "</author>"

    BEG_AFFILIATIONS = "<affiliations>"
    END_AFFILIATIONS = "</affiliations>"
    BEG_AFFILIATION = "<affiliation>"
    END_AFFILIATION = "</affiliation>"

    BEG_HEADER = "<section-header>"
    END_HEADER = "</section-header>"
    BEG_TEXT = "<text>"
    END_TEXT = "</text>"
    BEG_PARAGRAPH = "<paragraph>"
    END_PARAGRAPH = "</paragraph>"
    BEG_TABLE = "<table>"
    END_TABLE = "</table>"
    BEG_FIGURE = "<figure>"
    END_FIGURE = "</figure>"
    BEG_CAPTION = "<caption>"
    END_CAPTION = "</caption>"
    BEG_EQUATION = "<equation>"
    END_EQUATION = "</equation>"
    BEG_LIST = "<list>"
    END_LIST = "</list>"
    BEG_LISTITEM = "<list-item>"
    END_LISTITEM = "</list-item>"

    BEG_LOCATION = "<location>"
    END_LOCATION = "</location>"
    BEG_GROUP = "<group>"
    END_GROUP = "</group>"

    @classmethod
    def get_special_tokens(
        cls,
        max_rows: int = 100,
        max_cols: int = 100,
        max_pages: int = 1000,
        page_dimension: Tuple[int, int] = (100, 100),
    ) -> list[str]:
        """Return all special document tokens, static and generated.

        Args:
            max_rows: Number of ``<row_i>``/``</row_i>`` token pairs to
                generate (``i`` in ``0..max_rows-1``).
            max_cols: Number of ``<col_i>``/``</col_i>`` token pairs to
                generate (``i`` in ``0..max_cols-1``).
            max_pages: Number of ``<page_i>`` tokens to generate
                (``i`` in ``0..max_pages-1``).
            page_dimension: Normalization grid ``(width, height)``; location
                tokens ``<loc_0>`` up to and including ``<loc_{max(w, h)}>``
                are generated.

        Returns:
            The enum values in definition order, followed by the row, column,
            section-header, page and location tokens.
        """
        special_tokens = [token.value for token in cls]

        # Dynamically generated row and col tokens
        for i in range(max_rows):
            special_tokens += [f"<row_{i}>", f"</row_{i}>"]

        for i in range(max_cols):
            special_tokens += [f"<col_{i}>", f"</col_{i}>"]

        for i in range(6):
            special_tokens += [f"<section-header-{i}>", f"</section-header-{i}>"]

        # Dynamically generated page tokens
        for i in range(max_pages):
            special_tokens.append(f"<page_{i}>")

        # Dynamically generated location tokens. get_location_token clamps to
        # the *inclusive* upper bound (e.g. "<loc_100>" for rnorm=100), so the
        # upper bound itself has to be part of the vocabulary as well.
        for i in range(max(page_dimension[0], page_dimension[1]) + 1):
            special_tokens.append(f"<loc_{i}>")

        return special_tokens

    @staticmethod
    def get_page_token(page: int) -> str:
        """Return the page token ``<page_{page}>`` for the given page number."""
        return f"<page_{page}>"

    @staticmethod
    def get_location_token(val: float, rnorm: int = 100) -> str:
        """Return the location token for a normalized coordinate.

        Args:
            val: Coordinate as a fraction of the page extent; it is scaled by
                ``rnorm`` and rounded to the nearest integer.
            rnorm: Size of the normalization grid.

        Returns:
            ``<loc_k>`` with ``k`` clamped to the closed range ``[0, rnorm]``.
        """
        val_ = round(rnorm * val)

        # Clamp coordinates that fall outside of the page.
        if val_ < 0:
            return "<loc_0>"

        if val_ > rnorm:
            return f"<loc_{rnorm}>"

        return f"<loc_{val_}>"
449+
450+
348451
class ExportedCCSDocument(
349452
MinimalDocument,
350453
Generic[
@@ -427,6 +530,14 @@ def export_to_markdown(
427530
delim: str = "\n\n",
428531
main_text_start: int = 0,
429532
main_text_stop: Optional[int] = None,
533+
main_text_labels: list[str] = [
534+
"title",
535+
"subtitle-level-1",
536+
"paragraph",
537+
"caption",
538+
"table",
539+
],
540+
strict_text: bool = False,
430541
) -> str:
431542
r"""Serialize to Markdown.
432543
@@ -461,12 +572,7 @@ def export_to_markdown(
461572
continue
462573

463574
item_type = item.obj_type
464-
if isinstance(item, BaseText) and item_type in {
465-
"title",
466-
"subtitle-level-1",
467-
"paragraph",
468-
"caption",
469-
}:
575+
if isinstance(item, BaseText) and item_type in main_text_labels:
470576
text = item.text
471577

472578
# ignore repeated text
@@ -477,20 +583,31 @@ def export_to_markdown(
477583

478584
# first title match
479585
if item_type == "title" and not has_title:
480-
markdown_text = f"# {text}"
586+
if strict_text:
587+
markdown_text = f"{text}"
588+
else:
589+
markdown_text = f"# {text}"
481590
has_title = True
482591

483592
# secondary titles
484593
elif item_type in {"title", "subtitle-level-1"} or (
485594
has_title and item_type == "title"
486595
):
487-
markdown_text = f"## {text}"
596+
if strict_text:
597+
markdown_text = f"{text}"
598+
else:
599+
markdown_text = f"## {text}"
488600

489601
# normal text
490602
else:
491603
markdown_text = text
492604

493-
elif isinstance(item, Table) and item.data:
605+
elif (
606+
isinstance(item, Table)
607+
and item.data
608+
and item_type in main_text_labels
609+
and not strict_text
610+
):
494611
table = []
495612
for row in item.data:
496613
tmp = []
@@ -518,3 +635,157 @@ def export_to_markdown(
518635

519636
result = delim.join(md_texts)
520637
return result
638+
639+
def export_to_document_tokens(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
    # NOTE: shared default list; it is only read below, never mutated.
    main_text_labels: list[str] = [
        "title",
        "subtitle-level-1",
        "paragraph",
        "caption",
        "table",
        "figure",
    ],
    page_tagging: bool = True,
    location_tagging: bool = True,
    location_dimensions: Tuple[int, int] = (100, 100),
    add_new_line: bool = True,
) -> str:
    r"""Export the document content to the DocumentToken format.

    Operates on a slice of the document's main_text as defined through arguments
    main_text_start and main_text_stop; defaulting to the whole main_text.

    Args:
        delim (str, optional): Currently unused by this method; blocks are
            separated by the new line controlled through `add_new_line`.
            Default is two newline characters ("\n\n").
        main_text_start (int, optional): The starting index of the main text to
            be included in the export. Default is 0 (the beginning of the text).
        main_text_stop (Optional[int], optional): The stopping index of the main
            text. If set to None, the export includes text up to the end.
            Default is None.
        main_text_labels (list[str], optional): Only items whose `obj_type` is
            in this list are exported. Default labels are "title",
            "subtitle-level-1", "paragraph", "caption", "table" and "figure".
        page_tagging (bool, optional): Whether to put a page token in front of
            the location tokens. It only has an effect when a location block
            is emitted for the item. Default is True.
        location_tagging (bool, optional): Whether to emit a location block
            (taken from the first provenance entry of the item) inside each
            exported element. It is only emitted when the document has page
            dimensions and the item has provenance. Default is True.
        location_dimensions (Tuple[int, int], optional): Normalization grid
            (width, height) used for the location tokens of the x and y
            coordinates respectively. Default is (100, 100).
        add_new_line (bool, optional): Whether to add a new line character
            after each exported block (and after captions and table rows).
            Default is True.

    Returns:
        str: The content of the document as a DocumentToken string, wrapped
        in `<document>` ... `</document>`.
    """
    new_line = "\n" if add_new_line else ""

    # Collect the fragments and join once at the end instead of growing a
    # string with += inside nested loops.
    parts = [DocumentToken.BEG_DOCUMENT.value]

    if self.main_text is not None:
        for orig_item in self.main_text[main_text_start:main_text_stop]:

            item = (
                self._resolve_ref(orig_item)
                if isinstance(orig_item, Ref)
                else orig_item
            )

            if item is None:
                continue

            prov = item.prov

            loc_str = ""  # no location block unless it can be computed below
            if (
                location_tagging
                and self.page_dimensions is not None
                and prov is not None
                and len(prov) > 0
            ):
                # Only the first provenance entry is used for the location.
                page = prov[0].page
                # assumes 1-based page numbers and page_dimensions ordered by
                # page -- TODO confirm against the producer of this data
                page_dim = self.page_dimensions[page - 1]

                page_w = float(page_dim.width)
                page_h = float(page_dim.height)

                # Normalize the bounding box to fractions of the page size.
                x0 = float(prov[0].bbox[0]) / page_w
                y0 = float(prov[0].bbox[1]) / page_h
                x1 = float(prov[0].bbox[2]) / page_w
                y1 = float(prov[0].bbox[3]) / page_h

                page_tok = ""
                if page_tagging:
                    page_tok = DocumentToken.get_page_token(page=page)

                # min/max make the token order independent of the order in
                # which the bbox stores its corners.
                x0_tok = DocumentToken.get_location_token(
                    val=min(x0, x1), rnorm=location_dimensions[0]
                )
                y0_tok = DocumentToken.get_location_token(
                    val=min(y0, y1), rnorm=location_dimensions[1]
                )
                x1_tok = DocumentToken.get_location_token(
                    val=max(x0, x1), rnorm=location_dimensions[0]
                )
                y1_tok = DocumentToken.get_location_token(
                    val=max(y0, y1), rnorm=location_dimensions[1]
                )

                loc_str = (
                    f"{DocumentToken.BEG_LOCATION.value}"
                    f"{page_tok}"
                    f"{x0_tok}{y0_tok}{x1_tok}{y1_tok}"
                    f"{DocumentToken.END_LOCATION.value}"
                )

            item_type = item.obj_type
            if isinstance(item, BaseText) and (item_type in main_text_labels):
                parts.append(
                    f"<{item_type}>{loc_str}{item.text}</{item_type}>{new_line}"
                )

            elif isinstance(item, Table) and (item_type in main_text_labels):
                parts.append(f"<{item_type}>{loc_str}")

                # The text of a table is exported as its caption.
                if item.text:
                    parts.append(
                        f"{DocumentToken.BEG_CAPTION.value}"
                        f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
                    )

                if item.data:
                    for i, row in enumerate(item.data):
                        parts.append(f"<row_{i}>")
                        for j, col in enumerate(row):
                            parts.append(f"<col_{j}>{col.text}</col_{j}>")
                        parts.append(f"</row_{i}>{new_line}")

                parts.append(f"</{item_type}>{new_line}")

            elif isinstance(item, Figure) and (item_type in main_text_labels):
                parts.append(f"<{item_type}>{loc_str}")

                # The text of a figure is exported as its caption.
                if item.text:
                    parts.append(
                        f"{DocumentToken.BEG_CAPTION.value}"
                        f"{item.text}{DocumentToken.END_CAPTION.value}{new_line}"
                    )

                parts.append(f"</{item_type}>{new_line}")

    parts.append(DocumentToken.END_DOCUMENT.value)

    return "".join(parts)

0 commit comments

Comments
 (0)