Skip to content

Commit d0ffc85

Browse files
authored
feat: add document Markdown export (#4)
Signed-off-by: Panos Vagenas <[email protected]>
1 parent 09d3576 commit d0ffc85

File tree

6 files changed

+6378
-2
lines changed

6 files changed

+6378
-2
lines changed

docling_core/types/doc/document.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
StrictStr,
1717
model_validator,
1818
)
19+
from tabulate import tabulate
1920

2021
from docling_core.search.mapping import es_field
2122
from docling_core.types.base import (
@@ -391,3 +392,111 @@ def from_dict(cls, data):
391392
item["$ref"] = ref
392393

393394
return data
395+
396+
def _resolve_ref(self, item: Ref) -> Optional[Table]:
    """Return the table a reference points to, or None if it cannot be resolved.

    NOTE: currently only table references are resolved. The reference string is
    split on "/" and its third part is taken as the position of the table
    within ``self.tables`` (presumably refs look like "#/tables/<index>";
    verify against the producer of these refs).

    Args:
        item (Ref): The reference to resolve.

    Returns:
        Optional[Table]: The referenced table; None if the reference is not a
            table reference, the document has no tables, or the reference does
            not designate a valid position in ``self.tables``.
    """
    if item.obj_type != "table" or not self.tables:
        return None

    parts = item.ref.split("/")
    try:
        index = int(parts[2])
    except (IndexError, ValueError):
        # Too few parts or a non-numeric index: treat as unresolvable rather
        # than aborting the caller.
        return None

    # Reject negative values explicitly: Python would otherwise wrap them
    # around and silently hand back a table counted from the end.
    if not 0 <= index < len(self.tables):
        return None

    return self.tables[index]
406+
407+
def export_to_markdown(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
) -> str:
    r"""Serialize to Markdown.

    Operates on a slice of the document's main_text as defined through arguments
    main_text_start and main_text_stop; defaulting to the whole main_text.

    Text items of type "title", "subtitle-level-1", "paragraph" and "caption"
    are exported, as are tables (inline or referenced) that have at least two
    rows; the first table row is used as the header. All other items, as well
    as references that cannot be resolved to a table, are skipped.

    Args:
        delim (str, optional): Delimiter to use when concatenating the various
            Markdown parts. Defaults to "\n\n".
        main_text_start (int, optional): Main-text slicing start index (inclusive).
            Defaults to 0.
        main_text_stop (Optional[int], optional): Main-text slicing stop index
            (exclusive). Defaults to None.

    Returns:
        str: The exported Markdown representation; an empty string if the
            document has no main_text or nothing in the slice is exportable.
    """
    if self.main_text is None:
        return ""

    has_title = False
    prev_text = ""
    md_texts: list[str] = []

    for orig_item in self.main_text[main_text_start:main_text_stop]:
        markdown_text = ""

        item = (
            self._resolve_ref(orig_item)
            if isinstance(orig_item, Ref)
            else orig_item
        )
        if item is None:
            continue

        item_type = item.obj_type
        if isinstance(item, BaseText) and item_type in {
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
        }:
            text = item.text

            # ignore repeated text (compared against the previously seen
            # exportable text item; tables in between do not reset it)
            if text == prev_text:
                continue
            prev_text = text

            if item_type == "title" and not has_title:
                # first title match becomes the document heading
                markdown_text = f"# {text}"
                has_title = True
            elif item_type in {"title", "subtitle-level-1"}:
                # any further title, and every subtitle, is a section heading
                markdown_text = f"## {text}"
            else:
                # normal text
                markdown_text = text

        elif isinstance(item, Table) and item.data:
            table = [[col.text for col in row] for row in item.data]

            # A header row plus at least one body row is required; tables
            # with a single row are not exported.
            if len(table) > 1 and len(table[0]) > 0:
                try:
                    md_table = tabulate(
                        table[1:], headers=table[0], tablefmt="github"
                    )
                except ValueError:
                    # Retry without tabulate's number parsing, treating
                    # all cells as plain strings.
                    md_table = tabulate(
                        table[1:],
                        headers=table[0],
                        tablefmt="github",
                        disable_numparse=True,
                    )

                markdown_text = md_table

        if markdown_text:
            md_texts.append(markdown_text)

    return delim.join(md_texts)

poetry.lock

Lines changed: 15 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ jsonref = "^1.1.0"
5353
json-schema-for-humans = "^1.0.0"
5454
poetry = "^1.8.3"
5555
pyproject-toml = "^0.0.10"
56+
tabulate = "^0.9.0"
5657

5758
[tool.poetry.group.dev.dependencies]
5859
black = "^24.4.2"
@@ -111,7 +112,7 @@ python_version = "3.9"
111112
plugins = ["pydantic.mypy"]
112113

113114
[[tool.mypy.overrides]]
114-
module = ["jsondiff.*", "jsonref.*", "jsonschema.*", "json_schema_for_humans.*"]
115+
module = ["jsondiff.*", "jsonref.*", "jsonschema.*", "json_schema_for_humans.*", "tabulate.*"]
115116
ignore_missing_imports = true
116117

117118
[tool.semantic_release]

0 commit comments

Comments
 (0)