|
16 | 16 | StrictStr, |
17 | 17 | model_validator, |
18 | 18 | ) |
| 19 | +from tabulate import tabulate |
19 | 20 |
|
20 | 21 | from docling_core.search.mapping import es_field |
21 | 22 | from docling_core.types.base import ( |
@@ -391,3 +392,111 @@ def from_dict(cls, data): |
391 | 392 | item["$ref"] = ref |
392 | 393 |
|
393 | 394 | return data |
| 395 | + |
def _resolve_ref(self, item: Ref) -> Optional[Table]:
    """Return the table referenced by ``item``, or None if it cannot be resolved.

    Only references whose ``obj_type`` is ``"table"`` are resolved. The reference
    string is split on ``"/"`` and its third segment is used as an index into
    ``self.tables``.

    Args:
        item: The reference to resolve.

    Returns:
        The referenced table, or None when ``item`` is not a table reference,
        the document has no tables, or the reference string is malformed or
        points outside ``self.tables``.
    """
    if item.obj_type != "table" or not self.tables:
        return None

    # NOTE: currently only resolves table refs & makes assumptions on ref parts
    # (presumably refs look like "#/tables/<index>" -- confirm against producer).
    parts = item.ref.split("/")
    try:
        index = int(parts[2])
    except (IndexError, ValueError):
        # Too few segments or a non-numeric index: treat as unresolvable.
        return None

    # Reject out-of-range and negative indices instead of raising or silently
    # wrapping around to the end of the list.
    if not 0 <= index < len(self.tables):
        return None

    return self.tables[index]
| 406 | + |
def export_to_markdown(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
) -> str:
    r"""Serialize to Markdown.

    Operates on a slice of the document's main_text as defined through arguments
    main_text_start and main_text_stop; defaulting to the whole main_text.

    Text items of type "title", "subtitle-level-1", "paragraph" and "caption"
    are emitted: the first "title" becomes a level-1 heading, any further
    "title" or "subtitle-level-1" becomes a level-2 heading, and the remaining
    types are emitted verbatim. A text item whose text equals that of the
    previously emitted text item is skipped. Tables with at least two rows and
    a non-empty first row are rendered as GitHub-style Markdown tables, using
    the first row as the header. All other items are ignored.

    Args:
        delim (str, optional): Delimiter to use when concatenating the various
            Markdown parts. Defaults to "\n\n".
        main_text_start (int, optional): Main-text slicing start index (inclusive).
            Defaults to 0.
        main_text_stop (Optional[int], optional): Main-text slicing stop index
            (exclusive). Defaults to None.

    Returns:
        str: The exported Markdown representation; an empty string when the
        document has no main_text or nothing in the slice is exportable.
    """
    if self.main_text is None:
        return ""

    heading_types = {"title", "subtitle-level-1"}
    text_types = heading_types | {"paragraph", "caption"}

    has_title = False
    prev_text = ""
    md_texts: list[str] = []

    for orig_item in self.main_text[main_text_start:main_text_stop]:
        # References are resolved first; unresolvable ones are skipped.
        item = (
            self._resolve_ref(orig_item)
            if isinstance(orig_item, Ref)
            else orig_item
        )
        if item is None:
            continue

        item_type = item.obj_type
        markdown_text = ""

        if isinstance(item, BaseText) and item_type in text_types:
            text = item.text

            # ignore repeated text
            if text == prev_text:
                continue
            prev_text = text

            if item_type == "title" and not has_title:
                # first title match
                markdown_text = f"# {text}"
                has_title = True
            elif item_type in heading_types:
                # secondary titles (including any title after the first)
                markdown_text = f"## {text}"
            else:
                # normal text
                markdown_text = text

        elif isinstance(item, Table) and item.data:
            table = [[col.text for col in row] for row in item.data]

            # A header row plus at least one body row is required.
            if len(table) > 1 and table[0]:
                try:
                    markdown_text = tabulate(
                        table[1:], headers=table[0], tablefmt="github"
                    )
                except ValueError:
                    # Retry with tabulate's number parsing disabled; presumably
                    # the failure comes from cells that look numeric but are not.
                    markdown_text = tabulate(
                        table[1:],
                        headers=table[0],
                        tablefmt="github",
                        disable_numparse=True,
                    )

        if markdown_text:
            md_texts.append(markdown_text)

    return delim.join(md_texts)
0 commit comments