|
3 | 3 | import base64 |
4 | 4 | import copy |
5 | 5 | import hashlib |
6 | | -import itertools |
7 | 6 | import json |
8 | 7 | import logging |
9 | 8 | import mimetypes |
|
54 | 53 | GroupLabel, |
55 | 54 | PictureClassificationLabel, |
56 | 55 | ) |
57 | | -from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken |
58 | | -from docling_core.types.doc.utils import relative_path |
| 56 | +from docling_core.types.doc.tokens import DocumentToken, TableToken |
| 57 | +from docling_core.types.doc.utils import parse_otsl_table_content, relative_path |
59 | 58 |
|
60 | 59 | _logger = logging.getLogger(__name__) |
61 | 60 |
|
@@ -4688,181 +4687,6 @@ def extract_caption( |
4688 | 4687 | bbox = None |
4689 | 4688 | return caption_item, bbox |
4690 | 4689 |
|
4691 | | - def otsl_parse_texts(texts, tokens): |
4692 | | - split_word = TableToken.OTSL_NL.value |
4693 | | - # CLEAN tokens from extra tags, only structural OTSL allowed |
4694 | | - clean_tokens = [] |
4695 | | - for t in tokens: |
4696 | | - if t in [ |
4697 | | - TableToken.OTSL_ECEL.value, |
4698 | | - TableToken.OTSL_FCEL.value, |
4699 | | - TableToken.OTSL_LCEL.value, |
4700 | | - TableToken.OTSL_UCEL.value, |
4701 | | - TableToken.OTSL_XCEL.value, |
4702 | | - TableToken.OTSL_NL.value, |
4703 | | - TableToken.OTSL_CHED.value, |
4704 | | - TableToken.OTSL_RHED.value, |
4705 | | - TableToken.OTSL_SROW.value, |
4706 | | - ]: |
4707 | | - clean_tokens.append(t) |
4708 | | - tokens = clean_tokens |
4709 | | - split_row_tokens = [ |
4710 | | - list(y) |
4711 | | - for x, y in itertools.groupby(tokens, lambda z: z == split_word) |
4712 | | - if not x |
4713 | | - ] |
4714 | | - |
4715 | | - table_cells = [] |
4716 | | - r_idx = 0 |
4717 | | - c_idx = 0 |
4718 | | - |
4719 | | - def count_right(tokens, c_idx, r_idx, which_tokens): |
4720 | | - span = 0 |
4721 | | - c_idx_iter = c_idx |
4722 | | - while tokens[r_idx][c_idx_iter] in which_tokens: |
4723 | | - c_idx_iter += 1 |
4724 | | - span += 1 |
4725 | | - if c_idx_iter >= len(tokens[r_idx]): |
4726 | | - return span |
4727 | | - return span |
4728 | | - |
4729 | | - def count_down(tokens, c_idx, r_idx, which_tokens): |
4730 | | - span = 0 |
4731 | | - r_idx_iter = r_idx |
4732 | | - while tokens[r_idx_iter][c_idx] in which_tokens: |
4733 | | - r_idx_iter += 1 |
4734 | | - span += 1 |
4735 | | - if r_idx_iter >= len(tokens): |
4736 | | - return span |
4737 | | - return span |
4738 | | - |
4739 | | - for i, text in enumerate(texts): |
4740 | | - cell_text = "" |
4741 | | - if text in [ |
4742 | | - TableToken.OTSL_FCEL.value, |
4743 | | - TableToken.OTSL_ECEL.value, |
4744 | | - TableToken.OTSL_CHED.value, |
4745 | | - TableToken.OTSL_RHED.value, |
4746 | | - TableToken.OTSL_SROW.value, |
4747 | | - ]: |
4748 | | - row_span = 1 |
4749 | | - col_span = 1 |
4750 | | - right_offset = 1 |
4751 | | - if text != TableToken.OTSL_ECEL.value: |
4752 | | - cell_text = texts[i + 1] |
4753 | | - right_offset = 2 |
4754 | | - |
4755 | | - # Check next element(s) for lcel / ucel / xcel, |
4756 | | - # set properly row_span, col_span |
4757 | | - next_right_cell = "" |
4758 | | - if i + right_offset < len(texts): |
4759 | | - next_right_cell = texts[i + right_offset] |
4760 | | - |
4761 | | - next_bottom_cell = "" |
4762 | | - if r_idx + 1 < len(split_row_tokens): |
4763 | | - if c_idx < len(split_row_tokens[r_idx + 1]): |
4764 | | - next_bottom_cell = split_row_tokens[r_idx + 1][c_idx] |
4765 | | - |
4766 | | - if next_right_cell in [ |
4767 | | - TableToken.OTSL_LCEL.value, |
4768 | | - TableToken.OTSL_XCEL.value, |
4769 | | - ]: |
4770 | | - # we have a horizontal spanning cell or a 2D spanning cell |
4771 | | - col_span += count_right( |
4772 | | - split_row_tokens, |
4773 | | - c_idx + 1, |
4774 | | - r_idx, |
4775 | | - [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value], |
4776 | | - ) |
4777 | | - if next_bottom_cell in [ |
4778 | | - TableToken.OTSL_UCEL.value, |
4779 | | - TableToken.OTSL_XCEL.value, |
4780 | | - ]: |
4781 | | - # we have a vertical spanning cell or 2d spanning cell |
4782 | | - row_span += count_down( |
4783 | | - split_row_tokens, |
4784 | | - c_idx, |
4785 | | - r_idx + 1, |
4786 | | - [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value], |
4787 | | - ) |
4788 | | - |
4789 | | - table_cells.append( |
4790 | | - TableCell( |
4791 | | - text=cell_text.strip(), |
4792 | | - row_span=row_span, |
4793 | | - col_span=col_span, |
4794 | | - start_row_offset_idx=r_idx, |
4795 | | - end_row_offset_idx=r_idx + row_span, |
4796 | | - start_col_offset_idx=c_idx, |
4797 | | - end_col_offset_idx=c_idx + col_span, |
4798 | | - ) |
4799 | | - ) |
4800 | | - if text in [ |
4801 | | - TableToken.OTSL_FCEL.value, |
4802 | | - TableToken.OTSL_ECEL.value, |
4803 | | - TableToken.OTSL_CHED.value, |
4804 | | - TableToken.OTSL_RHED.value, |
4805 | | - TableToken.OTSL_SROW.value, |
4806 | | - TableToken.OTSL_LCEL.value, |
4807 | | - TableToken.OTSL_UCEL.value, |
4808 | | - TableToken.OTSL_XCEL.value, |
4809 | | - ]: |
4810 | | - c_idx += 1 |
4811 | | - if text == TableToken.OTSL_NL.value: |
4812 | | - r_idx += 1 |
4813 | | - c_idx = 0 |
4814 | | - return table_cells, split_row_tokens |
4815 | | - |
4816 | | - def otsl_extract_tokens_and_text(s: str): |
4817 | | - # Pattern to match anything enclosed by < > |
4818 | | - # (including the angle brackets themselves) |
4819 | | - pattern = r"(<[^>]+>)" |
4820 | | - # Find all tokens (e.g. "<otsl>", "<loc_140>", etc.) |
4821 | | - tokens = re.findall(pattern, s) |
4822 | | - # Remove any tokens that start with "<loc_", as well as the opening/closing OTSL tags |
4823 | | - tokens = [ |
4824 | | - token |
4825 | | - for token in tokens |
4826 | | - if not ( |
4827 | | - token.startswith(rf"<{_LOC_PREFIX}") |
4828 | | - or token |
4829 | | - in [ |
4830 | | - rf"<{DocumentToken.OTSL.value}>", |
4831 | | - rf"</{DocumentToken.OTSL.value}>", |
4832 | | - ] |
4833 | | - ) |
4834 | | - ] |
4835 | | - # Split the string by those tokens to get the in-between text |
4836 | | - text_parts = re.split(pattern, s) |
4837 | | - text_parts = [ |
4838 | | - token |
4839 | | - for token in text_parts |
4840 | | - if not ( |
4841 | | - token.startswith(rf"<{_LOC_PREFIX}") |
4842 | | - or token |
4843 | | - in [ |
4844 | | - rf"<{DocumentToken.OTSL.value}>", |
4845 | | - rf"</{DocumentToken.OTSL.value}>", |
4846 | | - ] |
4847 | | - ) |
4848 | | - ] |
4849 | | - # Remove any empty or purely whitespace strings from text_parts |
4850 | | - text_parts = [part for part in text_parts if part.strip()] |
4851 | | - |
4852 | | - return tokens, text_parts |
4853 | | - |
4854 | | - def parse_table_content(otsl_content: str) -> TableData: |
4855 | | - tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content) |
4856 | | - table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens) |
4857 | | - |
4858 | | - return TableData( |
4859 | | - num_rows=len(split_row_tokens), |
4860 | | - num_cols=( |
4861 | | - max(len(row) for row in split_row_tokens) if split_row_tokens else 0 |
4862 | | - ), |
4863 | | - table_cells=table_cells, |
4864 | | - ) |
4865 | | - |
4866 | 4690 | def extract_chart_type(text_chunk: str): |
4867 | 4691 | label = None |
4868 | 4692 | chart_labels = [ |
@@ -5094,7 +4918,7 @@ def _add_text( |
5094 | 4918 | doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.TEXT) |
5095 | 4919 |
|
5096 | 4920 | if tag_name == DocumentToken.OTSL.value: |
5097 | | - table_data = parse_table_content(full_chunk) |
| 4921 | + table_data = parse_otsl_table_content(full_chunk) |
5098 | 4922 | caption, caption_bbox = extract_caption(full_chunk) |
5099 | 4923 | if caption is not None and caption_bbox is not None: |
5100 | 4924 | caption.prov.append( |
@@ -5137,7 +4961,7 @@ def _add_text( |
5137 | 4961 | table_data = None |
5138 | 4962 | chart_type = None |
5139 | 4963 | if tag_name == DocumentToken.CHART.value: |
5140 | | - table_data = parse_table_content(full_chunk) |
| 4964 | + table_data = parse_otsl_table_content(full_chunk) |
5141 | 4965 | chart_type = extract_chart_type(full_chunk) |
5142 | 4966 | if image: |
5143 | 4967 | if bbox: |
|
0 commit comments