Skip to content

Commit 8405d1d

Browse files
authored
refactor: Expose OTSL methods as utilities (#370)
Refactoring OTSL methods Signed-off-by: Christoph Auer <[email protected]>
1 parent bbe6243 commit 8405d1d

File tree

2 files changed

+201
-181
lines changed

2 files changed

+201
-181
lines changed

docling_core/types/doc/document.py

Lines changed: 4 additions & 180 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
import base64
44
import copy
55
import hashlib
6-
import itertools
76
import json
87
import logging
98
import mimetypes
@@ -54,8 +53,8 @@
5453
GroupLabel,
5554
PictureClassificationLabel,
5655
)
57-
from docling_core.types.doc.tokens import _LOC_PREFIX, DocumentToken, TableToken
58-
from docling_core.types.doc.utils import relative_path
56+
from docling_core.types.doc.tokens import DocumentToken, TableToken
57+
from docling_core.types.doc.utils import parse_otsl_table_content, relative_path
5958

6059
_logger = logging.getLogger(__name__)
6160

@@ -4688,181 +4687,6 @@ def extract_caption(
46884687
bbox = None
46894688
return caption_item, bbox
46904689

4691-
def otsl_parse_texts(texts, tokens):
    """Build table cells from a mixed OTSL token/text stream.

    Args:
        texts: Sequence in document order that interleaves OTSL tokens with
            the text fragments found between them.
        tokens: Sequence of tag strings; anything that is not a structural
            OTSL token is dropped before the row grid is built.

    Returns:
        A ``(table_cells, split_row_tokens)`` tuple. ``table_cells`` holds one
        ``TableCell`` per cell-opening token found in ``texts``;
        ``split_row_tokens`` is the structural token grid, one list per row,
        obtained by splitting the cleaned ``tokens`` on the newline token.
    """
    nl = TableToken.OTSL_NL.value
    ecel = TableToken.OTSL_ECEL.value
    lcel = TableToken.OTSL_LCEL.value
    ucel = TableToken.OTSL_UCEL.value
    xcel = TableToken.OTSL_XCEL.value

    # Tokens that start a new cell (and therefore produce a TableCell).
    cell_openers = frozenset(
        (
            TableToken.OTSL_FCEL.value,
            ecel,
            TableToken.OTSL_CHED.value,
            TableToken.OTSL_RHED.value,
            TableToken.OTSL_SROW.value,
        )
    )
    # Tokens that occupy a grid position but belong to a neighbouring cell.
    continuations = frozenset((lcel, ucel, xcel))
    horizontal_span = frozenset((lcel, xcel))
    vertical_span = frozenset((ucel, xcel))
    structural = cell_openers | continuations | {nl}

    # CLEAN tokens from extra tags, only structural OTSL allowed
    tokens = [t for t in tokens if t in structural]
    # groupby yields alternating runs of newline / non-newline tokens; keeping
    # only the non-newline runs gives one list per table row.
    split_row_tokens = [
        list(run)
        for is_newline, run in itertools.groupby(tokens, lambda t: t == nl)
        if not is_newline
    ]

    def count_right(tokens, c_idx, r_idx, which_tokens):
        """Count consecutive `which_tokens` in row r_idx starting at c_idx."""
        if r_idx >= len(tokens):
            # Row indices derived from `texts` can run past the grid when
            # consecutive newline tokens were collapsed by groupby.
            return 0
        row = tokens[r_idx]
        span = 0
        while c_idx + span < len(row) and row[c_idx + span] in which_tokens:
            span += 1
        return span

    def count_down(tokens, c_idx, r_idx, which_tokens):
        """Count consecutive `which_tokens` in column c_idx from row r_idx."""
        span = 0
        while (
            r_idx + span < len(tokens)
            # Rows may be ragged: a shorter row ends the vertical run.
            and c_idx < len(tokens[r_idx + span])
            and tokens[r_idx + span][c_idx] in which_tokens
        ):
            span += 1
        return span

    table_cells = []
    r_idx = 0
    c_idx = 0

    for i, text in enumerate(texts):
        if text in cell_openers:
            cell_text = ""
            right_offset = 1
            # The element after a non-empty cell token is that cell's text,
            # unless the stream ends here or the next element is itself a
            # structural token (i.e. the cell carried no content).
            if (
                text != ecel
                and i + 1 < len(texts)
                and texts[i + 1] not in structural
            ):
                cell_text = texts[i + 1]
                right_offset = 2

            # Check next element(s) for lcel / ucel / xcel,
            # set properly row_span, col_span
            next_right_cell = ""
            if i + right_offset < len(texts):
                next_right_cell = texts[i + right_offset]

            next_bottom_cell = ""
            if r_idx + 1 < len(split_row_tokens) and c_idx < len(
                split_row_tokens[r_idx + 1]
            ):
                next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]

            col_span = 1
            if next_right_cell in horizontal_span:
                # we have a horizontal spanning cell or 2d spanning cell
                col_span += count_right(
                    split_row_tokens, c_idx + 1, r_idx, horizontal_span
                )

            row_span = 1
            if next_bottom_cell in vertical_span:
                # we have a vertical spanning cell or 2d spanning cell
                row_span += count_down(
                    split_row_tokens, c_idx, r_idx + 1, vertical_span
                )

            table_cells.append(
                TableCell(
                    text=cell_text.strip(),
                    row_span=row_span,
                    col_span=col_span,
                    start_row_offset_idx=r_idx,
                    end_row_offset_idx=r_idx + row_span,
                    start_col_offset_idx=c_idx,
                    end_col_offset_idx=c_idx + col_span,
                )
            )

        # Every cell-like token advances the column; a newline starts a new row.
        if text in cell_openers or text in continuations:
            c_idx += 1
        elif text == nl:
            r_idx += 1
            c_idx = 0

    return table_cells, split_row_tokens
4815-
4816-
def otsl_extract_tokens_and_text(s: str):
    """Split an OTSL string into its tag tokens and its token/text stream.

    Args:
        s: Raw string containing ``<...>`` tags with text between them.

    Returns:
        A ``(tokens, text_parts)`` tuple. ``tokens`` lists every ``<...>`` tag
        in order, excluding location tags and the opening/closing OTSL tags.
        ``text_parts`` is the string split on those same tags (the tags are
        kept in the result), with the same exclusions applied and with empty
        or whitespace-only pieces removed.
    """
    # Anything enclosed by < > (including the angle brackets themselves);
    # the capture group makes re.split keep the tags in its output.
    pattern = r"(<[^>]+>)"

    loc_tag_start = rf"<{_LOC_PREFIX}"
    otsl_wrappers = [
        rf"<{DocumentToken.OTSL.value}>",
        rf"</{DocumentToken.OTSL.value}>",
    ]

    def _is_kept(piece):
        # Drop location tags and the enclosing OTSL open/close tags.
        if piece.startswith(loc_tag_start):
            return False
        return piece not in otsl_wrappers

    # All tags (e.g. "<otsl>", "<loc_140>", etc.), minus the excluded ones.
    tokens = [tag for tag in re.findall(pattern, s) if _is_kept(tag)]

    # Tags plus the in-between text, minus excluded tags and blank pieces.
    text_parts = [
        piece
        for piece in re.split(pattern, s)
        if _is_kept(piece) and piece.strip()
    ]

    return tokens, text_parts
4853-
4854-
def parse_table_content(otsl_content: str) -> TableData:
    """Convert an OTSL-encoded table string into a ``TableData`` object.

    Args:
        otsl_content: String holding the OTSL tags and cell texts.

    Returns:
        ``TableData`` whose ``num_rows`` is the number of token rows,
        ``num_cols`` is the length of the longest token row (0 when there are
        no rows), and ``table_cells`` are the parsed cells.
    """
    tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
    table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)

    row_count = len(split_row_tokens)
    # default=0 covers the case where no rows were found.
    col_count = max(map(len, split_row_tokens), default=0)

    return TableData(
        num_rows=row_count,
        num_cols=col_count,
        table_cells=table_cells,
    )
4865-
48664690
def extract_chart_type(text_chunk: str):
48674691
label = None
48684692
chart_labels = [
@@ -5094,7 +4918,7 @@ def _add_text(
50944918
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.TEXT)
50954919

50964920
if tag_name == DocumentToken.OTSL.value:
5097-
table_data = parse_table_content(full_chunk)
4921+
table_data = parse_otsl_table_content(full_chunk)
50984922
caption, caption_bbox = extract_caption(full_chunk)
50994923
if caption is not None and caption_bbox is not None:
51004924
caption.prov.append(
@@ -5137,7 +4961,7 @@ def _add_text(
51374961
table_data = None
51384962
chart_type = None
51394963
if tag_name == DocumentToken.CHART.value:
5140-
table_data = parse_table_content(full_chunk)
4964+
table_data = parse_otsl_table_content(full_chunk)
51414965
chart_type = extract_chart_type(full_chunk)
51424966
if image:
51434967
if bbox:

0 commit comments

Comments
 (0)