Skip to content

Commit 5cee486

Browse files
authored
fix: Add caption to the table in load_from_doctags (#197)
add caption to the table in load_from_doctags Signed-off-by: Saidgurbuz <[email protected]>
1 parent 65a82a1 commit 5cee486

File tree

1 file changed

+30
-3
lines changed

1 file changed

+30
-3
lines changed

docling_core/types/doc/document.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3051,6 +3051,25 @@ def extract_inner_text(text_chunk: str) -> str:
30513051
"""Strip all <...> tags inside the chunk to get the raw text content."""
30523052
return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
30533053

3054+
def extract_caption(
    text_chunk: str,
) -> tuple[Optional[TextItem], Optional[BoundingBox]]:
    """Extract the first caption found in the chunk and add it as a text item.

    The chunk is searched for the first ``<caption>...</caption>`` pair.
    When one is found, its content is passed to ``extract_bounding_box``
    and to ``extract_inner_text`` (which strips the ``<...>`` tags), and
    the resulting text is added through ``self.add_text`` with the
    ``CAPTION`` label and no parent. ``self`` is not a parameter here; it
    is taken from the enclosing scope.

    Args:
        text_chunk: Raw chunk text that may contain a caption element.

    Returns:
        A ``(caption_item, bbox)`` tuple. Both entries are ``None`` when
        the chunk has no caption. ``bbox`` is whatever
        ``extract_bounding_box`` returns for the caption content; the
        caller checks it against ``None`` before using it.
    """
    # re.DOTALL lets "." match newlines, so a caption whose content spans
    # several lines is still found (same flag as extract_inner_text).
    caption = re.search(
        r"<caption>(.*?)</caption>", text_chunk, flags=re.DOTALL
    )
    if caption is None:
        return None, None

    caption_content = caption.group(1)
    # Read the bounding box from the raw content before the tags are stripped.
    bbox = extract_bounding_box(caption_content)
    caption_text = extract_inner_text(caption_content)
    caption_item = self.add_text(
        label=DocItemLabel.CAPTION,
        text=caption_text,
        parent=None,
    )
    return caption_item, bbox
3072+
30543073
def otsl_parse_texts(texts, tokens):
30553074
split_word = TableToken.OTSL_NL.value
30563075
split_row_tokens = [
@@ -3261,16 +3280,24 @@ def parse_table_content(otsl_content: str) -> TableData:
32613280
if tag_name == DocumentToken.OTSL.value:
32623281
table_data = parse_table_content(full_chunk)
32633282
bbox = extract_bounding_box(full_chunk) if image else None
3264-
3283+
caption, caption_bbox = extract_caption(full_chunk)
3284+
if caption is not None and caption_bbox is not None:
3285+
caption.prov.append(
3286+
ProvenanceItem(
3287+
bbox=caption_bbox.resize_by_scale(pg_width, pg_height),
3288+
charspan=(0, 0),
3289+
page_no=page_no,
3290+
)
3291+
)
32653292
if bbox:
32663293
prov = ProvenanceItem(
32673294
bbox=bbox.resize_by_scale(pg_width, pg_height),
32683295
charspan=(0, 0),
32693296
page_no=page_no,
32703297
)
3271-
self.add_table(data=table_data, prov=prov)
3298+
self.add_table(data=table_data, prov=prov, caption=caption)
32723299
else:
3273-
self.add_table(data=table_data)
3300+
self.add_table(data=table_data, caption=caption)
32743301

32753302
elif tag_name == DocItemLabel.PICTURE:
32763303
text_caption_content = extract_inner_text(full_chunk)

0 commit comments

Comments
 (0)