@@ -3051,6 +3051,25 @@ def extract_inner_text(text_chunk: str) -> str:
30513051 """Strip all <...> tags inside the chunk to get the raw text content."""
30523052 return re .sub (r"<.*?>" , "" , text_chunk , flags = re .DOTALL ).strip ()
30533053
def extract_caption(
    text_chunk: str,
) -> tuple[Optional[TextItem], Optional[BoundingBox]]:
    """Extract the first ``<caption>...</caption>`` element from the chunk.

    When a caption element is found, its inner content is passed to
    ``extract_bounding_box`` and ``extract_inner_text``, and the resulting
    text is registered through ``self.add_text`` with the ``CAPTION`` label
    and no parent.

    NOTE(review): ``self`` is not a parameter of this function; it is
    presumably captured from an enclosing method's scope -- confirm.

    Args:
        text_chunk: Raw markup chunk that may contain a caption element.

    Returns:
        A ``(caption_item, bbox)`` pair. Both entries are ``None`` when the
        chunk has no caption element; otherwise ``caption_item`` is whatever
        ``self.add_text`` returned and ``bbox`` is whatever
        ``extract_bounding_box`` returned for the caption content.
    """
    # DOTALL lets "." match newlines so a caption whose content spans several
    # lines is still found; extract_inner_text strips tags with the same flag.
    match = re.search(
        r"<caption>(.*?)</caption>", text_chunk, flags=re.DOTALL
    )
    if match is None:
        return None, None

    caption_content = match.group(1)
    # Look for the bounding box inside the caption content only, not the
    # whole chunk.
    bbox = extract_bounding_box(caption_content)
    caption_text = extract_inner_text(caption_content)
    caption_item = self.add_text(
        label=DocItemLabel.CAPTION,
        text=caption_text,
        parent=None,
    )
    return caption_item, bbox
3072+
30543073 def otsl_parse_texts (texts , tokens ):
30553074 split_word = TableToken .OTSL_NL .value
30563075 split_row_tokens = [
@@ -3261,16 +3280,24 @@ def parse_table_content(otsl_content: str) -> TableData:
32613280 if tag_name == DocumentToken .OTSL .value :
32623281 table_data = parse_table_content (full_chunk )
32633282 bbox = extract_bounding_box (full_chunk ) if image else None
3264-
3283+ caption , caption_bbox = extract_caption (full_chunk )
3284+ if caption is not None and caption_bbox is not None :
3285+ caption .prov .append (
3286+ ProvenanceItem (
3287+ bbox = caption_bbox .resize_by_scale (pg_width , pg_height ),
3288+ charspan = (0 , 0 ),
3289+ page_no = page_no ,
3290+ )
3291+ )
32653292 if bbox :
32663293 prov = ProvenanceItem (
32673294 bbox = bbox .resize_by_scale (pg_width , pg_height ),
32683295 charspan = (0 , 0 ),
32693296 page_no = page_no ,
32703297 )
3271- self .add_table (data = table_data , prov = prov )
3298+ self .add_table (data = table_data , prov = prov , caption = caption )
32723299 else :
3273- self .add_table (data = table_data )
3300+ self .add_table (data = table_data , caption = caption )
32743301
32753302 elif tag_name == DocItemLabel .PICTURE :
32763303 text_caption_content = extract_inner_text (full_chunk )
0 commit comments