@@ -783,14 +783,17 @@ def _clean_unicode_artifacts(text: str) -> str:
783783 return text .strip ()
784784
785785
786- def _normalize_captions (captions_input ) -> list :
787- """Normalize captions to object format with id and caption fields."""
786+ def _normalize_captions (captions_input , caption_type : str = "fig" ) -> list :
787+ """Normalize captions to object format with id and caption fields.
788+
789+ Auto-generates missing IDs as {caption_type}1, {caption_type}2, etc.
790+ """
788791 normalized = []
789792 seen_texts = set ()
790793
791794 # Handle various input formats
792795 if isinstance (captions_input , str ):
793- return [{"caption" : captions_input }] if captions_input .strip () else []
796+ return [{"id" : f" { caption_type } 1" , " caption" : captions_input }] if captions_input .strip () else []
794797
795798 if not isinstance (captions_input , list ):
796799 return []
@@ -827,10 +830,15 @@ def _normalize_captions(captions_input) -> list:
827830 seen_texts .add (key )
828831 normalized .append (caption_obj )
829832
833+ # Auto-generate missing IDs
834+ for i , cap in enumerate (normalized , start = 1 ):
835+ if "id" not in cap :
836+ cap ["id" ] = f"{ caption_type } { i } "
837+
830838 return normalized
831839
832840
833- def _postprocess_json (data : dict ) -> dict :
841+ def _postprocess_json (data : dict , raw_text : str = "" ) -> dict :
834842 """Comprehensive post-processing for extracted JSON."""
835843 result = data .copy ()
836844
@@ -847,12 +855,12 @@ def _postprocess_json(data: dict) -> dict:
847855 if "domain" in result and "researchField" not in result :
848856 result ["researchField" ] = result .pop ("domain" )
849857
850- # Ensure caption fields exist and normalize to string arrays
851- for key in ["imageCaptions" , "tableCaptions" ]:
858+ # Ensure caption fields exist and normalize with auto-generated IDs
859+ for key , ctype in [( "imageCaptions" , "fig" ), ( " tableCaptions", "table" ) ]:
852860 if key not in result :
853861 result [key ] = []
854862 elif isinstance (result [key ], (dict , list )):
855- result [key ] = _normalize_captions (result [key ])
863+ result [key ] = _normalize_captions (result [key ], caption_type = ctype )
856864
857865 # Clean Unicode from string fields
858866 for key in ["researchField" ]:
@@ -890,6 +898,12 @@ def _postprocess_json(data: dict) -> dict:
890898 if isinstance (title_obj , dict ) and "title" in title_obj :
891899 title_obj ["title" ] = _clean_unicode_artifacts (title_obj .get ("title" , "" ))
892900
901+ # Enrich with identifiers from raw text
902+ if raw_text :
903+ from .identifiers import enrich_json_with_identifiers
904+
905+ result = enrich_json_with_identifiers (result , raw_text )
906+
893907 return result
894908
895909
@@ -931,7 +945,7 @@ def extract_json_with_retry(raw_text: str, model, tokenizer) -> dict:
931945 response = _generate (model , tokenizer , fallback_prompt , MAX_RETRY_TOKENS )
932946 result = _robust_json_parse (response )
933947
934- result = _postprocess_json (result )
948+ result = _postprocess_json (result , raw_text = raw_text )
935949 return result
936950
937951
0 commit comments