@@ -618,13 +618,22 @@ def _extract_first_json_object(s: str) -> str:
618618
619619
620620def _repair_unescaped_quotes (s : str ) -> str :
621- """Fix quotes that appear after / which are not properly escaped."""
621+ """Fix unescaped double-quotes inside JSON string values.
622+
623+ Handles two patterns:
624+ 1. Quotes after units: e.g. ``16.7 pc/"`` → ``16.7 pc/\\ "``
625+ 2. Inline scare-quotes: e.g. ``M dwarf "twin" binaries``
626+ → ``M dwarf 'twin' binaries``
627+ """
628+ # Unit-slash pattern (original)
622629 s = re .sub (
623630 r'(\d+\s*(?:pc|km|m|cm|mm|Hz|kHz|MHz|GHz|s|ms|ns|arcsec|arcmin|deg))/"' ,
624631 r'\1/\\"' ,
625632 s ,
626633 )
627634 s = re .sub (r'\((\d+\.?\d*\s*\w+)/"\)' , r'(\1/\\")' , s )
635+ # Inline scare-quotes: word "quoted word(s)" word → single quotes
636+ s = re .sub (r'(?<=\w)\s*"\s*(\w+(?:\s+\w+)*)\s*"\s*(?=\w)' , r" '\1' " , s )
628637 return s
629638
630639
@@ -912,16 +921,48 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
912921# ============================
913922
914923
924+ def _normalize_raw_text_for_model (text : str ) -> str :
925+ """Normalize Unicode in raw OCR text before feeding to the model.
926+
927+ NFKD decomposition converts superscripts (¹²³⁺), subscripts (ₛ),
928+ and other compatibility characters to their ASCII equivalents,
929+ reducing token count and improving model JSON generation.
930+
931+ Smart/curly quotes are replaced with single quotes to prevent the
932+ model from outputting unescaped straight double-quotes inside JSON
933+ string values (e.g. OCR ``\u201c twin\u201d `` → model ``"twin"`` →
934+ broken JSON).
935+ """
936+ # NFKD: ¹→1, ²→2, ³→3, ⁺→+, ₛ→s, etc.
937+ text = unicodedata .normalize ("NFKD" , text )
938+ # Remove combining marks left over from decomposition
939+ text = "" .join (c for c in text if not unicodedata .combining (c ))
940+ # Replace smart/curly quotes with straight single quotes
941+ text = text .replace ("\u201c " , "'" ).replace ("\u201d " , "'" ) # " " → '
942+ text = text .replace ("\u2018 " , "'" ).replace ("\u2019 " , "'" ) # ' ' → '
943+ text = text .replace ("\u00ab " , "'" ).replace ("\u00bb " , "'" ) # « » → '
944+ # Fix mixed-pair leftovers: curly open (now ') + straight close (")
945+ # e.g. 'twin" → 'twin'
946+ text = re .sub (r"'(\w+(?:\s+\w+)*)\"" , r"'\1'" , text )
947+ # Fix remaining inline scare-quotes: word "quoted" word → word 'quoted' word
948+ text = re .sub (
949+ r'(?<=\w)\s"(\w+(?:\s+\w+)*)"\s(?=\w)' , r" '\1' " , text
950+ )
951+ return text
952+
953+
915954def extract_json_with_retry (raw_text : str , model , tokenizer ) -> dict :
916955 """
917956 Send raw poster text to the LLM and robustly parse the JSON response.
918957
919958 This function:
920- 1. Calls the model with a full prompt
921- 2. Retries with more tokens if truncation is detected
922- 3. Falls back to a shorter prompt if needed
923- 4. Runs repair passes to make the JSON parseable
959+ 1. Normalizes Unicode in the raw text
960+ 2. Calls the model with a full prompt
961+ 3. Retries with more tokens if truncation is detected
962+ 4. Falls back to a shorter prompt if needed
963+ 5. Runs repair passes to make the JSON parseable
924964 """
965+ raw_text = _normalize_raw_text_for_model (raw_text )
925966 prompt = EXTRACTION_PROMPT .format (raw_text = raw_text )
926967
927968 log ("Starting primary JSON extraction with full prompt" )
0 commit comments