@@ -1084,6 +1084,63 @@ def _clean_unicode_artifacts(text: str) -> str:
10841084 return text .strip ()
10851085
10861086
1087+ _SMALL_WORDS = frozenset (
1088+ "a an and as at but by for in nor of on or so the to up vs via with" .split ()
1089+ )
1090+ def _smart_title_case (title : str ) -> str :
1091+ """Convert an ALL-CAPS title to title case, preserving acronyms.
1092+
1093+ Only applies when >50% of alpha characters are uppercase, indicating
1094+ the poster rendered its title in all-caps for visual emphasis.
1095+
1096+ Tokens that look like acronyms are kept uppercase: ≤4 alpha chars,
1097+ all-caps, and not a common English word. Hyphenated tokens are handled
1098+ part-by-part so "SARS-COV-2" → "SARS-COV-2".
1099+ """
1100+ if not title or not isinstance (title , str ):
1101+ return title
1102+
1103+ alpha = [c for c in title if c .isalpha ()]
1104+ if not alpha or sum (c .isupper () for c in alpha ) / len (alpha ) <= 0.5 :
1105+ return title # not all-caps, leave as-is
1106+
1107+ # Common short English words that are NOT acronyms even when ≤5 chars
1108+ _NOT_ACRONYMS = _SMALL_WORDS | frozenset (
1109+ "also back base been both case come data does done each even from "
1110+ "gene goes good have here high into just like long made make "
1111+ "many more most much must need new next once only over part past "
1112+ "rate role same self side some sub such take than that them then "
1113+ "this thus time type upon used uses very well were what when will "
1114+ "work year "
1115+ "about after based below early every first found great group "
1116+ "human known large level local model multi never newly novel "
1117+ "lower major means might occur often open other plant point "
1118+ "right scale shall since small space state still study their "
1119+ "these three total under until upper urban using value water "
1120+ "which while whole world would young" .split ()
1121+ )
1122+
1123+ def _case_part (part : str ) -> str :
1124+ alpha_only = re .sub (r"[^A-Za-z]" , "" , part )
1125+ if (
1126+ alpha_only .isupper ()
1127+ and 2 <= len (alpha_only ) <= 5
1128+ and alpha_only .lower () not in _NOT_ACRONYMS
1129+ ):
1130+ return part # likely acronym — keep as-is
1131+ return part .capitalize ()
1132+
1133+ words = title .split ()
1134+ result = []
1135+ for i , word in enumerate (words ):
1136+ new_word = "-" .join (_case_part (p ) for p in word .split ("-" ))
1137+ # lowercase small words unless first or last
1138+ if i != 0 and i != len (words ) - 1 and new_word .lower () in _SMALL_WORDS :
1139+ new_word = new_word .lower ()
1140+ result .append (new_word )
1141+ return " " .join (result )
1142+
1143+
10871144def _normalize_captions (captions_input , caption_type : str = "fig" ) -> list :
10881145 """Normalize captions to object format with id and caption fields.
10891146
@@ -1242,6 +1299,7 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
12421299 for title_obj in result ["titles" ]:
12431300 if isinstance (title_obj , dict ) and "title" in title_obj :
12441301 title_obj ["title" ] = _clean_unicode_artifacts (title_obj .get ("title" , "" ))
1302+ title_obj ["title" ] = _smart_title_case (title_obj ["title" ])
12451303
12461304 # Enrich with identifiers from raw text
12471305 if raw_text :
0 commit comments