fix: Smart title-case for ALL-CAPS poster titles (v0.1.9)

jimnoneill · jimnoneill · commit 97f3c4d8e3c3 · 2026-03-11T14:22:01.000-07:00
Posters often render titles in all-caps for visual emphasis. The model
extracts these verbatim. Post-processing now detects >50% uppercase
titles and converts to title case while preserving acronyms (DNA, SARS,
COVID-19, FAIR, HIV, etc.) via a length + exclusion-list heuristic.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.9] - 2026-03-11
+
+### Fixed
+
+- Post-process: convert ALL-CAPS titles to title case, preserving acronyms (DNA, SARS-COV-2, COVID-19, FAIR, etc.)
+
 ## [0.1.8] - 2026-03-11
 
 ### Fixed
diff --git a/poster2json/extract.py b/poster2json/extract.py
@@ -1084,6 +1084,63 @@ def _clean_unicode_artifacts(text: str) -> str:
     return text.strip()
 
 
+_SMALL_WORDS = frozenset(  # words _smart_title_case lowercases unless first or last
+    "a an and as at but by for in nor of on or so the to up vs via with".split()
+)
+def _smart_title_case(title: str) -> str:
+    """Convert a mostly-uppercase title to title case, preserving acronyms.
+
+    Returns ``title`` unchanged when it is falsy, not a ``str``, or when at
+    most 50% of its alphabetic characters are uppercase.  Otherwise each
+    hyphen-separated part whose ASCII letters are all uppercase, number 2-5,
+    and are not a listed common word is kept as-is ("SARS-COV-2" stays
+    "SARS-COV-2"); other parts get ``str.capitalize()``.  Words found in
+    ``_SMALL_WORDS`` are lowercased unless they are the first or last word.
+    """
+    if not title or not isinstance(title, str):
+        return title
+
+    alpha = [c for c in title if c.isalpha()]
+    if not alpha or sum(c.isupper() for c in alpha) / len(alpha) <= 0.5:
+        return title  # at most half of the letters are uppercase: leave as-is
+
+    # Short common English words, never acronyms. NOTE(review): set is rebuilt on every call
+    _NOT_ACRONYMS = _SMALL_WORDS | frozenset(
+        "also back base been both case come data does done each even from "
+        "gene goes good have here high into just like long made make "
+        "many more most much must need new next once only over part past "
+        "rate role same self side some sub such take than that them then "
+        "this thus time type upon used uses very well were what when will "
+        "work year "
+        "about after based below early every first found great group "
+        "human known large level local model multi never newly novel "
+        "lower major means might occur often open other plant point "
+        "right scale shall since small space state still study their "
+        "these three total under until upper urban using value water "
+        "which while whole world would young".split()
+    )
+
+    def _case_part(part: str) -> str:
+        alpha_only = re.sub(r"[^A-Za-z]", "", part)  # keep ASCII letters only
+        if (
+            alpha_only.isupper()
+            and 2 <= len(alpha_only) <= 5
+            and alpha_only.lower() not in _NOT_ACRONYMS
+        ):
+            return part  # likely acronym — keep as-is (digits/punctuation included)
+        return part.capitalize()  # NOTE(review): "(WORD" becomes "(word" — only char 0 is uppercased
+
+    words = title.split()
+    result = []
+    for i, word in enumerate(words):
+        new_word = "-".join(_case_part(p) for p in word.split("-"))
+        # lowercase small words unless first or last
+        if i != 0 and i != len(words) - 1 and new_word.lower() in _SMALL_WORDS:
+            new_word = new_word.lower()
+        result.append(new_word)
+    return " ".join(result)
+
+
 def _normalize_captions(captions_input, caption_type: str = "fig") -> list:
     """Normalize captions to object format with id and caption fields.
 
@@ -1242,6 +1299,7 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
         for title_obj in result["titles"]:
             if isinstance(title_obj, dict) and "title" in title_obj:
                 title_obj["title"] = _clean_unicode_artifacts(title_obj.get("title", ""))
+                title_obj["title"] = _smart_title_case(title_obj["title"])
 
     # Enrich with identifiers from raw text
     if raw_text:
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 
 name = "poster2json"
-version = "0.1.8"
+version = "0.1.9"
 description = "Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models"
 
 packages = [{ include = "poster2json" }]