Skip to content

Commit 97f3c4d

Browse files
committed
fix: Smart title-case for ALL-CAPS poster titles (v0.1.9)
Posters often render titles in all-caps for visual emphasis. The model extracts these verbatim. Post-processing now detects >50% uppercase titles and converts to title case while preserving acronyms (DNA, SARS, COVID-19, FAIR, HIV, etc.) via a length + exclusion-list heuristic.
1 parent 4da6712 commit 97f3c4d

File tree

3 files changed

+65
-1
lines changed

3 files changed

+65
-1
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
66
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
77

8+
## [0.1.9] - 2026-03-11
9+
10+
### Fixed
11+
12+
- Post-process: convert ALL-CAPS titles to title case, preserving acronyms (DNA, SARS-COV-2, COVID-19, FAIR, etc.)
13+
814
## [0.1.8] - 2026-03-11
915

1016
### Fixed

poster2json/extract.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1084,6 +1084,63 @@ def _clean_unicode_artifacts(text: str) -> str:
10841084
return text.strip()
10851085

10861086

1087+
_SMALL_WORDS = frozenset(
1088+
"a an and as at but by for in nor of on or so the to up vs via with".split()
1089+
)
1090+
def _smart_title_case(title: str) -> str:
1091+
"""Convert an ALL-CAPS title to title case, preserving acronyms.
1092+
1093+
Only applies when >50% of alpha characters are uppercase, indicating
1094+
the poster rendered its title in all-caps for visual emphasis.
1095+
1096+
Tokens that look like acronyms are kept uppercase: ≤4 alpha chars,
1097+
all-caps, and not a common English word. Hyphenated tokens are handled
1098+
part-by-part so "SARS-COV-2" → "SARS-COV-2".
1099+
"""
1100+
if not title or not isinstance(title, str):
1101+
return title
1102+
1103+
alpha = [c for c in title if c.isalpha()]
1104+
if not alpha or sum(c.isupper() for c in alpha) / len(alpha) <= 0.5:
1105+
return title # not all-caps, leave as-is
1106+
1107+
# Common short English words that are NOT acronyms even when ≤5 chars
1108+
_NOT_ACRONYMS = _SMALL_WORDS | frozenset(
1109+
"also back base been both case come data does done each even from "
1110+
"gene goes good have here high into just like long made make "
1111+
"many more most much must need new next once only over part past "
1112+
"rate role same self side some sub such take than that them then "
1113+
"this thus time type upon used uses very well were what when will "
1114+
"work year "
1115+
"about after based below early every first found great group "
1116+
"human known large level local model multi never newly novel "
1117+
"lower major means might occur often open other plant point "
1118+
"right scale shall since small space state still study their "
1119+
"these three total under until upper urban using value water "
1120+
"which while whole world would young".split()
1121+
)
1122+
1123+
def _case_part(part: str) -> str:
1124+
alpha_only = re.sub(r"[^A-Za-z]", "", part)
1125+
if (
1126+
alpha_only.isupper()
1127+
and 2 <= len(alpha_only) <= 5
1128+
and alpha_only.lower() not in _NOT_ACRONYMS
1129+
):
1130+
return part # likely acronym — keep as-is
1131+
return part.capitalize()
1132+
1133+
words = title.split()
1134+
result = []
1135+
for i, word in enumerate(words):
1136+
new_word = "-".join(_case_part(p) for p in word.split("-"))
1137+
# lowercase small words unless first or last
1138+
if i != 0 and i != len(words) - 1 and new_word.lower() in _SMALL_WORDS:
1139+
new_word = new_word.lower()
1140+
result.append(new_word)
1141+
return " ".join(result)
1142+
1143+
10871144
def _normalize_captions(captions_input, caption_type: str = "fig") -> list:
10881145
"""Normalize captions to object format with id and caption fields.
10891146
@@ -1242,6 +1299,7 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
12421299
for title_obj in result["titles"]:
12431300
if isinstance(title_obj, dict) and "title" in title_obj:
12441301
title_obj["title"] = _clean_unicode_artifacts(title_obj.get("title", ""))
1302+
title_obj["title"] = _smart_title_case(title_obj["title"])
12451303

12461304
# Enrich with identifiers from raw text
12471305
if raw_text:

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[tool.poetry]
22

33
name = "poster2json"
4-
version = "0.1.8"
4+
version = "0.1.9"
55
description = "Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models"
66

77
packages = [{ include = "poster2json" }]

0 commit comments

Comments
 (0)