Skip to content

Commit 1a41d29

Browse files
committed
fix: Strip prompt-placeholder hallucinations from conference metadata
The "Unknown" cleanup only caught values matching ^unknown. The model also echoes prompt placeholders verbatim ("Name of Conference", "City, Country", "YYYY-MM-DD", etc.) when it can't find real conference info on the poster. Expand the filter to catch these, and also drop dict-valued top-level fields (e.g. publisher) whose "name" is a placeholder.
1 parent e642823 commit 1a41d29

File tree

1 file changed

+26
-3
lines changed

1 file changed

+26
-3
lines changed

poster2json/extract.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1249,17 +1249,40 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
12491249

12501250
result = enrich_json_with_identifiers(result, raw_text)
12511251

1252-
# Strip "Unknown" placeholder values the LLM likes to hallucinate.
1252+
# Strip "Unknown" and prompt-placeholder values the LLM likes to hallucinate.
12531253
# These violate metadata quality expectations — better to omit than guess.
12541254
_UNKNOWN_RE = re.compile(r"^unknown\b", re.IGNORECASE)
1255+
# Prompt placeholders that the model echoes back verbatim when it can't
1256+
# find real conference metadata on the poster.
1257+
_PLACEHOLDER_VALS = {
1258+
"name of conference",
1259+
"conference name",
1260+
"city, country",
1261+
"location",
1262+
"conference organizer or institution name",
1263+
"conference or institution",
1264+
}
1265+
_PLACEHOLDER_DATE_RE = re.compile(r"^[Yy]{4}-[Mm]{2}-[Dd]{2}$")
1266+
1267+
def _is_placeholder(val: str) -> bool:
    """Return True when *val* is a placeholder rather than real metadata.

    After stripping surrounding whitespace, a value counts as a placeholder
    when it starts with the word "unknown" (``_UNKNOWN_RE``), equals one of
    the echoed prompt placeholders in ``_PLACEHOLDER_VALS`` ignoring case,
    or is a literal date template such as ``YYYY-MM-DD``
    (``_PLACEHOLDER_DATE_RE``).
    """
    s = val.strip()
    # bool() keeps the annotated return type honest: Pattern.match() yields a
    # Match object (or None), which would otherwise leak out of the or-chain.
    return bool(
        _UNKNOWN_RE.match(s)
        or s.lower() in _PLACEHOLDER_VALS
        or _PLACEHOLDER_DATE_RE.match(s)
    )
1274+
12551275
if "conference" in result and isinstance(result["conference"], dict):
12561276
for key in list(result["conference"]):
12571277
val = result["conference"][key]
1258-
if isinstance(val, str) and _UNKNOWN_RE.match(val.strip()):
1278+
if isinstance(val, str) and _is_placeholder(val):
12591279
del result["conference"][key]
12601280
# Top-level optional string fields
12611281
for key in ("conferenceLocation", "publisher", "researchField"):
1262-
if key in result and isinstance(result[key], str) and _UNKNOWN_RE.match(result[key].strip()):
1282+
val = result.get(key)
1283+
if isinstance(val, str) and _is_placeholder(val):
1284+
del result[key]
1285+
elif isinstance(val, dict) and "name" in val and isinstance(val["name"], str) and _is_placeholder(val["name"]):
12631286
del result[key]
12641287

12651288
return result

0 commit comments

Comments
 (0)