@@ -1184,7 +1184,10 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
11841184 content .strip () if isinstance (content , str ) else ""
11851185 )
11861186 if content and len (content ) > 10 :
1187- cleaned_sections .append ({"sectionTitle" : title , "sectionContent" : content })
1187+ entry = {"sectionContent" : content }
1188+ if title :
1189+ entry ["sectionTitle" ] = title
1190+ cleaned_sections .append (entry )
11881191 # Recover uncaptured raw text as untitled section(s).
11891192 # The LLM sometimes drops footer content (contact info, URLs).
11901193 # Compare raw text lines against section content and reclaim
@@ -1223,7 +1226,6 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
12231226
12241227 if uncaptured and len (" " .join (uncaptured )) > 10 :
12251228 cleaned_sections .append ({
1226- "sectionTitle" : "" ,
12271229 "sectionContent" : "\n " .join (uncaptured ),
12281230 })
12291231
@@ -1247,6 +1249,19 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
12471249
12481250 result = enrich_json_with_identifiers (result , raw_text )
12491251
1252+ # Strip "Unknown" placeholder values the LLM likes to hallucinate.
1253+ # These violate metadata quality expectations — better to omit than guess.
1254+ _UNKNOWN_RE = re .compile (r"^unknown\b" , re .IGNORECASE )
1255+ if "conference" in result and isinstance (result ["conference" ], dict ):
1256+ for key in list (result ["conference" ]):
1257+ val = result ["conference" ][key ]
1258+ if isinstance (val , str ) and _UNKNOWN_RE .match (val .strip ()):
1259+ del result ["conference" ][key ]
1260+ # Top-level optional string fields
1261+ for key in ("conferenceLocation" , "publisher" , "researchField" ):
1262+ if key in result and isinstance (result [key ], str ) and _UNKNOWN_RE .match (result [key ].strip ()):
1263+ del result [key ]
1264+
12501265 return result
12511266
12521267
0 commit comments