Post-process: strip "Unknown" placeholder values from conference metadata and optional string fields

jimnoneill · jimnoneill · commit e64282359793 · 2026-02-28T09:59:32.000-08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [0.1.7] - 2026-02-28
+
+### Fixed
+
+- Post-process: omit `sectionTitle` when it is empty instead of writing `""`, which violates the schema's `minLength: 1` constraint
+- Post-process: strip "Unknown" placeholder values from conference metadata and optional string fields
+
 ## [0.1.6] - 2026-02-26
 
 ### Changed
diff --git a/poster2json/extract.py b/poster2json/extract.py
@@ -1184,7 +1184,10 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
                     content.strip() if isinstance(content, str) else ""
                 )
                 if content and len(content) > 10:
-                    cleaned_sections.append({"sectionTitle": title, "sectionContent": content})
+                    entry = {"sectionContent": content}
+                    if title:
+                        entry["sectionTitle"] = title
+                    cleaned_sections.append(entry)
             # Recover uncaptured raw text as untitled section(s).
             # The LLM sometimes drops footer content (contact info, URLs).
             # Compare raw text lines against section content and reclaim
@@ -1223,7 +1226,6 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
 
                 if uncaptured and len(" ".join(uncaptured)) > 10:
                     cleaned_sections.append({
-                        "sectionTitle": "",
                         "sectionContent": "\n".join(uncaptured),
                     })
 
@@ -1247,6 +1249,19 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
 
         result = enrich_json_with_identifiers(result, raw_text)
 
+    # Strip string values that start with the word "Unknown" (any case) — placeholders the LLM likes to hallucinate.
+    # These violate metadata quality expectations — better to omit than guess.
+    _UNKNOWN_RE = re.compile(r"^unknown\b", re.IGNORECASE)
+    if "conference" in result and isinstance(result["conference"], dict):
+        for key in list(result["conference"]):
+            val = result["conference"][key]
+            if isinstance(val, str) and _UNKNOWN_RE.match(val.strip()):
+                del result["conference"][key]
+    # Top-level optional string fields
+    for key in ("conferenceLocation", "publisher", "researchField"):
+        if key in result and isinstance(result[key], str) and _UNKNOWN_RE.match(result[key].strip()):
+            del result[key]
+
     return result
 
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 
 name = "poster2json"
-version = "0.1.6"
+version = "0.1.7"
 description = "Convert scientific posters (PDF/images) to structured JSON metadata using Large Language Models"
 
 packages = [{ include = "poster2json" }]