Skip to content

Commit 86f9b6f

Browse files
committed
feat: regex identifier extraction and caption ID auto-generation
Extract ORCID, DOI, arXiv, ROR, ISNI, GRID, Crossref Funder IDs from raw poster text via regex post-processing (no LLM token cost). Auto-populate scheme/schemeURI for existing identifiers. Auto-generate missing caption IDs (fig1, fig2, table1, ...).
1 parent 69b5109 commit 86f9b6f

File tree

3 files changed

+780
-8
lines changed

3 files changed

+780
-8
lines changed

poster2json/extract.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -783,14 +783,17 @@ def _clean_unicode_artifacts(text: str) -> str:
783783
return text.strip()
784784

785785

786-
def _normalize_captions(captions_input) -> list:
787-
"""Normalize captions to object format with id and caption fields."""
786+
def _normalize_captions(captions_input, caption_type: str = "fig") -> list:
787+
"""Normalize captions to object format with id and caption fields.
788+
789+
Auto-generates missing IDs as {caption_type}1, {caption_type}2, etc.
790+
"""
788791
normalized = []
789792
seen_texts = set()
790793

791794
# Handle various input formats
792795
if isinstance(captions_input, str):
793-
return [{"caption": captions_input}] if captions_input.strip() else []
796+
return [{"id": f"{caption_type}1", "caption": captions_input}] if captions_input.strip() else []
794797

795798
if not isinstance(captions_input, list):
796799
return []
@@ -827,10 +830,15 @@ def _normalize_captions(captions_input) -> list:
827830
seen_texts.add(key)
828831
normalized.append(caption_obj)
829832

833+
# Auto-generate missing IDs
834+
for i, cap in enumerate(normalized, start=1):
835+
if "id" not in cap:
836+
cap["id"] = f"{caption_type}{i}"
837+
830838
return normalized
831839

832840

833-
def _postprocess_json(data: dict) -> dict:
841+
def _postprocess_json(data: dict, raw_text: str = "") -> dict:
834842
"""Comprehensive post-processing for extracted JSON."""
835843
result = data.copy()
836844

@@ -847,12 +855,12 @@ def _postprocess_json(data: dict) -> dict:
847855
if "domain" in result and "researchField" not in result:
848856
result["researchField"] = result.pop("domain")
849857

850-
# Ensure caption fields exist and normalize to string arrays
851-
for key in ["imageCaptions", "tableCaptions"]:
858+
# Ensure caption fields exist and normalize with auto-generated IDs
859+
for key, ctype in [("imageCaptions", "fig"), ("tableCaptions", "table")]:
852860
if key not in result:
853861
result[key] = []
854862
elif isinstance(result[key], (dict, list)):
855-
result[key] = _normalize_captions(result[key])
863+
result[key] = _normalize_captions(result[key], caption_type=ctype)
856864

857865
# Clean Unicode from string fields
858866
for key in ["researchField"]:
@@ -890,6 +898,12 @@ def _postprocess_json(data: dict) -> dict:
890898
if isinstance(title_obj, dict) and "title" in title_obj:
891899
title_obj["title"] = _clean_unicode_artifacts(title_obj.get("title", ""))
892900

901+
# Enrich with identifiers from raw text
902+
if raw_text:
903+
from .identifiers import enrich_json_with_identifiers
904+
905+
result = enrich_json_with_identifiers(result, raw_text)
906+
893907
return result
894908

895909

@@ -931,7 +945,7 @@ def extract_json_with_retry(raw_text: str, model, tokenizer) -> dict:
931945
response = _generate(model, tokenizer, fallback_prompt, MAX_RETRY_TOKENS)
932946
result = _robust_json_parse(response)
933947

934-
result = _postprocess_json(result)
948+
result = _postprocess_json(result, raw_text=raw_text)
935949
return result
936950

937951

0 commit comments

Comments
 (0)