Skip to content

Commit e505f8e

Browse files
committed
curly quote normalization
1 parent d5b4f5a commit e505f8e

File tree

1 file changed

+46
-5
lines changed

1 file changed

+46
-5
lines changed

poster2json/extract.py

Lines changed: 46 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -618,13 +618,22 @@ def _extract_first_json_object(s: str) -> str:
618618

619619

620620
def _repair_unescaped_quotes(s: str) -> str:
621-
"""Fix quotes that appear after / which are not properly escaped."""
621+
"""Fix unescaped double-quotes inside JSON string values.
622+
623+
Handles two patterns:
624+
1. Quotes after units: e.g. ``16.7 pc/"`` → ``16.7 pc/\\"``
625+
2. Inline scare-quotes: e.g. ``M dwarf "twin" binaries``
626+
→ ``M dwarf 'twin' binaries``
627+
"""
628+
# Unit-slash pattern (original)
622629
s = re.sub(
623630
r'(\d+\s*(?:pc|km|m|cm|mm|Hz|kHz|MHz|GHz|s|ms|ns|arcsec|arcmin|deg))/"',
624631
r'\1/\\"',
625632
s,
626633
)
627634
s = re.sub(r'\((\d+\.?\d*\s*\w+)/"\)', r'(\1/\\")', s)
635+
# Inline scare-quotes: word "quoted word(s)" word → single quotes
636+
s = re.sub(r'(?<=\w)\s*"\s*(\w+(?:\s+\w+)*)\s*"\s*(?=\w)', r" '\1' ", s)
628637
return s
629638

630639

@@ -912,16 +921,48 @@ def _postprocess_json(data: dict, raw_text: str = "") -> dict:
912921
# ============================
913922

914923

924+
def _normalize_raw_text_for_model(text: str) -> str:
925+
"""Normalize Unicode in raw OCR text before feeding to the model.
926+
927+
NFKD decomposition converts superscripts (¹²³⁺), subscripts (ₛ),
928+
and other compatibility characters to their ASCII equivalents,
929+
reducing token count and improving model JSON generation.
930+
931+
Smart/curly quotes are replaced with single quotes to prevent the
932+
model from outputting unescaped straight double-quotes inside JSON
933+
string values (e.g. OCR ``\u201ctwin\u201d`` → model ``"twin"`` →
934+
broken JSON).
935+
"""
936+
# NFKD: ¹→1, ²→2, ³→3, ⁺→+, ₛ→s, etc.
937+
text = unicodedata.normalize("NFKD", text)
938+
# Remove combining marks left over from decomposition
939+
text = "".join(c for c in text if not unicodedata.combining(c))
940+
# Replace smart/curly quotes with straight single quotes
941+
text = text.replace("\u201c", "'").replace("\u201d", "'") # " " → '
942+
text = text.replace("\u2018", "'").replace("\u2019", "'") # ' ' → '
943+
text = text.replace("\u00ab", "'").replace("\u00bb", "'") # « » → '
944+
# Fix mixed-pair leftovers: curly open (now ') + straight close (")
945+
# e.g. 'twin" → 'twin'
946+
text = re.sub(r"'(\w+(?:\s+\w+)*)\"", r"'\1'", text)
947+
# Fix remaining inline scare-quotes: word "quoted" word → word 'quoted' word
948+
text = re.sub(
949+
r'(?<=\w)\s"(\w+(?:\s+\w+)*)"\s(?=\w)', r" '\1' ", text
950+
)
951+
return text
952+
953+
915954
def extract_json_with_retry(raw_text: str, model, tokenizer) -> dict:
916955
"""
917956
Send raw poster text to the LLM and robustly parse the JSON response.
918957
919958
This function:
920-
1. Calls the model with a full prompt
921-
2. Retries with more tokens if truncation is detected
922-
3. Falls back to a shorter prompt if needed
923-
4. Runs repair passes to make the JSON parseable
959+
1. Normalizes Unicode in the raw text
960+
2. Calls the model with a full prompt
961+
3. Retries with more tokens if truncation is detected
962+
4. Falls back to a shorter prompt if needed
963+
5. Runs repair passes to make the JSON parseable
924964
"""
965+
raw_text = _normalize_raw_text_for_model(raw_text)
925966
prompt = EXTRACTION_PROMPT.format(raw_text=raw_text)
926967

927968
log("Starting primary JSON extraction with full prompt")

0 commit comments

Comments
 (0)