Skip to content

Commit 40f93a2

Browse files
Umanistan and claude authored
Juice Squeeze: Fix block overflow and layout spacing (#170)
- Fix block overflow by using worst-case sizing for shuffled words - Sort multipliers and use largest N to ensure any row distribution fits - Move bottle collection lower to avoid overlapping word blocks - Push target phrase down to prevent overlap with title on multi-line phrases - Add NFKC text normalization for consistent tokenization Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 5c78a2f commit 40f93a2

File tree

4 files changed

+289
-149
lines changed

4 files changed

+289
-149
lines changed

corpan/packs/juice-squeeze/manifest.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,5 +8,5 @@
88
],
99
"entryType": "script",
1010
"sdkVersion": "0.1.0",
11-
"devRevision": "2026-01-29T19:15:48.776Z"
11+
"devRevision": "2026-02-01T04:33:03.799Z"
1212
}

corpan/packs/juice-squeeze/src/data.ts

Lines changed: 52 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,22 @@ const pickByLang = (translations: { language_code: string; text: string }[], lan
5959
return translations[0]?.text
6060
}
6161

62+
/**
63+
* Normalize text for consistent tokenization:
64+
* 1. Apply Unicode NFKC normalization (converts compatibility chars to canonical forms)
65+
* 2. Normalize remaining apostrophe-like characters to standard ' (U+0027)
66+
*/
67+
const normalizeForTokenization = (text: string): string => {
68+
// Step 1: NFKC normalization - converts many lookalike characters to canonical forms
69+
let normalized = text.normalize("NFKC")
70+
71+
// Step 2: Explicitly normalize any remaining apostrophe/quote variants to standard '
72+
// This catches characters that NFKC doesn't convert
73+
normalized = normalized.replace(/[''ʼʻʽʹ`ʾʿˈˊˋ˴]/g, "'")
74+
75+
return normalized
76+
}
77+
6278
/**
6379
* Split text into words and punctuation tokens
6480
* Handles punctuation marks as separate tokens
@@ -72,6 +88,9 @@ const tokenizeText = (text: string): string[] => {
7288
return tokenizeCJK(text)
7389
}
7490

91+
// Normalize text: NFKC + apostrophe variants → standard '
92+
const normalizedText = normalizeForTokenization(text)
93+
7594
const tokens: string[] = []
7695

7796
// Regex pattern that handles:
@@ -80,17 +99,18 @@ const tokenizeText = (text: string): string[] => {
8099
// 3. Regular words: "am", "hello", "the", "is" (letters, marks, numbers)
81100
// 4. Punctuation and symbols: ".", ",", "?", "!", etc.
82101

83-
// Apostrophe variants: ' (U+0027), ' (U+2019), ʼ (U+02BC), ʻ (U+02BB), ' (U+2018), ` (backtick)
102+
// Apostrophe variants: ' (U+0027), ' (U+2019), ' (U+2018), ʼ (U+02BC), ʻ (U+02BB), ʽ (U+02BD), ʹ (U+02B9), ′ (U+2032), ‚ (U+201A), ‛ (U+201B), ` (backtick)
84103
// Hyphen variants: - (U+002D), ‐ (U+2010), − (U+2212), – (U+2013), — (U+2014)
85-
const regex = /[\p{L}\p{M}\p{N}]+(?:['ʼʻ''`\-][\p{L}\p{M}\p{N}]+)*|[\p{P}\p{S}]/gu
86-
104+
// Simplified regex - only need standard apostrophe now since we normalized
105+
const regex = /[\p{L}\p{M}\p{N}]+(?:['\-][\p{L}\p{M}\p{N}]+)*|[\p{P}\p{S}]/gu
106+
87107
let match
88108
let lastIndex = 0
89-
90-
while ((match = regex.exec(text)) !== null) {
109+
110+
while ((match = regex.exec(normalizedText)) !== null) {
91111
// Skip whitespace between matches
92112
if (match.index > lastIndex) {
93-
const between = text.slice(lastIndex, match.index)
113+
const between = normalizedText.slice(lastIndex, match.index)
94114
// Only warn if there's non-whitespace content
95115
if (between.trim()) {
96116
// Commented out to reduce console noise
@@ -106,8 +126,8 @@ const tokenizeText = (text: string): string[] => {
106126
}
107127

108128
// Handle any remaining text
109-
if (lastIndex < text.length) {
110-
const remaining = text.slice(lastIndex).trim()
129+
if (lastIndex < normalizedText.length) {
130+
const remaining = normalizedText.slice(lastIndex).trim()
111131
if (remaining) {
112132
// Commented out to reduce console noise
113133
// console.warn(`[juice-squeeze:data] Remaining text after tokenization: "${remaining}"`)
@@ -119,17 +139,30 @@ const tokenizeText = (text: string): string[] => {
119139
}
120140

121141
const filtered = tokens.filter((token) => token.trim().length > 0)
122-
123-
// Debug logging for problematic cases (single letter tokens that should be part of words)
124-
// Commented out to reduce console noise
125-
// const singleLetterWords = filtered.filter(t => t.length === 1 && /[a-zA-Z]/.test(t))
126-
// if (singleLetterWords.length > 0) {
127-
// console.warn(`[juice-squeeze:data] ⚠️ WARNING: Single-letter tokens detected:`, singleLetterWords)
128-
// console.warn(`[juice-squeeze:data] Full tokenization result:`, filtered)
129-
// console.warn(`[juice-squeeze:data] Original text: "${text}"`)
130-
// }
131-
132-
return filtered
142+
143+
// Post-process: merge any remaining letter + punctuation + letter sequences
144+
// This catches ANY edge cases - if a single punctuation is between letters, merge them
145+
const merged: string[] = []
146+
147+
for (let i = 0; i < filtered.length; i++) {
148+
const current = filtered[i]
149+
const next = filtered[i + 1]
150+
const afterNext = filtered[i + 2]
151+
152+
// Check if this is: single letter + single punctuation + word starting with letter
153+
// This is a universal catch-all for apostrophes and similar contractions
154+
if (current.length === 1 && /^[\p{L}]$/u.test(current) &&
155+
next && next.length === 1 && /^[\p{P}\p{S}]$/u.test(next) &&
156+
afterNext && /^[\p{L}]/u.test(afterNext)) {
157+
// Merge all three into one token
158+
merged.push(current + next + afterNext)
159+
i += 2 // Skip the next two tokens
160+
} else {
161+
merged.push(current)
162+
}
163+
}
164+
165+
return merged
133166
}
134167

135168
/**

0 commit comments

Comments
 (0)