@@ -59,6 +59,22 @@ const pickByLang = (translations: { language_code: string; text: string }[], lan
5959 return translations [ 0 ] ?. text
6060}
6161
/**
 * Normalize text so that the tokenizer only has to recognise one apostrophe.
 *
 * Steps, in order:
 * 1. Map the spacing acute accent (U+00B4, and its canonical equivalent
 *    U+1FFD) to ' (U+0027). This MUST happen before NFKC: NFKC decomposes
 *    U+00B4 into SPACE + COMBINING ACUTE (U+0020 U+0301), which would split
 *    a word like "don´t" in two and leave a stray combining mark behind.
 * 2. Apply Unicode NFKC normalization (folds compatibility characters such
 *    as full-width forms and ligatures into their canonical equivalents).
 * 3. Map the remaining apostrophe-like characters, which NFKC leaves
 *    untouched, to ' (U+0027).
 *
 * Note that the returned string may differ in length from the input
 * (e.g. the "ﬁ" ligature becomes "fi"), so offsets into the result do not
 * necessarily correspond to offsets into the original text.
 *
 * @param text - Raw input text.
 * @returns The normalized text.
 */
const normalizeForTokenization = (text: string): string => {
  // Acute-accent characters that NFKC would turn into " " + U+0301.
  const acuteAccents = /[\u00B4\u1FFD]/g

  // Apostrophe/quote look-alikes, written as escapes so the exact code
  // points are unambiguous:
  //   U+2018 ‘  U+2019 ’  U+02BC ʼ  U+02BB ʻ  U+02BD ʽ  U+02B9 ʹ  U+2032 ′
  //   U+201A ‚  U+201B ‛  U+0060 `  U+02BE ʾ  U+02BF ʿ  U+02C8 ˈ  U+02CA ˊ
  //   U+02CB ˋ  U+02F4 ˴  U+A78C ꞌ
  const apostropheVariants =
    /[\u2018\u2019\u02BC\u02BB\u02BD\u02B9\u2032\u201A\u201B\u0060\u02BE\u02BF\u02C8\u02CA\u02CB\u02F4\uA78C]/g

  return text
    .replace(acuteAccents, "'")
    .normalize("NFKC")
    .replace(apostropheVariants, "'")
}
77+
6278/**
6379 * Split text into words and punctuation tokens
6480 * Handles punctuation marks as separate tokens
@@ -72,6 +88,9 @@ const tokenizeText = (text: string): string[] => {
7288 return tokenizeCJK ( text )
7389 }
7490
91+ // Normalize text: NFKC + apostrophe variants → standard '
92+ const normalizedText = normalizeForTokenization ( text )
93+
7594 const tokens : string [ ] = [ ]
7695
7796 // Regex pattern that handles:
@@ -80,17 +99,18 @@ const tokenizeText = (text: string): string[] => {
8099 // 3. Regular words: "am", "hello", "the", "is" (letters, marks, numbers)
81100 // 4. Punctuation and symbols: ".", ",", "?", "!", etc.
82101
83- // Apostrophe variants: ' (U+0027), ' (U+2019), ʼ (U+02BC), ʻ (U+02BB), ' (U+2018 ), ` (backtick)
102+ // Apostrophe variants: ' (U+0027), ' (U+2019), ' (U+2018), ʼ (U+02BC), ʻ (U+02BB), ʽ (U+02BD), ʹ (U+02B9), ′ (U+2032), ‚ (U+201A), ‛ (U+201B ), ` (backtick)
84103 // Hyphen variants: - (U+002D), ‐ (U+2010), − (U+2212), – (U+2013), — (U+2014)
85- const regex = / [ \p{ L} \p{ M} \p{ N} ] + (?: [ ' ʼ ʻ ' ' ` \- ‐ − – — ] [ \p{ L} \p{ M} \p{ N} ] + ) * | [ \p{ P} \p{ S} ] / gu
86-
104+ // Simplified regex - only need standard apostrophe now since we normalized
105+ const regex = / [ \p{ L} \p{ M} \p{ N} ] + (?: [ ' \- ‐ − – — ] [ \p{ L} \p{ M} \p{ N} ] + ) * | [ \p{ P} \p{ S} ] / gu
106+
87107 let match
88108 let lastIndex = 0
89-
90- while ( ( match = regex . exec ( text ) ) !== null ) {
109+
110+ while ( ( match = regex . exec ( normalizedText ) ) !== null ) {
91111 // Skip whitespace between matches
92112 if ( match . index > lastIndex ) {
93- const between = text . slice ( lastIndex , match . index )
113+ const between = normalizedText . slice ( lastIndex , match . index )
94114 // Only warn if there's non-whitespace content
95115 if ( between . trim ( ) ) {
96116 // Commented out to reduce console noise
@@ -106,8 +126,8 @@ const tokenizeText = (text: string): string[] => {
106126 }
107127
108128 // Handle any remaining text
109- if ( lastIndex < text . length ) {
110- const remaining = text . slice ( lastIndex ) . trim ( )
129+ if ( lastIndex < normalizedText . length ) {
130+ const remaining = normalizedText . slice ( lastIndex ) . trim ( )
111131 if ( remaining ) {
112132 // Commented out to reduce console noise
113133 // console.warn(`[juice-squeeze:data] Remaining text after tokenization: "${remaining}"`)
@@ -119,17 +139,30 @@ const tokenizeText = (text: string): string[] => {
119139 }
120140
121141 const filtered = tokens . filter ( ( token ) => token . trim ( ) . length > 0 )
122-
123- // Debug logging for problematic cases (single letter tokens that should be part of words)
124- // Commented out to reduce console noise
125- // const singleLetterWords = filtered.filter(t => t.length === 1 && /[a-zA-Z]/.test(t))
126- // if (singleLetterWords.length > 0) {
127- // console.warn(`[juice-squeeze:data] ⚠️ WARNING: Single-letter tokens detected:`, singleLetterWords)
128- // console.warn(`[juice-squeeze:data] Full tokenization result:`, filtered)
129- // console.warn(`[juice-squeeze:data] Original text: "${text}"`)
130- // }
131-
132- return filtered
142+
143+ // Post-process: merge any remaining letter + punctuation + letter sequences
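144+ // Catch-all for remaining edge cases: a single-letter token, followed by a single punctuation/symbol token, followed by a token that starts with a letter, is merged into one token. NOTE(review): this is not limited to apostrophes - a sequence such as "I" "," "Robot" would presumably also be merged into "I,Robot"; confirm this is intended.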
145+ const merged : string [ ] = [ ]
146+
147+ for ( let i = 0 ; i < filtered . length ; i ++ ) {
148+ const current = filtered [ i ]
149+ const next = filtered [ i + 1 ]
150+ const afterNext = filtered [ i + 2 ]
151+
152+ // Check if this is: single letter + single punctuation + word starting with letter
153+ // This is a universal catch-all for apostrophes and similar contractions
154+ if ( current . length === 1 && / ^ [ \p{ L} ] $ / u. test ( current ) &&
155+ next && next . length === 1 && / ^ [ \p{ P} \p{ S} ] $ / u. test ( next ) &&
156+ afterNext && / ^ [ \p{ L} ] / u. test ( afterNext ) ) {
157+ // Merge all three into one token
158+ merged . push ( current + next + afterNext )
159+ i += 2 // Skip the next two tokens
160+ } else {
161+ merged . push ( current )
162+ }
163+ }
164+
165+ return merged
133166}
134167
135168/**
0 commit comments