@@ -37,6 +37,10 @@ function isWhitespace(char: string): boolean {
3737 return / ^ \s + $ / . test ( char ) ;
3838}
3939
/**
 * Exhaustiveness guard for the `default` branch of a `switch` over a union type.
 *
 * The `never` parameter makes the compiler reject the call while any union member is
 * still unhandled. If a value nevertheless reaches this function at runtime (for
 * example through an unchecked cast), it throws instead of handing the value back,
 * so the declared `never` return type is actually honoured.
 *
 * @param {never} a The value that should be impossible at this point.
 *
 * @throws {Error} Always; the message contains the unexpected value.
 */
function exhaustive(a: never): never {
  // Same message the tokenizer's default branch used before it delegated here.
  throw new Error('Unknown mode ' + String(a));
}
43+
4044
4145const tagRegExp = / ^ \s * < ( [ ^ ! > ] [ ^ > ] * ) > \s * $ / ;
4246/**
@@ -95,6 +99,38 @@ function isEndOfAtomicTag(word: string, tag: string){
9599 return word . substring ( word . length - tag . length - 2 ) === ( '</' + tag ) ;
96100}
97101
// Matches an attribute-less opening style tag at the start of a string and captures
// its name in group 1. The former trailing alternative `^(?!\w)` could never match
// (a start-of-input assertion after consumed characters, with no `m` flag), so only
// the closing `>` is required; tags carrying attributes are not matched.
const styleTagsRegExp = /^<(strong|em|b|i|q|cite|blockquote|mark|dfn|sup|sub|u|s)>/;
103+
/**
 * Checks whether the current word begins with an opening style tag, i.e. whether it
 * matches `styleTagsRegExp` (inline formatting elements such as <strong>, <em> or
 * <sub>).
 *
 * @param {string} word The characters of the current token read so far.
 *
 * @return {string|null} The name of the style tag if the word starts with one,
 *    null otherwise.
 */
function isStartOfStyleTag(word: string) {
  const match = styleTagsRegExp.exec(word);
  if (match === null) {
    return null;
  }
  // Capture group 1 holds the tag name.
  return match[1];
}
119+
/**
 * Checks whether the current word ends with the closing style tag minus its final
 * bracket (for example '<strong>text</strong' for the tag 'strong').
 *
 * @param {string} word The characters of the current token read so far.
 * @param {string} tag The name of the style tag whose closing tag is expected.
 *
 * @return {boolean} True if the word ends with '</' followed by the tag name,
 *    false otherwise.
 */
function isEndOfStyleTag(word: string, tag: string) {
  const closingPrefix = '</' + tag;
  return word.endsWith(closingPrefix);
}
133+
98134/**
99135 * Checks if a tag is a void tag.
100136 *
@@ -173,6 +209,7 @@ function makeMatch(startInBefore: number, startInAfter: number, length: number,
173209 segmentEndInAfter : startInAfter + length - 1
174210 } ; }
175211
/** The scanner states that `htmlToTokens` switches between while reading characters. */
type ParseMode =
  | 'char'
  | 'tag'
  | 'atomic_tag'
  | 'style_tag'
  | 'html_comment'
  | 'whitespace';
176213/**
177214 * Tokenizes a string of HTML.
178215 *
@@ -181,22 +218,28 @@ function makeMatch(startInBefore: number, startInAfter: number, length: number,
181218 * @return {Array.<string> } The list of tokens.
182219 */
183220export function htmlToTokens ( html : string ) : Token [ ] {
184- let mode = 'char' ;
221+ let mode : ParseMode = 'char' ;
185222 let currentWord = '' ;
186223 let currentAtomicTag = '' ;
224+ let currentStyleTag = '' ;
187225 const words = [ ] ;
188226
189227 for ( const char of html ) {
190228 switch ( mode ) {
191229 case 'tag' : {
192230 const atomicTag = isStartOfAtomicTag ( currentWord ) ;
231+ const styleTag = isStartOfStyleTag ( currentWord + char ) ;
193232 if ( atomicTag ) {
194233 mode = 'atomic_tag' ;
195234 currentAtomicTag = atomicTag ;
196235 currentWord += char ;
197236 } else if ( isStartOfHTMLComment ( currentWord ) ) {
198237 mode = 'html_comment' ;
199238 currentWord += char ;
239+ } else if ( styleTag ) {
240+ mode = 'style_tag' ;
241+ currentStyleTag = styleTag ;
242+ currentWord = '<nobr>' + currentWord + char ;
200243 } else if ( isEndOfTag ( char ) ) {
201244 currentWord += '>' ;
202245 words . push ( createToken ( currentWord ) ) ;
@@ -229,6 +272,26 @@ export function htmlToTokens(html: string): Token[] {
229272 mode = 'char' ;
230273 }
231274 break ;
275+ case 'style_tag' :
276+ if ( isEndOfTag ( char ) && isEndOfStyleTag ( currentWord , currentStyleTag ) ) {
277+ currentWord += '>' + '</nobr>' ;
278+ words . push ( createToken ( currentWord ) ) ;
279+ currentWord = '' ;
280+ currentStyleTag = '' ;
281+ mode = 'char' ;
282+ }
283+ else {
284+ // break up styled blocks into individual styled words
285+ if ( / ( \s + | & n b s p ; | & # 1 6 0 ; ) / . test ( char ) ) {
286+ currentWord += '</' + currentStyleTag + '>' ;
287+ if ( currentWord ) {
288+ words . push ( createToken ( currentWord ) ) ;
289+ }
290+ currentWord = '<' + currentStyleTag + '>' ;
291+ }
292+ currentWord += char ;
293+ }
294+ break ;
232295 case 'char' :
233296 if ( isStartOfTag ( char ) ) {
234297 if ( currentWord ) {
@@ -273,7 +336,7 @@ export function htmlToTokens(html: string): Token[] {
273336 }
274337 break ;
275338 default :
276- throw new Error ( 'Unknown mode ' + mode ) ;
339+ return exhaustive ( mode ) ;
277340 }
278341 }
279342 if ( currentWord ) {
@@ -330,6 +393,12 @@ function getKeyForToken(token: string){
330393 return `<iframe src="${ iframe [ 1 ] } "></iframe>` ;
331394 }
332395
396+ // Treat entire style tag as needing to be compared
397+ const styleTag = styleTagsRegExp . exec ( token ) ;
398+ if ( styleTag ) {
399+ return token ;
400+ }
401+
333402 // If the token is any other element, just grab the tag name.
334403 const tagName = / < ( [ ^ \s > ] + ) [ \s > ] / . exec ( token ) ;
335404 if ( tagName ) {
0 commit comments