Skip to content

Commit 7a9694b

Browse files
authored
Merge pull request mblink#3 from mblink/ks-style-tags-diff
html diff support style tags
2 parents 3e2482b + ea1675a commit 7a9694b

File tree

2 files changed

+73
-4
lines changed

2 files changed

+73
-4
lines changed

src/htmldiff.ts

Lines changed: 71 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ function isWhitespace(char: string): boolean {
3737
return /^\s+$/.test(char);
3838
}
3939

40+
/**
 * Exhaustiveness guard for the `default` branch of a `switch` over a union type.
 *
 * The `never` parameter makes the compiler reject the call while any member of the
 * union is still unhandled. Types are erased at runtime, so if an unexpected value
 * reaches this function anyway it throws instead of handing that value back to the
 * caller as if it were a valid result.
 *
 * @param {never} a The value that should be impossible at this point.
 *
 * @throws {Error} Always; the message contains the offending value.
 */
function exhaustive(a: never): never {
  throw new Error('Unknown mode ' + String(a));
}
43+
4044

4145
const tagRegExp = /^\s*<([^!>][^>]*)>\s*$/;
4246
/**
@@ -95,6 +99,38 @@ function isEndOfAtomicTag(word: string, tag: string){
9599
return word.substring(word.length - tag.length - 2) === ('</' + tag);
96100
}
97101

102+
// Matches an attribute-less opening style tag at the very start of a token, such as
// '<strong>' or '<em>', and captures the tag name in group 1. The tag name must be
// followed directly by '>'. An earlier `^(?!\w)` alternative in that position could never
// match (without the `m` flag `^` only matches at index 0), so it has been removed without
// changing which inputs match.
// NOTE(review): opening tags that carry attributes (e.g. '<strong class="x">') are not
// matched - confirm whether that is intended.
const styleTagsRegExp = /^<(strong|em|b|i|q|cite|blockquote|mark|dfn|sup|sub|u|s)>/;
103+
104+
/**
 * Checks if the current word is the beginning of a style tag. A style tag is one whose
 * child nodes should be compared, but where the tag together with its content is treated
 * as a single token. This is useful for tags where it does not make sense to insert
 * <ins> and <del> tags.
 *
 * @param {string} word The characters of the current token read so far.
 *
 * @return {string|null} The name of the style tag if the word starts with one,
 *    null otherwise.
 */
function isStartOfStyleTag(word: string) {
  const match = styleTagsRegExp.exec(word);
  if (match === null) {
    return null;
  }
  return match[1];
}
119+
120+
/**
 * Checks if the current word is the end of a style tag, i.e. it already holds every
 * character of the closing tag except the final '>' (for example '<strong></strong').
 *
 * @param {string} word The characters of the current token read so far.
 * @param {string} tag The name of the style tag whose closing tag is expected.
 *
 * @return {boolean} True if the word ends with '</' followed by the tag name,
 *    false otherwise.
 */
function isEndOfStyleTag(word: string, tag: string) {
  const closingPrefix = '</' + tag;
  return word.endsWith(closingPrefix);
}
133+
98134
/**
99135
* Checks if a tag is a void tag.
100136
*
@@ -173,6 +209,7 @@ function makeMatch(startInBefore: number, startInAfter: number, length: number,
173209
segmentEndInAfter: startInAfter + length - 1
174210
};}
175211

212+
/** The states the `htmlToTokens` scanner can be in while it walks the input. */
type ParseMode =
  | 'char'
  | 'tag'
  | 'atomic_tag'
  | 'style_tag'
  | 'html_comment'
  | 'whitespace';
176213
/**
177214
* Tokenizes a string of HTML.
178215
*
@@ -181,22 +218,28 @@ function makeMatch(startInBefore: number, startInAfter: number, length: number,
181218
* @return {Array.<string>} The list of tokens.
182219
*/
183220
export function htmlToTokens(html: string): Token[] {
184-
let mode = 'char';
221+
let mode: ParseMode = 'char';
185222
let currentWord = '';
186223
let currentAtomicTag = '';
224+
let currentStyleTag = '';
187225
const words = [];
188226

189227
for (const char of html) {
190228
switch (mode){
191229
case 'tag': {
192230
const atomicTag = isStartOfAtomicTag(currentWord);
231+
const styleTag = isStartOfStyleTag(currentWord + char);
193232
if (atomicTag){
194233
mode = 'atomic_tag';
195234
currentAtomicTag = atomicTag;
196235
currentWord += char;
197236
} else if (isStartOfHTMLComment(currentWord)){
198237
mode = 'html_comment';
199238
currentWord += char;
239+
} else if (styleTag) {
240+
mode = 'style_tag';
241+
currentStyleTag = styleTag;
242+
currentWord = '<nobr>' + currentWord + char;
200243
} else if (isEndOfTag(char)){
201244
currentWord += '>';
202245
words.push(createToken(currentWord));
@@ -229,6 +272,26 @@ export function htmlToTokens(html: string): Token[] {
229272
mode = 'char';
230273
}
231274
break;
275+
case 'style_tag':
276+
if (isEndOfTag(char) && isEndOfStyleTag(currentWord, currentStyleTag)) {
277+
currentWord += '>' + '</nobr>';
278+
words.push(createToken(currentWord));
279+
currentWord = '';
280+
currentStyleTag = '';
281+
mode = 'char';
282+
}
283+
else {
284+
// break up styled blocks into individual styled words
285+
if (/(\s+|&nbsp;|&#160;)/.test(char)) {
286+
currentWord += '</' + currentStyleTag + '>';
287+
if (currentWord) {
288+
words.push(createToken(currentWord));
289+
}
290+
currentWord = '<' + currentStyleTag + '>';
291+
}
292+
currentWord += char;
293+
}
294+
break;
232295
case 'char':
233296
if (isStartOfTag(char)){
234297
if (currentWord){
@@ -273,7 +336,7 @@ export function htmlToTokens(html: string): Token[] {
273336
}
274337
break;
275338
default:
276-
throw new Error('Unknown mode ' + mode);
339+
return exhaustive(mode);
277340
}
278341
}
279342
if (currentWord){
@@ -330,6 +393,12 @@ function getKeyForToken(token: string){
330393
return `<iframe src="${iframe[1]}"></iframe>`;
331394
}
332395

396+
// Treat entire style tag as needing to be compared
397+
const styleTag = styleTagsRegExp.exec(token);
398+
if (styleTag) {
399+
return token;
400+
}
401+
333402
// If the token is any other element, just grab the tag name.
334403
const tagName = /<([^\s>]+)[\s>]/.exec(token);
335404
if (tagName){

test/html_to_tokens.spec.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ describe('htmlToTokens', function(){
3333
res = htmlToTokens('<p>this is a <strong>test</strong></p>');
3434
});
3535

36-
it('should return 11', function(){
37-
expect(res.length).to.equal(11);
36+
it('should return 9', function(){
37+
expect(res.length).to.equal(9);
3838
});
3939

4040
it('should remove any html comments', function(){

0 commit comments

Comments
 (0)