Merge pull request mblink#3 from mblink/ks-style-tags-diff

SuttonKyle · web-flow · commit 7a9694bf6d4f · 2021-12-22T10:58:03.000-05:00
html diff support style tags
diff --git a/src/htmldiff.ts b/src/htmldiff.ts
@@ -37,6 +37,10 @@ function isWhitespace(char: string): boolean {
   return /^\s+$/.test(char);
 }
 
+function exhaustive(a: never): never { // Exhaustiveness guard: the argument must have type `never`. NOTE(review): returns its argument (does not throw) if reached at runtime.
+  return a;
+}
+
 
 const tagRegExp = /^\s*<([^!>][^>]*)>\s*$/;
 /**
@@ -95,6 +99,38 @@ function isEndOfAtomicTag(word: string, tag: string){
   return word.substring(word.length - tag.length - 2) === ('</' + tag);
 }
 
+const styleTagsRegExp = /^<(strong|em|b|i|q|cite|blockquote|mark|dfn|sup|sub|u|s)(^(?!\w)|>)/; // NOTE(review): with no 'm' flag the '^(?!\w)' branch cannot match after '<tag', so only '<tag>' (no attributes) is recognized - confirm intended.
+
+/**
+ * Checks if the current word is the beginning of a style tag. A style tag is one whose
+ * child nodes should be compared, but the entire tag should be treated as one token. This
+ * is useful for tags where it does not make sense to insert <ins> and <del> tags.
+ *
+ * @param {string} word The characters of the current token read so far.
+ *
+ * @return {string|null} The name of the style tag (first capture group of
+ *    styleTagsRegExp) if the word matches, null otherwise
+ */
+
+function isStartOfStyleTag(word: string) {
+    const result = styleTagsRegExp.exec(word);
+    return result && result[1];
+}
+
+/**
+ * Checks if the current word is the end of a style tag (i.e. it has all the characters,
+ * except for the end bracket of the closing tag, such as '<strong></strong').
+ *
+ * @param {string} word The characters of the current token read so far.
+ * @param {string} tag The name of the closing tag to look for, without brackets.
+ *
+ * @return {boolean} True if the word ends with '</' followed by the tag name (the final
+ *    '>' has not been appended yet), false otherwise.
+ */
+function isEndOfStyleTag(word: string, tag: string) {
+    return word.substring(word.length - tag.length - 2) === ('</' + tag);
+}
+
 /**
  * Checks if a tag is a void tag.
  *
@@ -173,6 +209,7 @@ function makeMatch(startInBefore: number, startInAfter: number, length: number,
     segmentEndInAfter: startInAfter + length - 1
   };}
 
+type ParseMode = 'char' | 'tag' | 'atomic_tag' | 'style_tag' | 'html_comment' | 'whitespace'; // Type of the `mode` state variable that htmlToTokens switches on for each character.
 /**
  * Tokenizes a string of HTML.
  *
@@ -181,22 +218,28 @@ function makeMatch(startInBefore: number, startInAfter: number, length: number,
  * @return {Array.<string>} The list of tokens.
  */
 export function htmlToTokens(html: string): Token[] {
-  let mode = 'char';
+  let mode: ParseMode = 'char';
   let currentWord = '';
   let currentAtomicTag = '';
+  let currentStyleTag = '';
   const words = [];
 
   for (const char of html) {
     switch (mode){
       case 'tag': {
         const atomicTag = isStartOfAtomicTag(currentWord);
+        const styleTag = isStartOfStyleTag(currentWord + char);
         if (atomicTag){
           mode = 'atomic_tag';
           currentAtomicTag = atomicTag;
           currentWord += char;
         } else if (isStartOfHTMLComment(currentWord)){
           mode = 'html_comment';
           currentWord += char;
+        } else if (styleTag) {
+          mode = 'style_tag';
+          currentStyleTag = styleTag;
+          currentWord = '<nobr>' + currentWord + char;
         } else if (isEndOfTag(char)){
           currentWord += '>';
           words.push(createToken(currentWord));
@@ -229,6 +272,26 @@ export function htmlToTokens(html: string): Token[] {
           mode = 'char';
         }
         break;
+      case 'style_tag':
+        if (isEndOfTag(char) && isEndOfStyleTag(currentWord, currentStyleTag)) {
+            currentWord += '>' + '</nobr>';
+            words.push(createToken(currentWord));
+            currentWord = '';
+            currentStyleTag = '';
+            mode = 'char';
+        }
+        else {
+          // break up styled blocks into individual styled words
+          if (/(\s+|&nbsp;|&#160;)/.test(char)) {
+            currentWord += '</' + currentStyleTag + '>';
+            if (currentWord) {
+                words.push(createToken(currentWord));
+            }
+            currentWord = '<' + currentStyleTag + '>';
+          }
+          currentWord += char;
+        }
+        break;
       case 'char':
         if (isStartOfTag(char)){
           if (currentWord){
@@ -273,7 +336,7 @@ export function htmlToTokens(html: string): Token[] {
         }
         break;
       default:
-        throw new Error('Unknown mode ' + mode);
+        return exhaustive(mode);
     }
   }
   if (currentWord){
@@ -330,6 +393,12 @@ function getKeyForToken(token: string){
     return `<iframe src="${iframe[1]}"></iframe>`;
   }
 
+  // Treat entire style tag as needing to be compared
+  const styleTag = styleTagsRegExp.exec(token);
+  if (styleTag) {
+      return token;
+  }
+
   // If the token is any other element, just grab the tag name.
   const tagName = /<([^\s>]+)[\s>]/.exec(token);
   if (tagName){
diff --git a/test/html_to_tokens.spec.js b/test/html_to_tokens.spec.js
@@ -33,8 +33,8 @@ describe('htmlToTokens', function(){
       res = htmlToTokens('<p>this is a <strong>test</strong></p>');
     });
 
-    it('should return 11', function(){
-      expect(res.length).to.equal(11);
+    it('should return 9', function(){
+      expect(res.length).to.equal(9);
     });
 
     it('should remove any html comments', function(){