html diff support style tags

SuttonKyle · SuttonKyle · commit e817cc331ea6 · 2021-12-21T16:00:01.000-05:00
diff --git a/src/htmldiff.ts b/src/htmldiff.ts
@@ -95,6 +95,38 @@ function isEndOfAtomicTag(word: string, tag: string){
   return word.substring(word.length - tag.length - 2) === ('</' + tag);
 }
 
+const styleTagsRegExp = /^<(strong|em)/;
+
+/**
+ * Checks if the current word is the beginning of an style tag. An style tag is one whose
+ * child nodes should be compared, but the entire tag should be treated as one token. This
+ * is useful for tags where it does not make sense to insert <ins> and <del> tags.
+ *
+ * @param {string} word The characters of the current token read so far.
+ *
+ * @return {string|null} The name of the atomic tag if the word will be an atomic tag,
+ *    null otherwise
+ */
+
+function isStartOfStyleTag(word: string) {
+    var result = styleTagsRegExp.exec(word);
+    return result && result[1];
+}
+
+/**
+ * Checks if the current word is the end of an style tag (i.e. it has all the characters,
+ * except for the end bracket of the closing tag, such as '<strong></strong').
+ *
+ * @param {string} word The characters of the current token read so far.
+ * @param {string} tag The ending tag to look for.
+ *
+ * @return {boolean} True if the word is now a complete token (including the end tag),
+ *    false otherwise.
+ */
+function isEndOfStyleTag(word: string, tag: string) {
+    return word.substring(word.length - tag.length - 2) === ('</' + tag);
+}
+
 /**
  * Checks if a tag is a void tag.
  *
@@ -184,19 +216,25 @@ export function htmlToTokens(html: string): Token[] {
   let mode = 'char';
   let currentWord = '';
   let currentAtomicTag = '';
+  let currentStyleTag = '';
   const words = [];
 
   for (const char of html) {
     switch (mode){
       case 'tag': {
         const atomicTag = isStartOfAtomicTag(currentWord);
+        const styleTag = isStartOfStyleTag(currentWord);
         if (atomicTag){
           mode = 'atomic_tag';
           currentAtomicTag = atomicTag;
           currentWord += char;
         } else if (isStartOfHTMLComment(currentWord)){
           mode = 'html_comment';
           currentWord += char;
+        } else if (styleTag) {
+          mode = 'style_tag';
+          currentStyleTag = styleTag;
+          currentWord = '<nobr>' + currentWord + char;
         } else if (isEndOfTag(char)){
           currentWord += '>';
           words.push(createToken(currentWord));
@@ -229,6 +267,26 @@ export function htmlToTokens(html: string): Token[] {
           mode = 'char';
         }
         break;
+      case 'style_tag':
+        if (isEndOfTag(char) && isEndOfStyleTag(currentWord, currentStyleTag)) {
+            currentWord += '>' + '</nobr>';
+            words.push(createToken(currentWord));
+            currentWord = '';
+            currentStyleTag = '';
+            mode = 'char';
+        }
+        else {
+          // break up styled blocks into individual styled words
+          if (/(\s+|&nbsp;|&#160;)/.test(char)) {
+            currentWord += '</' + currentStyleTag + '>';
+            if (currentWord) {
+                words.push(createToken(currentWord));
+            }
+            currentWord = '<' + currentStyleTag + '>';
+          }
+          currentWord += char;
+        }
+        break;
       case 'char':
         if (isStartOfTag(char)){
           if (currentWord){
@@ -330,6 +388,12 @@ function getKeyForToken(token: string){
     return `<iframe src="${iframe[1]}"></iframe>`;
   }
 
+  // Treat entire style tag as needing to be compared
+  var styleTag = /^<(strong|em)[\s>]/.exec(token);
+  if (styleTag) {
+      return token;
+  }
+
   // If the token is any other element, just grab the tag name.
   const tagName = /<([^\s>]+)[\s>]/.exec(token);
   if (tagName){