Refactored the html to word parser

Sven Hagemann · SavageTiger · commit a16330ad47ad · 2021-04-05T22:58:06.000+02:00
The old parser split the HTML into a big list of characters and then walked over that list, parsing it character by character using a big, complicated switch statement and loads of if statements.

Most of this has been replaced by regular-expression parsing, greatly simplifying the code and reducing the execution time by 98%.
diff --git a/lib/Caxy/HtmlDiff/AbstractDiff.php b/lib/Caxy/HtmlDiff/AbstractDiff.php
@@ -398,8 +398,8 @@ protected function purifyHtml($html)
 
     protected function splitInputsToWords()
     {
-        $this->setOldWords($this->convertHtmlToListOfWords($this->explode($this->oldText)));
-        $this->setNewWords($this->convertHtmlToListOfWords($this->explode($this->newText)));
+        $this->setOldWords($this->convertHtmlToListOfWords($this->oldText));
+        $this->setNewWords($this->convertHtmlToListOfWords($this->newText));
     }
 
     /**
@@ -421,146 +421,84 @@ protected function setNewWords(array $newWords)
     }
 
     /**
-     * @param string $text
-     *
-     * @return bool
+     * @return string[]
      */
-    protected function isPartOfWord($text)
+    protected function convertHtmlToListOfWords(string $text) : array
     {
-        return $this->ctypeAlphanumUnicode(str_replace($this->config->getSpecialCaseChars(), '', $text));
-    }
+        $words            = [];
+        $sentencesAndTags = [];
 
-    /**
-     * @param array $characterString
-     *
-     * @return array
-     */
-    protected function convertHtmlToListOfWords($characterString)
-    {
-        $mode = 'character';
-        $current_word = '';
-        $words = array();
-        $keepNewLines = $this->getConfig()->isKeepNewLines();
-        foreach ($characterString as $i => $character) {
-            switch ($mode) {
-                case 'character':
-                if ($this->isStartOfTag($character)) {
-                    if ($current_word != '') {
-                        $words[] = $current_word;
-                    }
-
-                    $current_word = '<';
-                    $mode = 'tag';
-                } elseif (preg_match("/\s/u", $character)) {
-                    if ($current_word !== '') {
-                        $words[] = $current_word;
-                    }
-                    $current_word = $keepNewLines ? $character : preg_replace('/\s+/Su', ' ', $character);
-                    $mode = 'whitespace';
-                } else {
-                    if (
-                        (($this->ctypeAlphanumUnicode($character) === true) && ($this->stringUtil->strlen($current_word) === 0 || $this->isPartOfWord($current_word))) ||
-                        (in_array($character, $this->config->getSpecialCaseChars()) && isset($characterString[$i + 1]) && $this->isPartOfWord($characterString[$i + 1]))
-                    ) {
-                        $current_word .= $character;
-                    } else {
-                        $words[] = $current_word;
-                        $current_word = $character;
-                    }
-                }
-                break;
-                case 'tag' :
-                if ($this->isEndOfTag($character)) {
-                    $current_word .= '>';
-                    $words[] = $current_word;
-                    $current_word = '';
-
-                    if (!preg_match('[^\s]u', $character)) {
-                        $mode = 'whitespace';
-                    } else {
-                        $mode = 'character';
-                    }
-                } else {
-                    $current_word .= $character;
-                }
-                break;
-                case 'whitespace':
-                if ($this->isStartOfTag($character)) {
-                    if ($current_word !== '') {
-                        $words[] = $current_word;
-                    }
-                    $current_word = '<';
-                    $mode = 'tag';
-                } elseif (preg_match("/\s/u", $character)) {
-                    $current_word .= $character;
-                    if (!$keepNewLines) $current_word = preg_replace('/\s+/Su', ' ', $current_word);
-                } else {
-                    if ($current_word != '') {
-                        $words[] = $current_word;
-                    }
-                    $current_word = $character;
-                    $mode = 'character';
-                }
-                break;
-                default:
-                break;
-            }
+        $specialCharacters = '';
+
+        foreach ($this->config->getSpecialCaseChars() as $char) {
+            $specialCharacters .= '\\' . $char;
         }
-        if ($current_word != '') {
-            $words[] = $current_word;
+
+        // Normalize no-break-spaces to regular spaces
+        $text = str_replace("\xc2\xa0", ' ', $text);
+
+        preg_match_all('/<.+?>|[^<]+/mu', $text, $sentencesAndTags, PREG_SPLIT_NO_EMPTY);
+
+        foreach ($sentencesAndTags[0] as $sentenceOrHtmlTag) {
+            if ($sentenceOrHtmlTag === '') {
+                continue;
+            }
+
+            if ($sentenceOrHtmlTag[0] === '<') {
+                $words[] = $sentenceOrHtmlTag;
+
+                continue;
+            }
+
+            $sentenceOrHtmlTag = $this->normalizeWhitespaceInHtmlSentence($sentenceOrHtmlTag);
+
+            $sentenceSplitIntoWords = [];
+
+            // This regex splits the sentence into words at every non-alphanumeric character. The special-case
+            // characters are allowed in the middle of a word, but not at the beginning or the end of a word.
+            // With the default config, the split regex compiles to:
+            // /\s|[\.\,\(\)\']|[a-zA-Z0-9\.\,\(\)\'\pL]+[a-zA-Z0-9\pL]|[^\s]/mu
+            $regex = sprintf('/\s|[%s]|[a-zA-Z0-9%s\pL]+[a-zA-Z0-9\pL]|[^\s]/mu', $specialCharacters, $specialCharacters);
+
+            preg_match_all(
+                $regex,
+                $sentenceOrHtmlTag . ' ', // Inject a space at the end to make sure the last word is found by having a space behind it.
+                $sentenceSplitIntoWords,
+                PREG_SPLIT_NO_EMPTY
+            );
+
+            // Remove the last space, since that was added by us for the regex matcher
+            array_pop($sentenceSplitIntoWords[0]);
+
+            foreach ($sentenceSplitIntoWords[0] as $word) {
+                $words[] = $word;
+            }
         }
 
         return $words;
     }
 
-    /**
-     * @param string $val
-     *
-     * @return bool
-     */
-    protected function isStartOfTag($val)
+    protected function normalizeWhitespaceInHtmlSentence(string $sentence) : string
     {
-        return $val === '<';
-    }
+        if ($this->config->isKeepNewLines() === true) {
+            return $sentence;
+        }
 
-    /**
-     * @param string $val
-     *
-     * @return bool
-     */
-    protected function isEndOfTag($val)
-    {
-        return $val === '>';
-    }
+        $sentence = preg_replace('/\s\s+|\r+|\n+|\r\n+/', ' ', $sentence);
 
-    /**
-     * @param string $value
-     *
-     * @return bool
-     */
-    protected function isWhiteSpace($value)
-    {
-        return !preg_match('[^\s]u', $value);
-    }
 
-    /**
-     * @param string $value
-     *
-     * @return array
-     */
-    protected function explode($value)
-    {
-        // as suggested by @onassar
-        return preg_split('//u', $value, -1, PREG_SPLIT_NO_EMPTY);
-    }
+        $sentenceLength = $this->stringUtil->strlen($sentence);
+        $firstCharacter = $this->stringUtil->substr($sentence, 0, 1);
+        $lastCharacter  = $this->stringUtil->substr($sentence, $sentenceLength -1, 1);
 
-    /**
-     * @param string $str
-     *
-     * @return bool
-     */
-    protected function ctypeAlphanumUnicode($str)
-    {
-        return preg_match("/^[a-zA-Z0-9\pL]+$/u", $str) === 1;
+        if ($firstCharacter === ' ' || $firstCharacter === "\r" || $firstCharacter === "\n") {
+            $sentence = ' ' . ltrim($sentence);
+        }
+
+        if ($sentenceLength > 1 && ($lastCharacter === ' ' || $lastCharacter === "\r" || $lastCharacter === "\n")) {
+            $sentence = rtrim($sentence) . ' ';
+        }
+
+        return $sentence;
     }
 }
diff --git a/lib/Caxy/HtmlDiff/HtmlDiffConfig.php b/lib/Caxy/HtmlDiff/HtmlDiffConfig.php
@@ -13,7 +13,7 @@ class HtmlDiffConfig
     protected $specialCaseTags = array('strong', 'b', 'i', 'big', 'small', 'u', 'sub', 'sup', 'strike', 's', 'p');
 
     /**
-     * @var array
+     * @var string[]
      */
     protected $specialCaseChars = array('.', ',', '(', ')', '\'');
 
@@ -126,18 +126,12 @@ public function setMatchThreshold($matchThreshold)
         return $this;
     }
 
-    /**
-     * @param array $chars
-     */
     public function setSpecialCaseChars(array $chars)
     {
         $this->specialCaseChars = $chars;
     }
 
-    /**
-     * @return array|null
-     */
-    public function getSpecialCaseChars()
+    public function getSpecialCaseChars() : array
     {
         return $this->specialCaseChars;
     }

Original file line number	Diff line number	Diff line change
`@@ -13,7 +13,7 @@ class HtmlDiffConfig`
`13`	`13`	`protected $specialCaseTags = array('strong', 'b', 'i', 'big', 'small', 'u', 'sub', 'sup', 'strike', 's', 'p');`
`14`	`14`
`15`	`15`	`/**`
`16`		`- * @var array`
	`16`	`+ * @var string[]`
`17`	`17`	`*/`
`18`	`18`	`protected $specialCaseChars = array('.', ',', '(', ')', '\'');`
`19`	`19`
`@@ -126,18 +126,12 @@ public function setMatchThreshold($matchThreshold)`
`126`	`126`	`return $this;`
`127`	`127`	`}`
`128`	`128`
`129`		`- /**`
`130`		`- * @param array $chars`
`131`		`- */`
`132`	`129`	`public function setSpecialCaseChars(array $chars)`
`133`	`130`	`{`
`134`	`131`	`$this->specialCaseChars = $chars;`
`135`	`132`	`}`
`136`	`133`
`137`		`- /**`
`138`		`- * @return array\|null`
`139`		`- */`
`140`		`- public function getSpecialCaseChars()`
	`134`	`+ public function getSpecialCaseChars() : array`
`141`	`135`	`{`
`142`	`136`	`return $this->specialCaseChars;`
`143`	`137`	`}`