Skip to content

Commit a16330a

Browse files
Sven Hagemann authored and SavageTiger committed
Refactored the HTML-to-word parser
The old parser split the HTML up into a long list of individual characters and then walked over that list, parsing it character by character with a big, complicated switch statement and loads of if statements. Most of this has been replaced by regular-expression parsing, which greatly simplifies the code and reduces the execution time by 98%.
1 parent 5f24007 commit a16330a

File tree

2 files changed

+69
-137
lines changed

2 files changed

+69
-137
lines changed

lib/Caxy/HtmlDiff/AbstractDiff.php

Lines changed: 67 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -398,8 +398,8 @@ protected function purifyHtml($html)
398398

399399
protected function splitInputsToWords()
400400
{
401-
$this->setOldWords($this->convertHtmlToListOfWords($this->explode($this->oldText)));
402-
$this->setNewWords($this->convertHtmlToListOfWords($this->explode($this->newText)));
401+
$this->setOldWords($this->convertHtmlToListOfWords($this->oldText));
402+
$this->setNewWords($this->convertHtmlToListOfWords($this->newText));
403403
}
404404

405405
/**
@@ -421,146 +421,84 @@ protected function setNewWords(array $newWords)
421421
}
422422

423423
/**
424-
* @param string $text
425-
*
426-
* @return bool
424+
* @return string[]
427425
*/
428-
protected function isPartOfWord($text)
426+
protected function convertHtmlToListOfWords(string $text) : array
429427
{
430-
return $this->ctypeAlphanumUnicode(str_replace($this->config->getSpecialCaseChars(), '', $text));
431-
}
428+
$words = [];
429+
$sentencesAndTags = [];
432430

433-
/**
434-
* @param array $characterString
435-
*
436-
* @return array
437-
*/
438-
protected function convertHtmlToListOfWords($characterString)
439-
{
440-
$mode = 'character';
441-
$current_word = '';
442-
$words = array();
443-
$keepNewLines = $this->getConfig()->isKeepNewLines();
444-
foreach ($characterString as $i => $character) {
445-
switch ($mode) {
446-
case 'character':
447-
if ($this->isStartOfTag($character)) {
448-
if ($current_word != '') {
449-
$words[] = $current_word;
450-
}
451-
452-
$current_word = '<';
453-
$mode = 'tag';
454-
} elseif (preg_match("/\s/u", $character)) {
455-
if ($current_word !== '') {
456-
$words[] = $current_word;
457-
}
458-
$current_word = $keepNewLines ? $character : preg_replace('/\s+/Su', ' ', $character);
459-
$mode = 'whitespace';
460-
} else {
461-
if (
462-
(($this->ctypeAlphanumUnicode($character) === true) && ($this->stringUtil->strlen($current_word) === 0 || $this->isPartOfWord($current_word))) ||
463-
(in_array($character, $this->config->getSpecialCaseChars()) && isset($characterString[$i + 1]) && $this->isPartOfWord($characterString[$i + 1]))
464-
) {
465-
$current_word .= $character;
466-
} else {
467-
$words[] = $current_word;
468-
$current_word = $character;
469-
}
470-
}
471-
break;
472-
case 'tag' :
473-
if ($this->isEndOfTag($character)) {
474-
$current_word .= '>';
475-
$words[] = $current_word;
476-
$current_word = '';
477-
478-
if (!preg_match('[^\s]u', $character)) {
479-
$mode = 'whitespace';
480-
} else {
481-
$mode = 'character';
482-
}
483-
} else {
484-
$current_word .= $character;
485-
}
486-
break;
487-
case 'whitespace':
488-
if ($this->isStartOfTag($character)) {
489-
if ($current_word !== '') {
490-
$words[] = $current_word;
491-
}
492-
$current_word = '<';
493-
$mode = 'tag';
494-
} elseif (preg_match("/\s/u", $character)) {
495-
$current_word .= $character;
496-
if (!$keepNewLines) $current_word = preg_replace('/\s+/Su', ' ', $current_word);
497-
} else {
498-
if ($current_word != '') {
499-
$words[] = $current_word;
500-
}
501-
$current_word = $character;
502-
$mode = 'character';
503-
}
504-
break;
505-
default:
506-
break;
507-
}
431+
$specialCharacters = '';
432+
433+
foreach ($this->config->getSpecialCaseChars() as $char) {
434+
$specialCharacters .= '\\' . $char;
508435
}
509-
if ($current_word != '') {
510-
$words[] = $current_word;
436+
437+
// Normalize no-break-spaces to regular spaces
438+
$text = str_replace("\xc2\xa0", ' ', $text);
439+
440+
preg_match_all('/<.+?>|[^<]+/mu', $text, $sentencesAndTags, PREG_SPLIT_NO_EMPTY);
441+
442+
foreach ($sentencesAndTags[0] as $sentenceOrHtmlTag) {
443+
if ($sentenceOrHtmlTag === '') {
444+
continue;
445+
}
446+
447+
if ($sentenceOrHtmlTag[0] === '<') {
448+
$words[] = $sentenceOrHtmlTag;
449+
450+
continue;
451+
}
452+
453+
$sentenceOrHtmlTag = $this->normalizeWhitespaceInHtmlSentence($sentenceOrHtmlTag);
454+
455+
$sentenceSplitIntoWords = [];
456+
457+
// This regex splits up every word by separating it at every non alpha-numerical, it allows the specialChars
458+
// in the middle of a word, but not at the beginning or the end of a word.
459+
// Split regex compiles to this (in default config case);
460+
// /\s|[\.\,\(\)\']|[a-zA-Z0-9\.\,\(\)'\pL]+[a-zA-Z0-9\pL]|[^\s]/mu
461+
$regex = sprintf('/\s|[%s]|[a-zA-Z0-9%s\pL]+[a-zA-Z0-9\pL]|[^\s]/mu', $specialCharacters, $specialCharacters);
462+
463+
preg_match_all(
464+
$regex,
465+
$sentenceOrHtmlTag . ' ', // Inject a space at the end to make sure the last word is found by having a space behind it.
466+
$sentenceSplitIntoWords,
467+
PREG_SPLIT_NO_EMPTY
468+
);
469+
470+
// Remove the last space, since that was added by us for the regex matcher
471+
array_pop($sentenceSplitIntoWords[0]);
472+
473+
foreach ($sentenceSplitIntoWords[0] as $word) {
474+
$words[] = $word;
475+
}
511476
}
512477

513478
return $words;
514479
}
515480

516-
/**
517-
* @param string $val
518-
*
519-
* @return bool
520-
*/
521-
protected function isStartOfTag($val)
481+
protected function normalizeWhitespaceInHtmlSentence(string $sentence) : string
522482
{
523-
return $val === '<';
524-
}
483+
if ($this->config->isKeepNewLines() === true) {
484+
return $sentence;
485+
}
525486

526-
/**
527-
* @param string $val
528-
*
529-
* @return bool
530-
*/
531-
protected function isEndOfTag($val)
532-
{
533-
return $val === '>';
534-
}
487+
$sentence = preg_replace('/\s\s+|\r+|\n+|\r\n+/', ' ', $sentence);
535488

536-
/**
537-
* @param string $value
538-
*
539-
* @return bool
540-
*/
541-
protected function isWhiteSpace($value)
542-
{
543-
return !preg_match('[^\s]u', $value);
544-
}
545489

546-
/**
547-
* @param string $value
548-
*
549-
* @return array
550-
*/
551-
protected function explode($value)
552-
{
553-
// as suggested by @onassar
554-
return preg_split('//u', $value, -1, PREG_SPLIT_NO_EMPTY);
555-
}
490+
$sentenceLength = $this->stringUtil->strlen($sentence);
491+
$firstCharacter = $this->stringUtil->substr($sentence, 0, 1);
492+
$lastCharacter = $this->stringUtil->substr($sentence, $sentenceLength -1, 1);
556493

557-
/**
558-
* @param string $str
559-
*
560-
* @return bool
561-
*/
562-
protected function ctypeAlphanumUnicode($str)
563-
{
564-
return preg_match("/^[a-zA-Z0-9\pL]+$/u", $str) === 1;
494+
if ($firstCharacter === ' ' || $firstCharacter === "\r" || $firstCharacter === "\n") {
495+
$sentence = ' ' . ltrim($sentence);
496+
}
497+
498+
if ($sentenceLength > 1 && ($lastCharacter === ' ' || $lastCharacter === "\r" || $lastCharacter === "\n")) {
499+
$sentence = rtrim($sentence) . ' ';
500+
}
501+
502+
return $sentence;
565503
}
566504
}

lib/Caxy/HtmlDiff/HtmlDiffConfig.php

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ class HtmlDiffConfig
1313
protected $specialCaseTags = array('strong', 'b', 'i', 'big', 'small', 'u', 'sub', 'sup', 'strike', 's', 'p');
1414

1515
/**
16-
* @var array
16+
* @var string[]
1717
*/
1818
protected $specialCaseChars = array('.', ',', '(', ')', '\'');
1919

@@ -126,18 +126,12 @@ public function setMatchThreshold($matchThreshold)
126126
return $this;
127127
}
128128

129-
/**
130-
* @param array $chars
131-
*/
132129
public function setSpecialCaseChars(array $chars)
133130
{
134131
$this->specialCaseChars = $chars;
135132
}
136133

137-
/**
138-
* @return array|null
139-
*/
140-
public function getSpecialCaseChars()
134+
public function getSpecialCaseChars() : array
141135
{
142136
return $this->specialCaseChars;
143137
}

0 commit comments

Comments
 (0)