Skip to content

Commit 5f24007

Browse files
author
Sven Hagemann
committed
Simplyfied whitespace checking
While finding blocks we have todo a whitespace check of part of the old sentence, this is done loads of times. This is done by refrerencing the oldWords array, and temporarely making a string from part of that array, and then checking if that string is only whitespace. I have replaced this by a loop that iterates over the part of the sentence item by item, and when one of the items is not a space (usually the first item), it immediately reports false and caching the result, speeding up the algorithm in some cases by up to 50%
1 parent ad70b4f commit 5f24007

File tree

1 file changed

+59
-72
lines changed

1 file changed

+59
-72
lines changed

lib/Caxy/HtmlDiff/HtmlDiff.php

Lines changed: 59 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -734,26 +734,24 @@ protected function matchingBlocks()
734734
}
735735

736736
/**
737-
* @param int $startInOld
738-
* @param int $endInOld
739-
* @param int $startInNew
740-
* @param int $endInNew
741-
* @param array $matchingBlocks
737+
* @param MatchingBlock[] $matchingBlocks
742738
*/
743-
protected function findMatchingBlocks($startInOld, $endInOld, $startInNew, $endInNew, &$matchingBlocks)
739+
protected function findMatchingBlocks(int $startInOld, int $endInOld, int $startInNew, int $endInNew, array &$matchingBlocks) : void
744740
{
745741
$match = $this->findMatch($startInOld, $endInOld, $startInNew, $endInNew);
746742

747-
if ($match !== null) {
748-
if ($startInOld < $match->startInOld && $startInNew < $match->startInNew) {
749-
$this->findMatchingBlocks($startInOld, $match->startInOld, $startInNew, $match->startInNew, $matchingBlocks);
750-
}
743+
if ($match === null) {
744+
return;
745+
}
751746

752-
$matchingBlocks[] = $match;
747+
if ($startInOld < $match->startInOld && $startInNew < $match->startInNew) {
748+
$this->findMatchingBlocks($startInOld, $match->startInOld, $startInNew, $match->startInNew, $matchingBlocks);
749+
}
753750

754-
if ($match->endInOld() < $endInOld && $match->endInNew() < $endInNew) {
755-
$this->findMatchingBlocks($match->endInOld(), $endInOld, $match->endInNew(), $endInNew, $matchingBlocks);
756-
}
751+
$matchingBlocks[] = $match;
752+
753+
if ($match->endInOld() < $endInOld && $match->endInNew() < $endInNew) {
754+
$this->findMatchingBlocks($match->endInOld(), $endInOld, $match->endInNew(), $endInNew, $matchingBlocks);
757755
}
758756
}
759757

@@ -766,70 +764,71 @@ protected function stripTagAttributes($word)
766764
{
767765
$space = $this->stringUtil->strpos($word, ' ', 1);
768766

769-
if ($space) {
767+
if ($space > 0) {
770768
return '<' . $this->stringUtil->substr($word, 1, $space) . '>';
771769
}
772770

773771
return trim($word, '<>');
774772
}
775773

776-
/**
777-
* @param int $startInOld
778-
* @param int $endInOld
779-
* @param int $startInNew
780-
* @param int $endInNew
781-
*
782-
* @return MatchingBlock|null
783-
*/
784-
protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
774+
protected function findMatch(int $startInOld, int $endInOld, int $startInNew, int $endInNew) : ?MatchingBlock
785775
{
786776
$groupDiffs = $this->isGroupDiffs();
787777
$bestMatchInOld = $startInOld;
788778
$bestMatchInNew = $startInNew;
789779
$bestMatchSize = 0;
790-
$matchLengthAt = array();
780+
$matchLengthAt = [];
791781

792782
for ($indexInOld = $startInOld; $indexInOld < $endInOld; ++$indexInOld) {
793-
$newMatchLengthAt = array();
783+
$newMatchLengthAt = [];
784+
794785
$index = $this->oldWords[ $indexInOld ];
795-
if ($this->isTag($index)) {
786+
787+
if ($this->isTag($index) === true) {
796788
$index = $this->stripTagAttributes($index);
797789
}
798-
if (!isset($this->wordIndices[ $index ])) {
790+
791+
if (isset($this->wordIndices[$index]) === false) {
799792
$matchLengthAt = $newMatchLengthAt;
793+
800794
continue;
801795
}
802-
foreach ($this->wordIndices[ $index ] as $indexInNew) {
796+
797+
foreach ($this->wordIndices[$index] as $indexInNew) {
803798
if ($indexInNew < $startInNew) {
804799
continue;
805800
}
801+
806802
if ($indexInNew >= $endInNew) {
807803
break;
808804
}
809805

810-
$newMatchLength = (isset($matchLengthAt[ $indexInNew - 1 ]) ? $matchLengthAt[ $indexInNew - 1 ] : 0) + 1;
811-
$newMatchLengthAt[ $indexInNew ] = $newMatchLength;
806+
$newMatchLength =
807+
(isset($matchLengthAt[$indexInNew - 1]) === true ? ($matchLengthAt[$indexInNew - 1] + 1) : 1);
808+
809+
$newMatchLengthAt[$indexInNew] = $newMatchLength;
812810

813811
if ($newMatchLength > $bestMatchSize ||
814812
(
815-
$groupDiffs &&
813+
$groupDiffs === true &&
816814
$bestMatchSize > 0 &&
817-
$this->isOnlyWhitespace($this->array_slice_cached($this->oldWords, $bestMatchInOld, $bestMatchSize))
815+
$this->oldTextIsOnlyWhitespace($bestMatchInOld, $bestMatchSize) === true
818816
)
819817
) {
820818
$bestMatchInOld = $indexInOld - $newMatchLength + 1;
821819
$bestMatchInNew = $indexInNew - $newMatchLength + 1;
822-
$bestMatchSize = $newMatchLength;
820+
$bestMatchSize = $newMatchLength;
823821
}
824822
}
823+
825824
$matchLengthAt = $newMatchLengthAt;
826825
}
827826

828827
// Skip match if none found or match consists only of whitespace
829828
if ($bestMatchSize !== 0 &&
830829
(
831-
!$groupDiffs ||
832-
!$this->isOnlyWhitespace($this->array_slice_cached($this->oldWords, $bestMatchInOld, $bestMatchSize))
830+
$groupDiffs === false ||
831+
$this->oldTextIsOnlyWhitespace($bestMatchInOld, $bestMatchSize) === false
833832
)
834833
) {
835834
return new MatchingBlock($bestMatchInOld, $bestMatchInNew, $bestMatchSize);
@@ -838,40 +837,16 @@ protected function findMatch($startInOld, $endInOld, $startInNew, $endInNew)
838837
return null;
839838
}
840839

841-
/**
842-
* @param string $str
843-
*
844-
* @return bool
845-
*/
846-
protected function isOnlyWhitespace($str)
840+
protected function oldTextIsOnlyWhitespace(int $startingAtWord, int $wordCount) : bool
847841
{
848-
// Slightly faster then using preg_match
849-
return $str !== '' && trim($str) === '';
850-
}
842+
$isWhitespace = true;
851843

852-
/**
853-
* Special array_slice function that caches its last request.
854-
*
855-
* The diff algorithm seems to request the same information many times in a row.
856-
* by returning the previous answer the algorithm preforms way faster.
857-
*
858-
* The result is a string instead of an array, this way we safe on the amount of
859-
* memory intensive implode() calls.
860-
*
861-
* @param array &$array
862-
* @param integer $offset
863-
* @param integer|null $length
864-
*
865-
* @return string
866-
*/
867-
protected function array_slice_cached(&$array, $offset, $length = null)
868-
{
869-
static $lastOffset = null;
870-
static $lastLength = null;
871-
static $cache = null;
844+
// oldTextIsWhitespace get called consecutively by findMatch, with the same parameters.
845+
// by caching the previous result, we speed up the algorithm by more then 50%
846+
static $lastStartingWordOffset = null;
847+
static $lastWordCount = null;
848+
static $cache = null;
872849

873-
// PHP has no support for by-reference comparing.
874-
// to prevent false positive hits, reset the cache when the oldWords or newWords is changed.
875850
if ($this->resetCache === true) {
876851
$cache = null;
877852

@@ -880,16 +855,28 @@ protected function array_slice_cached(&$array, $offset, $length = null)
880855

881856
if (
882857
$cache !== null &&
883-
$lastLength === $length &&
884-
$lastOffset === $offset
858+
$lastWordCount === $wordCount &&
859+
$lastStartingWordOffset === $startingAtWord
885860
) { // Hit
886861
return $cache;
887862
} // Miss
888863

889-
$lastOffset = $offset;
890-
$lastLength = $length;
864+
for ($index = $startingAtWord; $index < ($startingAtWord + $wordCount); $index++) {
865+
// Assigning the oldWord to a variable is slightly faster then searching by reference twice
866+
// in the if statement
867+
$oldWord = $this->oldWords[$index];
868+
869+
if ($oldWord !== '' && trim($oldWord) !== '') {
870+
$isWhitespace = false;
871+
872+
break;
873+
}
874+
}
875+
876+
$lastWordCount = $wordCount;
877+
$lastStartingWordOffset = $startingAtWord;
891878

892-
$cache = implode('', array_slice($array, $offset, $length));
879+
$cache = $isWhitespace;
893880

894881
return $cache;
895882
}

0 commit comments

Comments
 (0)