Skip to content

Commit ed6b64d

Browse files
authored
Merge pull request #148 from Masterminds/perf
Improve performance by moving sequence matching
2 parents a48091c + 5c5634a commit ed6b64d

File tree

2 files changed

+39
-35
lines changed

2 files changed

+39
-35
lines changed

src/HTML5/Parser/Scanner.php

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,30 @@ public function __construct($data, $encoding = 'UTF-8')
6161
$this->EOF = strlen($data);
6262
}
6363

64+
/**
65+
* Check if upcoming chars match the given sequence.
66+
*
67+
* This will read the stream for the $sequence. If it's
68+
* found, this will return true. If not, return false.
69+
* Since this does not consume any chars, the caller
70+
* will still need to read the next sequence, even if
71+
* this returns true.
72+
*
73+
* Example: $this->scanner->sequenceMatches('</script>') will
74+
* see if the input stream is at the start of a
75+
* '</script>' string.
76+
*
77+
* @param string $sequence
78+
* @param bool $caseSensitive
79+
*
80+
* @return bool
81+
*/
82+
public function sequenceMatches($sequence, $caseSensitive = true)
83+
{
84+
$portion = substr($this->data, $this->char, strlen($sequence));
85+
return $caseSensitive ? $portion === $sequence : strcasecmp($portion, $sequence) === 0;
86+
}
87+
6488
/**
6589
* Get the current position.
6690
*
@@ -126,9 +150,7 @@ public function current()
126150
*/
127151
public function consume($count = 1)
128152
{
129-
for ($i = 0; $i < $count; ++ $i) {
130-
$this->next();
131-
}
153+
$this->char += $count;
132154
}
133155

134156
/**

src/HTML5/Parser/Tokenizer.php

Lines changed: 14 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ protected function rcdata($tok)
263263
$txt = '';
264264

265265
$caseSensitive = !Elements::isHtml5Element($this->untilTag);
266-
while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) {
266+
while ($tok !== false && ! ($tok == '<' && ($this->scanner->sequenceMatches($sequence, $caseSensitive)))) {
267267
if ($tok == '&') {
268268
$txt .= $this->decodeCharacterReference();
269269
$tok = $this->scanner->current();
@@ -313,12 +313,13 @@ protected function eof($tok)
313313
*/
314314
protected function characterReference()
315315
{
316-
$ref = $this->decodeCharacterReference();
317-
if ($ref !== false) {
318-
$this->buffer($ref);
319-
return true;
316+
if ($this->scanner->current() !== '&') {
317+
return false;
320318
}
321-
return false;
319+
320+
$ref = $this->decodeCharacterReference();
321+
$this->buffer($ref);
322+
return true;
322323
}
323324

324325
/**
@@ -892,7 +893,7 @@ protected function cdataSection()
892893
}
893894
$cdata .= $tok;
894895
$tok = $this->scanner->next();
895-
} while (! $this->sequenceMatches(']]>'));
896+
} while (! $this->scanner->sequenceMatches(']]>'));
896897

897898
// Consume ]]>
898899
$this->scanner->consume(3);
@@ -972,7 +973,7 @@ protected function readUntilSequence($sequence)
972973
$buffer .= $this->scanner->charsUntil($first);
973974

974975
// Stop as soon as we hit the stopping condition.
975-
if ($this->sequenceMatches($sequence, false)) {
976+
if ($this->scanner->sequenceMatches($sequence, false)) {
976977
return $buffer;
977978
}
978979
$buffer .= $this->scanner->current();
@@ -993,7 +994,7 @@ protected function readUntilSequence($sequence)
993994
* will still need to read the next sequence, even if
994995
* this returns true.
995996
*
996-
* Example: $this->sequenceMatches('</script>') will
997+
* Example: $this->scanner->sequenceMatches('</script>') will
997998
* see if the input stream is at the start of a
998999
* '</script>' string.
9991000
*
@@ -1004,22 +1005,9 @@ protected function readUntilSequence($sequence)
10041005
*/
10051006
protected function sequenceMatches($sequence, $caseSensitive = true)
10061007
{
1007-
$len = strlen($sequence);
1008-
$buffer = '';
1009-
for ($i = 0; $i < $len; ++ $i) {
1010-
$tok = $this->scanner->current();
1011-
$buffer .= $tok;
1008+
@trigger_error(__METHOD__ . ' method is deprecated since version 2.4 and will be removed in 3.0. Use Scanner::sequenceMatches() instead.', E_USER_DEPRECATED);
10121009

1013-
// EOF. Rewind and let the caller handle it.
1014-
if ($tok === false) {
1015-
$this->scanner->unconsume($i);
1016-
return false;
1017-
}
1018-
$this->scanner->next();
1019-
}
1020-
1021-
$this->scanner->unconsume($len);
1022-
return $caseSensitive ? $buffer == $sequence : strcasecmp($buffer, $sequence) === 0;
1010+
return $this->scanner->sequenceMatches($sequence, $caseSensitive);
10231011
}
10241012

10251013
/**
@@ -1079,22 +1067,16 @@ protected function parseError($msg)
10791067
/**
10801068
* Decode a character reference and return the string.
10811069
*
1082-
* Returns false if the entity could not be found. If $inAttribute is set
1083-
* to true, a bare & will be returned as-is.
1070+
* If $inAttribute is set to true, a bare & will be returned as-is.
10841071
*
10851072
* @param bool $inAttribute
10861073
* Set to true if the text is inside of an attribute value.
10871074
* false otherwise.
10881075
*
1089-
* @return bool|string
1076+
* @return string
10901077
*/
10911078
protected function decodeCharacterReference($inAttribute = false)
10921079
{
1093-
// If it fails this, it's definitely not an entity.
1094-
if ($this->scanner->current() != '&') {
1095-
return false;
1096-
}
1097-
10981080
// Next char after &.
10991081
$tok = $this->scanner->next();
11001082
$start = $this->scanner->position();

0 commit comments

Comments
 (0)