Skip to content

Commit a48091c

Browse files
authored
Merge pull request #147 from tgalopin/tokenizer-perfs
Improve the Tokenizer performance
2 parents 563687a + 7ac198d commit a48091c

File tree

1 file changed

+92
-52
lines changed

1 file changed

+92
-52
lines changed

src/HTML5/Parser/Tokenizer.php

Lines changed: 92 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -121,14 +121,55 @@ public function setTextMode($textmode, $untilTag = null)
121121
*/
122122
protected function consumeData()
123123
{
124-
// Character Ref
125-
/*
126-
* $this->characterReference() || $this->tagOpen() || $this->eof() || $this->characterData();
127-
*/
124+
// Character reference
128125
$this->characterReference();
129-
$this->tagOpen();
130-
$this->eof();
131-
$this->characterData();
126+
127+
$tok = $this->scanner->current();
128+
129+
// Parse tag
130+
if ($tok === '<') {
131+
// Any buffered text data can go out now.
132+
$this->flushBuffer();
133+
134+
$tok = $this->scanner->next();
135+
136+
$this->markupDeclaration($tok)
137+
|| $this->endTag()
138+
|| $this->processingInstruction()
139+
|| $this->tagName()
140+
// This always returns false.
141+
|| $this->parseError("Illegal tag opening")
142+
|| $this->characterData();
143+
144+
$tok = $this->scanner->current();
145+
}
146+
147+
// Handle end of document
148+
$this->eof($tok);
149+
150+
// Parse character
151+
if ($tok !== false) {
152+
switch ($this->textMode) {
153+
case Elements::TEXT_RAW:
154+
$this->rawText($tok);
155+
break;
156+
157+
case Elements::TEXT_RCDATA:
158+
$this->rcdata($tok);
159+
break;
160+
161+
default:
162+
if (!strspn($tok, "<&")) {
163+
// NULL character
164+
if ($tok === "\00") {
165+
$this->parseError("Received null character.");
166+
}
167+
168+
$this->text .= $tok;
169+
$this->scanner->next();
170+
}
171+
}
172+
}
132173

133174
return $this->carryOn;
134175
}
@@ -148,64 +189,78 @@ protected function characterData()
148189
}
149190
switch ($this->textMode) {
150191
case Elements::TEXT_RAW:
151-
return $this->rawText();
192+
return $this->rawText($tok);
152193
case Elements::TEXT_RCDATA:
153-
return $this->rcdata();
194+
return $this->rcdata($tok);
154195
default:
155196
if (strspn($tok, "<&")) {
156197
return false;
157198
}
158-
return $this->text();
199+
return $this->text($tok);
159200
}
160201
}
161202

162203
/**
163204
* This buffers the current token as character data.
205+
*
206+
* @param string $tok The current token.
207+
*
208+
* @return bool
164209
*/
165-
protected function text()
210+
protected function text($tok)
166211
{
167-
$tok = $this->scanner->current();
168-
169212
// This should never happen...
170213
if ($tok === false) {
171214
return false;
172215
}
173-
// Null
216+
217+
// NULL character
174218
if ($tok === "\00") {
175219
$this->parseError("Received null character.");
176220
}
177-
// fprintf(STDOUT, "Writing '%s'", $tok);
221+
178222
$this->buffer($tok);
179223
$this->scanner->next();
224+
180225
return true;
181226
}
182227

183228
/**
184229
* Read text in RAW mode.
230+
*
231+
* @param string $tok The current token.
232+
*
233+
* @return bool
185234
*/
186-
protected function rawText()
235+
protected function rawText($tok)
187236
{
188237
if (is_null($this->untilTag)) {
189-
return $this->text();
238+
return $this->text($tok);
190239
}
240+
191241
$sequence = '</' . $this->untilTag . '>';
192242
$txt = $this->readUntilSequence($sequence);
193243
$this->events->text($txt);
194244
$this->setTextMode(0);
245+
195246
return $this->endTag();
196247
}
197248

198249
/**
199250
* Read text in RCDATA mode.
251+
*
252+
* @param string $tok The current token.
253+
*
254+
* @return bool
200255
*/
201-
protected function rcdata()
256+
protected function rcdata($tok)
202257
{
203258
if (is_null($this->untilTag)) {
204-
return $this->text();
259+
return $this->text($tok);
205260
}
261+
206262
$sequence = '</' . $this->untilTag;
207263
$txt = '';
208-
$tok = $this->scanner->current();
209264

210265
$caseSensitive = !Elements::isHtml5Element($this->untilTag);
211266
while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) {
@@ -223,24 +278,28 @@ protected function rcdata()
223278
if ($this->scanner->current() !== '>') {
224279
$this->parseError("Unclosed RCDATA end tag");
225280
}
281+
226282
$this->scanner->unconsume($len);
227283
$this->events->text($txt);
228284
$this->setTextMode(0);
285+
229286
return $this->endTag();
230287
}
231288

232289
/**
233290
* If the document is read, emit an EOF event.
234291
*/
235-
protected function eof()
292+
protected function eof($tok)
236293
{
237-
if ($this->scanner->current() === false) {
294+
if ($tok === false) {
238295
// fprintf(STDOUT, "EOF");
239296
$this->flushBuffer();
240297
$this->events->eof();
241298
$this->carryOn = false;
299+
242300
return true;
243301
}
302+
244303
return false;
245304
}
246305

@@ -262,33 +321,12 @@ protected function characterReference()
262321
return false;
263322
}
264323

265-
/**
266-
* Emit a tagStart event on encountering a tag.
267-
*
268-
* 8.2.4.8
269-
*/
270-
protected function tagOpen()
271-
{
272-
if ($this->scanner->current() != '<') {
273-
return false;
274-
}
275-
276-
// Any buffered text data can go out now.
277-
$this->flushBuffer();
278-
279-
$this->scanner->next();
280-
281-
return $this->markupDeclaration() || $this->endTag() || $this->processingInstruction() || $this->tagName() ||
282-
/* This always returns false. */
283-
$this->parseError("Illegal tag opening") || $this->characterData();
284-
}
285-
286324
/**
287325
* Look for markup.
288326
*/
289-
protected function markupDeclaration()
327+
protected function markupDeclaration($tok)
290328
{
291-
if ($this->scanner->current() != '!') {
329+
if ($tok != '!') {
292330
return false;
293331
}
294332

@@ -343,8 +381,9 @@ protected function endTag()
343381
// Trash whitespace.
344382
$this->scanner->whitespace();
345383

346-
if ($this->scanner->current() != '>') {
347-
$this->parseError("Expected >, got '%s'", $this->scanner->current());
384+
$tok = $this->scanner->current();
385+
if ($tok != '>') {
386+
$this->parseError("Expected >, got '%s'", $tok);
348387
// We just trash stuff until we get to the next tag close.
349388
$this->scanner->charsUntil('>');
350389
}
@@ -456,10 +495,11 @@ protected function attribute(&$attributes)
456495
$name = strtolower($this->scanner->charsUntil("/>=\n\f\t "));
457496

458497
if (strlen($name) == 0) {
459-
$this->parseError("Expected an attribute name, got %s.", $this->scanner->current());
498+
$tok = $this->scanner->current();
499+
$this->parseError("Expected an attribute name, got %s.", $tok);
460500
// Really, only '=' can be the char here. Everything else gets absorbed
461501
// under one rule or another.
462-
$name = $this->scanner->current();
502+
$name = $tok;
463503
$this->scanner->next();
464504
}
465505

@@ -556,7 +596,7 @@ protected function quotedAttributeValue($quote)
556596

557597
$tok = $this->scanner->current();
558598
if ($tok == '&') {
559-
$val .= $this->decodeCharacterReference(true, $tok);
599+
$val .= $this->decodeCharacterReference(true);
560600
continue;
561601
}
562602
break;
@@ -714,7 +754,7 @@ protected function doctype()
714754
// EOF: die.
715755
if ($tok === false) {
716756
$this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true);
717-
return $this->eof();
757+
return $this->eof($tok);
718758
}
719759

720760
// NULL char: convert.
@@ -1032,6 +1072,7 @@ protected function parseError($msg)
10321072
$line = $this->scanner->currentLine();
10331073
$col = $this->scanner->columnOffset();
10341074
$this->events->parseError($msg, $line, $col);
1075+
10351076
return false;
10361077
}
10371078

@@ -1049,7 +1090,6 @@ protected function parseError($msg)
10491090
*/
10501091
protected function decodeCharacterReference($inAttribute = false)
10511092
{
1052-
10531093
// If it fails this, it's definitely not an entity.
10541094
if ($this->scanner->current() != '&') {
10551095
return false;

0 commit comments

Comments
 (0)