@@ -121,14 +121,55 @@ public function setTextMode($textmode, $untilTag = null)
121121 */
122122 protected function consumeData ()
123123 {
124- // Character Ref
125- /*
126- * $this->characterReference() || $this->tagOpen() || $this->eof() || $this->characterData();
127- */
124+ // Character reference
128125 $ this ->characterReference ();
129- $ this ->tagOpen ();
130- $ this ->eof ();
131- $ this ->characterData ();
126+
127+ $ tok = $ this ->scanner ->current ();
128+
129+ // Parse tag
130+ if ($ tok === '< ' ) {
131+ // Any buffered text data can go out now.
132+ $ this ->flushBuffer ();
133+
134+ $ tok = $ this ->scanner ->next ();
135+
136+ $ this ->markupDeclaration ($ tok )
137+ || $ this ->endTag ()
138+ || $ this ->processingInstruction ()
139+ || $ this ->tagName ()
140+ // This always returns false.
141+ || $ this ->parseError ("Illegal tag opening " )
142+ || $ this ->characterData ();
143+
144+ $ tok = $ this ->scanner ->current ();
145+ }
146+
147+ // Handle end of document
148+ $ this ->eof ($ tok );
149+
150+ // Parse character
151+ if ($ tok !== false ) {
152+ switch ($ this ->textMode ) {
153+ case Elements::TEXT_RAW :
154+ $ this ->rawText ($ tok );
155+ break ;
156+
157+ case Elements::TEXT_RCDATA :
158+ $ this ->rcdata ($ tok );
159+ break ;
160+
161+ default :
162+ if (!strspn ($ tok , "<& " )) {
163+ // NULL character
164+ if ($ tok === "\00" ) {
165+ $ this ->parseError ("Received null character. " );
166+ }
167+
168+ $ this ->text .= $ tok ;
169+ $ this ->scanner ->next ();
170+ }
171+ }
172+ }
132173
133174 return $ this ->carryOn ;
134175 }
@@ -148,64 +189,78 @@ protected function characterData()
148189 }
149190 switch ($ this ->textMode ) {
150191 case Elements::TEXT_RAW :
151- return $ this ->rawText ();
192+ return $ this ->rawText ($ tok );
152193 case Elements::TEXT_RCDATA :
153- return $ this ->rcdata ();
194+ return $ this ->rcdata ($ tok );
154195 default :
155196 if (strspn ($ tok , "<& " )) {
156197 return false ;
157198 }
158- return $ this ->text ();
199+ return $ this ->text ($ tok );
159200 }
160201 }
161202
162203 /**
163204 * This buffers the current token as character data.
205+ *
206+ * @param string $tok The current token.
207+ *
208+ * @return bool
164209 */
165- protected function text ()
210+ protected function text ($ tok )
166211 {
167- $ tok = $ this ->scanner ->current ();
168-
169212 // This should never happen...
170213 if ($ tok === false ) {
171214 return false ;
172215 }
173- // Null
216+
217+ // NULL character
174218 if ($ tok === "\00" ) {
175219 $ this ->parseError ("Received null character. " );
176220 }
177- // fprintf(STDOUT, "Writing '%s'", $tok);
221+
178222 $ this ->buffer ($ tok );
179223 $ this ->scanner ->next ();
224+
180225 return true ;
181226 }
182227
183228 /**
184229 * Read text in RAW mode.
230+ *
231+ * @param string $tok The current token.
232+ *
233+ * @return bool
185234 */
186- protected function rawText ()
235+ protected function rawText ($ tok )
187236 {
188237 if (is_null ($ this ->untilTag )) {
189- return $ this ->text ();
238+ return $ this ->text ($ tok );
190239 }
240+
191241 $ sequence = '</ ' . $ this ->untilTag . '> ' ;
192242 $ txt = $ this ->readUntilSequence ($ sequence );
193243 $ this ->events ->text ($ txt );
194244 $ this ->setTextMode (0 );
245+
195246 return $ this ->endTag ();
196247 }
197248
198249 /**
199250 * Read text in RCDATA mode.
251+ *
252+ * @param string $tok The current token.
253+ *
254+ * @return bool
200255 */
201- protected function rcdata ()
256+ protected function rcdata ($ tok )
202257 {
203258 if (is_null ($ this ->untilTag )) {
204- return $ this ->text ();
259+ return $ this ->text ($ tok );
205260 }
261+
206262 $ sequence = '</ ' . $ this ->untilTag ;
207263 $ txt = '' ;
208- $ tok = $ this ->scanner ->current ();
209264
210265 $ caseSensitive = !Elements::isHtml5Element ($ this ->untilTag );
211266 while ($ tok !== false && ! ($ tok == '< ' && ($ this ->sequenceMatches ($ sequence , $ caseSensitive )))) {
@@ -223,24 +278,28 @@ protected function rcdata()
223278 if ($ this ->scanner ->current () !== '> ' ) {
224279 $ this ->parseError ("Unclosed RCDATA end tag " );
225280 }
281+
226282 $ this ->scanner ->unconsume ($ len );
227283 $ this ->events ->text ($ txt );
228284 $ this ->setTextMode (0 );
285+
229286 return $ this ->endTag ();
230287 }
231288
232289 /**
233290 * If the document is read, emit an EOF event.
234291 */
235- protected function eof ()
292+ protected function eof ($ tok )
236293 {
237- if ($ this -> scanner -> current () === false ) {
294+ if ($ tok === false ) {
238295 // fprintf(STDOUT, "EOF");
239296 $ this ->flushBuffer ();
240297 $ this ->events ->eof ();
241298 $ this ->carryOn = false ;
299+
242300 return true ;
243301 }
302+
244303 return false ;
245304 }
246305
@@ -262,33 +321,12 @@ protected function characterReference()
262321 return false ;
263322 }
264323
265- /**
266- * Emit a tagStart event on encountering a tag.
267- *
268- * 8.2.4.8
269- */
270- protected function tagOpen ()
271- {
272- if ($ this ->scanner ->current () != '< ' ) {
273- return false ;
274- }
275-
276- // Any buffered text data can go out now.
277- $ this ->flushBuffer ();
278-
279- $ this ->scanner ->next ();
280-
281- return $ this ->markupDeclaration () || $ this ->endTag () || $ this ->processingInstruction () || $ this ->tagName () ||
282- /* This always returns false. */
283- $ this ->parseError ("Illegal tag opening " ) || $ this ->characterData ();
284- }
285-
286324 /**
287325 * Look for markup.
288326 */
289- protected function markupDeclaration ()
327+ protected function markupDeclaration ($ tok )
290328 {
291- if ($ this -> scanner -> current () != '! ' ) {
329+ if ($ tok != '! ' ) {
292330 return false ;
293331 }
294332
@@ -343,8 +381,9 @@ protected function endTag()
343381 // Trash whitespace.
344382 $ this ->scanner ->whitespace ();
345383
346- if ($ this ->scanner ->current () != '> ' ) {
347- $ this ->parseError ("Expected >, got '%s' " , $ this ->scanner ->current ());
384+ $ tok = $ this ->scanner ->current ();
385+ if ($ tok != '> ' ) {
386+ $ this ->parseError ("Expected >, got '%s' " , $ tok );
348387 // We just trash stuff until we get to the next tag close.
349388 $ this ->scanner ->charsUntil ('> ' );
350389 }
@@ -456,10 +495,11 @@ protected function attribute(&$attributes)
456495 $ name = strtolower ($ this ->scanner ->charsUntil ("/>= \n\f\t " ));
457496
458497 if (strlen ($ name ) == 0 ) {
459- $ this ->parseError ("Expected an attribute name, got %s. " , $ this ->scanner ->current ());
498+ $ tok = $ this ->scanner ->current ();
499+ $ this ->parseError ("Expected an attribute name, got %s. " , $ tok );
460500 // Really, only '=' can be the char here. Everything else gets absorbed
461501 // under one rule or another.
462- $ name = $ this -> scanner -> current () ;
502+ $ name = $ tok ;
463503 $ this ->scanner ->next ();
464504 }
465505
@@ -556,7 +596,7 @@ protected function quotedAttributeValue($quote)
556596
557597 $ tok = $ this ->scanner ->current ();
558598 if ($ tok == '& ' ) {
559- $ val .= $ this ->decodeCharacterReference (true , $ tok );
599+ $ val .= $ this ->decodeCharacterReference (true );
560600 continue ;
561601 }
562602 break ;
@@ -714,7 +754,7 @@ protected function doctype()
714754 // EOF: die.
715755 if ($ tok === false ) {
716756 $ this ->events ->doctype ('html5 ' , EventHandler::DOCTYPE_NONE , '' , true );
717- return $ this ->eof ();
757+ return $ this ->eof ($ tok );
718758 }
719759
720760 // NULL char: convert.
@@ -1032,6 +1072,7 @@ protected function parseError($msg)
10321072 $ line = $ this ->scanner ->currentLine ();
10331073 $ col = $ this ->scanner ->columnOffset ();
10341074 $ this ->events ->parseError ($ msg , $ line , $ col );
1075+
10351076 return false ;
10361077 }
10371078
@@ -1049,7 +1090,6 @@ protected function parseError($msg)
10491090 */
10501091 protected function decodeCharacterReference ($ inAttribute = false )
10511092 {
1052-
10531093 // If it fails this, it's definitely not an entity.
10541094 if ($ this ->scanner ->current () != '& ' ) {
10551095 return false ;
0 commit comments