@@ -843,6 +843,12 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ): bool {
843843
844844 if ( self ::PROCESS_NEXT_NODE === $ node_to_process ) {
845845 parent ::next_token ();
846+ if (
847+ WP_HTML_Tag_Processor::STATE_TEXT_NODE === $ this ->parser_state ||
848+ WP_HTML_Tag_Processor::STATE_CDATA_NODE === $ this ->parser_state
849+ ) {
850+ parent ::subdivide_text_appropriately ();
851+ }
846852 }
847853
848854 // Finish stepping when there are no more tokens in the document.
@@ -1056,8 +1062,7 @@ private function step_initial(): bool {
10561062 * Parse error: ignore the token.
10571063 */
10581064 case '#text ' :
1059- $ text = $ this ->get_modifiable_text ();
1060- if ( strlen ( $ text ) === strspn ( $ text , " \t\n\f\r" ) ) {
1065+ if ( parent ::TEXT_IS_WHITESPACE === $ this ->text_node_classification ) {
10611066 return $ this ->step ();
10621067 }
10631068 goto initial_anything_else;
@@ -1145,8 +1150,7 @@ private function step_before_html(): bool {
11451150 * Parse error: ignore the token.
11461151 */
11471152 case '#text ' :
1148- $ text = $ this ->get_modifiable_text ();
1149- if ( strlen ( $ text ) === strspn ( $ text , " \t\n\f\r" ) ) {
1153+ if ( parent ::TEXT_IS_WHITESPACE === $ this ->text_node_classification ) {
11501154 return $ this ->step ();
11511155 }
11521156 goto before_html_anything_else;
@@ -1227,8 +1231,7 @@ private function step_before_head(): bool {
12271231 * Parse error: ignore the token.
12281232 */
12291233 case '#text ' :
1230- $ text = $ this ->get_modifiable_text ();
1231- if ( strlen ( $ text ) === strspn ( $ text , " \t\n\f\r" ) ) {
1234+ if ( parent ::TEXT_IS_WHITESPACE === $ this ->text_node_classification ) {
12321235 return $ this ->step ();
12331236 }
12341237 goto before_head_anything_else;
@@ -1323,16 +1326,7 @@ private function step_in_head(): bool {
13231326 * > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
13241327 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
13251328 */
1326- $ text = $ this ->get_modifiable_text ();
1327- if ( '' === $ text ) {
1328- /*
1329- * If the text is empty after processing HTML entities and stripping
1330- * U+0000 NULL bytes then ignore the token.
1331- */
1332- return $ this ->step ();
1333- }
1334-
1335- if ( strlen ( $ text ) === strspn ( $ text , " \t\n\f\r" ) ) {
1329+ if ( parent ::TEXT_IS_WHITESPACE === $ this ->text_node_classification ) {
13361330 // Insert the character.
13371331 $ this ->insert_html_element ( $ this ->state ->current_token );
13381332 return true ;
@@ -1552,8 +1546,7 @@ private function step_in_head_noscript(): bool {
15521546 * Parse error: ignore the token.
15531547 */
15541548 case '#text ' :
1555- $ text = $ this ->get_modifiable_text ();
1556- if ( strlen ( $ text ) === strspn ( $ text , " \t\n\f\r" ) ) {
1549+ if ( parent ::TEXT_IS_WHITESPACE === $ this ->text_node_classification ) {
15571550 return $ this ->step_in_head ();
15581551 }
15591552
@@ -1654,8 +1647,7 @@ private function step_after_head(): bool {
16541647 * > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
16551648 */
16561649 case '#text ' :
1657- $ text = $ this ->get_modifiable_text ();
1658- if ( strlen ( $ text ) === strspn ( $ text , " \t\n\f\r" ) ) {
1650+ if ( parent ::TEXT_IS_WHITESPACE === $ this ->text_node_classification ) {
16591651 // Insert the character.
16601652 $ this ->insert_html_element ( $ this ->state ->current_token );
16611653 return true ;
@@ -1793,8 +1785,6 @@ private function step_in_body(): bool {
17931785
17941786 switch ( $ op ) {
17951787 case '#text ' :
1796- $ current_token = $ this ->bookmarks [ $ this ->state ->current_token ->bookmark_name ];
1797-
17981788 /*
17991789 * > A character token that is U+0000 NULL
18001790 *
@@ -1804,11 +1794,7 @@ private function step_in_body(): bool {
18041794 * here, but if there are any other characters in the stream
18051795 * the active formats should be reconstructed.
18061796 */
1807- if (
1808- 1 <= $ current_token ->length &&
1809- "\x00" === $ this ->html [ $ current_token ->start ] &&
1810- strspn ( $ this ->html , "\x00" , $ current_token ->start , $ current_token ->length ) === $ current_token ->length
1811- ) {
1797+ if ( parent ::TEXT_IS_NULL_SEQUENCE === $ this ->text_node_classification ) {
18121798 // Parse error: ignore the token.
18131799 return $ this ->step ();
18141800 }
@@ -1820,8 +1806,7 @@ private function step_in_body(): bool {
18201806 * It is probably inter-element whitespace, but it may also
18211807 * contain character references which decode only to whitespace.
18221808 */
1823- $ text = $ this ->get_modifiable_text ();
1824- if ( strlen ( $ text ) !== strspn ( $ text , " \t\n\f\r" ) ) {
1809+ if ( parent ::TEXT_IS_GENERIC === $ this ->text_node_classification ) {
18251810 $ this ->state ->frameset_ok = false ;
18261811 }
18271812
@@ -2829,12 +2814,11 @@ private function step_in_table(): bool {
28292814 'TR ' === $ current_node_name
28302815 )
28312816 ) {
2832- $ text = $ this ->get_modifiable_text ();
28332817 /*
28342818 * If the text is empty after processing HTML entities and stripping
28352819 * U+0000 NULL bytes then ignore the token.
28362820 */
2837- if ( '' === $ text ) {
2821+ if ( parent :: TEXT_IS_NULL_SEQUENCE === $ this -> text_node_classification ) {
28382822 return $ this ->step ();
28392823 }
28402824
@@ -2857,7 +2841,7 @@ private function step_in_table(): bool {
28572841 *
28582842 * @see https://html.spec.whatwg.org/#parsing-main-intabletext
28592843 */
2860- if ( strlen ( $ text ) === strspn ( $ text , " \t\f\r\n" ) ) {
2844+ if ( parent :: TEXT_IS_WHITESPACE === $ this -> text_node_classification ) {
28612845 $ this ->insert_html_element ( $ this ->state ->current_token );
28622846 return true ;
28632847 }
@@ -3177,16 +3161,7 @@ private function step_in_column_group(): bool {
31773161 * > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
31783162 */
31793163 case '#text ' :
3180- $ text = $ this ->get_modifiable_text ();
3181- if ( '' === $ text ) {
3182- /*
3183- * If the text is empty after processing HTML entities and stripping
3184- * U+0000 NULL bytes then ignore the token.
3185- */
3186- return $ this ->step ();
3187- }
3188-
3189- if ( strlen ( $ text ) === strspn ( $ text , " \t\n\f\r" ) ) {
3164+ if ( parent ::TEXT_IS_WHITESPACE === $ this ->text_node_classification ) {
31903165 // Insert the character.
31913166 $ this ->insert_html_element ( $ this ->state ->current_token );
31923167 return true ;
@@ -3609,19 +3584,13 @@ private function step_in_select(): bool {
36093584 * > Any other character token
36103585 */
36113586 case '#text ' :
3612- $ current_token = $ this ->bookmarks [ $ this ->state ->current_token ->bookmark_name ];
3613-
36143587 /*
36153588 * > A character token that is U+0000 NULL
36163589 *
36173590 * If a text node only comprises null bytes then it should be
36183591 * entirely ignored and should not return to calling code.
36193592 */
3620- if (
3621- 1 <= $ current_token ->length &&
3622- "\x00" === $ this ->html [ $ current_token ->start ] &&
3623- strspn ( $ this ->html , "\x00" , $ current_token ->start , $ current_token ->length ) === $ current_token ->length
3624- ) {
3593+ if ( parent ::TEXT_IS_NULL_SEQUENCE === $ this ->text_node_classification ) {
36253594 // Parse error: ignore the token.
36263595 return $ this ->step ();
36273596 }
@@ -3986,8 +3955,7 @@ private function step_after_body(): bool {
39863955 * > Process the token using the rules for the "in body" insertion mode.
39873956 */
39883957 case '#text ' :
3989- $ text = $ this ->get_modifiable_text ();
3990- if ( strlen ( $ text ) === strspn ( $ text , " \t\n\f\r" ) ) {
3958+ if ( parent ::TEXT_IS_WHITESPACE === $ this ->text_node_classification ) {
39913959 return $ this ->step_in_body ();
39923960 }
39933961 goto after_body_anything_else;
@@ -4072,9 +4040,7 @@ private function step_in_frameset(): bool {
40724040 * them under HTML. This is not supported at this time.
40734041 */
40744042 case '#text ' :
4075- $ text = $ this ->get_modifiable_text ();
4076- $ text = $ this ->get_modifiable_text ();
4077- if ( strlen ( $ text ) === strspn ( $ text , " \t\n\f\r" ) ) {
4043+ if ( parent ::TEXT_IS_WHITESPACE === $ this ->text_node_classification ) {
40784044 return $ this ->step_in_body ();
40794045 }
40804046 $ this ->bail ( 'Non-whitespace characters cannot be handled in frameset. ' );
@@ -4193,8 +4159,7 @@ private function step_after_frameset(): bool {
41934159 * them under HTML. This is not supported at this time.
41944160 */
41954161 case '#text ' :
4196- $ text = $ this ->get_modifiable_text ();
4197- if ( strlen ( $ text ) === strspn ( $ text , " \t\n\f\r" ) ) {
4162+ if ( parent ::TEXT_IS_WHITESPACE === $ this ->text_node_classification ) {
41984163 return $ this ->step_in_body ();
41994164 }
42004165 $ this ->bail ( 'Non-whitespace characters cannot be handled in after frameset ' );
@@ -4288,8 +4253,7 @@ private function step_after_after_body(): bool {
42884253 * > Process the token using the rules for the "in body" insertion mode.
42894254 */
42904255 case '#text ' :
4291- $ text = $ this ->get_modifiable_text ();
4292- if ( strlen ( $ text ) === strspn ( $ text , " \t\n\f\r" ) ) {
4256+ if ( parent ::TEXT_IS_WHITESPACE === $ this ->text_node_classification ) {
42934257 return $ this ->step_in_body ();
42944258 }
42954259 goto after_after_body_anything_else;
@@ -4355,8 +4319,7 @@ private function step_after_after_frameset(): bool {
43554319 * them under HTML. This is not supported at this time.
43564320 */
43574321 case '#text ' :
4358- $ text = $ this ->get_modifiable_text ();
4359- if ( strlen ( $ text ) === strspn ( $ text , " \t\n\f\r" ) ) {
4322+ if ( parent ::TEXT_IS_WHITESPACE === $ this ->text_node_classification ) {
43604323 return $ this ->step_in_body ();
43614324 }
43624325 $ this ->bail ( 'Non-whitespace characters cannot be handled in after after frameset. ' );
@@ -4412,6 +4375,7 @@ private function step_in_foreign_content(): bool {
44124375 }
44134376
44144377 switch ( $ op ) {
4378+ case '#cdata-section ' :
44154379 case '#text ' :
44164380 /*
44174381 * > A character token that is U+0000 NULL
@@ -4424,8 +4388,7 @@ private function step_in_foreign_content(): bool {
44244388 * It is probably inter-element whitespace, but it may also
44254389 * contain character references which decode only to whitespace.
44264390 */
4427- $ text = $ this ->get_modifiable_text ();
4428- if ( strlen ( $ text ) !== strspn ( $ text , " \t\n\f\r" ) ) {
4391+ if ( parent ::TEXT_IS_GENERIC === $ this ->text_node_classification ) {
44294392 $ this ->state ->frameset_ok = false ;
44304393 }
44314394
@@ -4435,7 +4398,6 @@ private function step_in_foreign_content(): bool {
44354398 /*
44364399 * > A comment token
44374400 */
4438- case '#cdata-section ' :
44394401 case '#comment ' :
44404402 case '#funky-comment ' :
44414403 case '#presumptuous-tag ' :
0 commit comments