Skip to content

Commit 95eb879

Browse files
committed
HTML API: Allow subdividing text nodes by meaningful prefixes.
HTML parsing rules at times differentiate character tokens that are all null bytes, all whitespace, or other content. This patch introduces a new function which may be used to classify text node sub-regions and lead to more efficient application of these parsing rules. Further, when classified in this way, application code may skip some rules and decoding entirely, improving performance.

For example, this can be used to ease the implementation of skipping inter-element whitespace, which is usually not rendered.

Developed in #7236
Discussed in https://core.trac.wordpress.org/ticket/61974

Props dmsnell, jonsurrell.
Fixes #61974.

git-svn-id: https://develop.svn.wordpress.org/trunk@58970 602fd350-edb4-49c9-b593-d223f7449a82
1 parent b37cbf9 commit 95eb879

File tree

3 files changed

+182
-79
lines changed

3 files changed

+182
-79
lines changed

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 25 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -843,6 +843,12 @@ public function step( $node_to_process = self::PROCESS_NEXT_NODE ): bool {
843843

844844
if ( self::PROCESS_NEXT_NODE === $node_to_process ) {
845845
parent::next_token();
846+
if (
847+
WP_HTML_Tag_Processor::STATE_TEXT_NODE === $this->parser_state ||
848+
WP_HTML_Tag_Processor::STATE_CDATA_NODE === $this->parser_state
849+
) {
850+
parent::subdivide_text_appropriately();
851+
}
846852
}
847853

848854
// Finish stepping when there are no more tokens in the document.
@@ -1056,8 +1062,7 @@ private function step_initial(): bool {
10561062
* Parse error: ignore the token.
10571063
*/
10581064
case '#text':
1059-
$text = $this->get_modifiable_text();
1060-
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
1065+
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
10611066
return $this->step();
10621067
}
10631068
goto initial_anything_else;
@@ -1145,8 +1150,7 @@ private function step_before_html(): bool {
11451150
* Parse error: ignore the token.
11461151
*/
11471152
case '#text':
1148-
$text = $this->get_modifiable_text();
1149-
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
1153+
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
11501154
return $this->step();
11511155
}
11521156
goto before_html_anything_else;
@@ -1227,8 +1231,7 @@ private function step_before_head(): bool {
12271231
* Parse error: ignore the token.
12281232
*/
12291233
case '#text':
1230-
$text = $this->get_modifiable_text();
1231-
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
1234+
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
12321235
return $this->step();
12331236
}
12341237
goto before_head_anything_else;
@@ -1323,16 +1326,7 @@ private function step_in_head(): bool {
13231326
* > U+000A LINE FEED (LF), U+000C FORM FEED (FF),
13241327
* > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
13251328
*/
1326-
$text = $this->get_modifiable_text();
1327-
if ( '' === $text ) {
1328-
/*
1329-
* If the text is empty after processing HTML entities and stripping
1330-
* U+0000 NULL bytes then ignore the token.
1331-
*/
1332-
return $this->step();
1333-
}
1334-
1335-
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
1329+
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
13361330
// Insert the character.
13371331
$this->insert_html_element( $this->state->current_token );
13381332
return true;
@@ -1552,8 +1546,7 @@ private function step_in_head_noscript(): bool {
15521546
* Parse error: ignore the token.
15531547
*/
15541548
case '#text':
1555-
$text = $this->get_modifiable_text();
1556-
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
1549+
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
15571550
return $this->step_in_head();
15581551
}
15591552

@@ -1654,8 +1647,7 @@ private function step_after_head(): bool {
16541647
* > U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
16551648
*/
16561649
case '#text':
1657-
$text = $this->get_modifiable_text();
1658-
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
1650+
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
16591651
// Insert the character.
16601652
$this->insert_html_element( $this->state->current_token );
16611653
return true;
@@ -1793,8 +1785,6 @@ private function step_in_body(): bool {
17931785

17941786
switch ( $op ) {
17951787
case '#text':
1796-
$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
1797-
17981788
/*
17991789
* > A character token that is U+0000 NULL
18001790
*
@@ -1804,11 +1794,7 @@ private function step_in_body(): bool {
18041794
* here, but if there are any other characters in the stream
18051795
* the active formats should be reconstructed.
18061796
*/
1807-
if (
1808-
1 <= $current_token->length &&
1809-
"\x00" === $this->html[ $current_token->start ] &&
1810-
strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
1811-
) {
1797+
if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
18121798
// Parse error: ignore the token.
18131799
return $this->step();
18141800
}
@@ -1820,8 +1806,7 @@ private function step_in_body(): bool {
18201806
* It is probably inter-element whitespace, but it may also
18211807
* contain character references which decode only to whitespace.
18221808
*/
1823-
$text = $this->get_modifiable_text();
1824-
if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
1809+
if ( parent::TEXT_IS_GENERIC === $this->text_node_classification ) {
18251810
$this->state->frameset_ok = false;
18261811
}
18271812

@@ -2829,12 +2814,11 @@ private function step_in_table(): bool {
28292814
'TR' === $current_node_name
28302815
)
28312816
) {
2832-
$text = $this->get_modifiable_text();
28332817
/*
28342818
* If the text is empty after processing HTML entities and stripping
28352819
* U+0000 NULL bytes then ignore the token.
28362820
*/
2837-
if ( '' === $text ) {
2821+
if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
28382822
return $this->step();
28392823
}
28402824

@@ -2857,7 +2841,7 @@ private function step_in_table(): bool {
28572841
*
28582842
* @see https://html.spec.whatwg.org/#parsing-main-intabletext
28592843
*/
2860-
if ( strlen( $text ) === strspn( $text, " \t\f\r\n" ) ) {
2844+
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
28612845
$this->insert_html_element( $this->state->current_token );
28622846
return true;
28632847
}
@@ -3177,16 +3161,7 @@ private function step_in_column_group(): bool {
31773161
* > U+000C FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
31783162
*/
31793163
case '#text':
3180-
$text = $this->get_modifiable_text();
3181-
if ( '' === $text ) {
3182-
/*
3183-
* If the text is empty after processing HTML entities and stripping
3184-
* U+0000 NULL bytes then ignore the token.
3185-
*/
3186-
return $this->step();
3187-
}
3188-
3189-
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
3164+
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
31903165
// Insert the character.
31913166
$this->insert_html_element( $this->state->current_token );
31923167
return true;
@@ -3609,19 +3584,13 @@ private function step_in_select(): bool {
36093584
* > Any other character token
36103585
*/
36113586
case '#text':
3612-
$current_token = $this->bookmarks[ $this->state->current_token->bookmark_name ];
3613-
36143587
/*
36153588
* > A character token that is U+0000 NULL
36163589
*
36173590
* If a text node only comprises null bytes then it should be
36183591
* entirely ignored and should not return to calling code.
36193592
*/
3620-
if (
3621-
1 <= $current_token->length &&
3622-
"\x00" === $this->html[ $current_token->start ] &&
3623-
strspn( $this->html, "\x00", $current_token->start, $current_token->length ) === $current_token->length
3624-
) {
3593+
if ( parent::TEXT_IS_NULL_SEQUENCE === $this->text_node_classification ) {
36253594
// Parse error: ignore the token.
36263595
return $this->step();
36273596
}
@@ -3986,8 +3955,7 @@ private function step_after_body(): bool {
39863955
* > Process the token using the rules for the "in body" insertion mode.
39873956
*/
39883957
case '#text':
3989-
$text = $this->get_modifiable_text();
3990-
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
3958+
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
39913959
return $this->step_in_body();
39923960
}
39933961
goto after_body_anything_else;
@@ -4072,9 +4040,7 @@ private function step_in_frameset(): bool {
40724040
* them under HTML. This is not supported at this time.
40734041
*/
40744042
case '#text':
4075-
$text = $this->get_modifiable_text();
4076-
$text = $this->get_modifiable_text();
4077-
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
4043+
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
40784044
return $this->step_in_body();
40794045
}
40804046
$this->bail( 'Non-whitespace characters cannot be handled in frameset.' );
@@ -4193,8 +4159,7 @@ private function step_after_frameset(): bool {
41934159
* them under HTML. This is not supported at this time.
41944160
*/
41954161
case '#text':
4196-
$text = $this->get_modifiable_text();
4197-
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
4162+
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
41984163
return $this->step_in_body();
41994164
}
42004165
$this->bail( 'Non-whitespace characters cannot be handled in after frameset' );
@@ -4288,8 +4253,7 @@ private function step_after_after_body(): bool {
42884253
* > Process the token using the rules for the "in body" insertion mode.
42894254
*/
42904255
case '#text':
4291-
$text = $this->get_modifiable_text();
4292-
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
4256+
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
42934257
return $this->step_in_body();
42944258
}
42954259
goto after_after_body_anything_else;
@@ -4355,8 +4319,7 @@ private function step_after_after_frameset(): bool {
43554319
* them under HTML. This is not supported at this time.
43564320
*/
43574321
case '#text':
4358-
$text = $this->get_modifiable_text();
4359-
if ( strlen( $text ) === strspn( $text, " \t\n\f\r" ) ) {
4322+
if ( parent::TEXT_IS_WHITESPACE === $this->text_node_classification ) {
43604323
return $this->step_in_body();
43614324
}
43624325
$this->bail( 'Non-whitespace characters cannot be handled in after after frameset.' );
@@ -4412,6 +4375,7 @@ private function step_in_foreign_content(): bool {
44124375
}
44134376

44144377
switch ( $op ) {
4378+
case '#cdata-section':
44154379
case '#text':
44164380
/*
44174381
* > A character token that is U+0000 NULL
@@ -4424,8 +4388,7 @@ private function step_in_foreign_content(): bool {
44244388
* It is probably inter-element whitespace, but it may also
44254389
* contain character references which decode only to whitespace.
44264390
*/
4427-
$text = $this->get_modifiable_text();
4428-
if ( strlen( $text ) !== strspn( $text, " \t\n\f\r" ) ) {
4391+
if ( parent::TEXT_IS_GENERIC === $this->text_node_classification ) {
44294392
$this->state->frameset_ok = false;
44304393
}
44314394

@@ -4435,7 +4398,6 @@ private function step_in_foreign_content(): bool {
44354398
/*
44364399
* > A comment token
44374400
*/
4438-
case '#cdata-section':
44394401
case '#comment':
44404402
case '#funky-comment':
44414403
case '#presumptuous-tag':

0 commit comments

Comments
 (0)