Skip to content

Commit 057f585

Browse files
committed
HTML API: Reduce length checks in skip_script_data.
Apply an optimization to remove several repeated string length checks in `WP_HTML_Tag_Processor::skip_script_data()`. Developed in #9230. Props jonsurrell, dmsnell. See #63738. git-svn-id: https://develop.svn.wordpress.org/trunk@60617 602fd350-edb4-49c9-b593-d223f7449a82
1 parent b91b757 commit 057f585

File tree

2 files changed

+79
-12
lines changed

2 files changed

+79
-12
lines changed

src/wp-includes/html-api/class-wp-html-tag-processor.php

Lines changed: 36 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1496,13 +1496,48 @@ private function skip_script_data(): bool {
14961496
while ( false !== $at && $at < $doc_length ) {
14971497
$at += strcspn( $html, '-<', $at );
14981498

1499+
/*
1500+
* Optimization: Terminating a complete script element requires at least eight
1501+
* additional bytes in the document. Some checks below may cause local escaped
1502+
* state transitions when processing shorter strings, but those transitions are
1503+
* irrelevant if the script tag is incomplete and the function must return false.
1504+
*
1505+
* This may need updating if those transitions become significant or exported from
1506+
* this function in some way, such as when building safe methods to embed JavaScript
1507+
* or data inside a SCRIPT element.
1508+
*
1509+
* $at may be here.
1510+
* ↓
1511+
* ...</script>
1512+
* ╰──┬───╯
1513+
* $at + 8 additional bytes are required for a non-false return value.
1514+
*
1515+
* This single check eliminates the need to check lengths for the shorter spans:
1516+
*
1517+
* $at may be here.
1518+
* ↓
1519+
* <script><!-- --></script>
1520+
* ├╯
1521+
* Reading the characters through $at + 2 does not require a length check.
1522+
*
1523+
* The transition from "escaped" to "unescaped" is not relevant if the document ends:
1524+
*
1525+
* $at may be here.
1526+
* ↓
1527+
* <script><!-- -->[[END-OF-DOCUMENT]]
1528+
* ╰──┬───╯
1529+
* The requirement of $at + 8 additional bytes is not satisfied; return false.
1530+
*/
1531+
if ( $at + 8 >= $doc_length ) {
1532+
return false;
1533+
}
1534+
14991535
/*
15001536
* For all script states a "-->" transitions
15011537
* back into the normal unescaped script mode,
15021538
* even if that's the current state.
15031539
*/
15041540
if (
1505-
$at + 2 < $doc_length &&
15061541
'-' === $html[ $at ] &&
15071542
'-' === $html[ $at + 1 ] &&
15081543
'>' === $html[ $at + 2 ]
@@ -1512,10 +1547,6 @@ private function skip_script_data(): bool {
15121547
continue;
15131548
}
15141549

1515-
if ( $at + 1 >= $doc_length ) {
1516-
return false;
1517-
}
1518-
15191550
/*
15201551
* Everything of interest past here starts with "<".
15211552
* Check this character and advance position regardless.
@@ -1537,7 +1568,6 @@ private function skip_script_data(): bool {
15371568
* parsing after updating the state.
15381569
*/
15391570
if (
1540-
$at + 2 < $doc_length &&
15411571
'!' === $html[ $at ] &&
15421572
'-' === $html[ $at + 1 ] &&
15431573
'-' === $html[ $at + 2 ]
@@ -1561,7 +1591,6 @@ private function skip_script_data(): bool {
15611591
* proceed scanning to the next potential token in the text.
15621592
*/
15631593
if ( ! (
1564-
$at + 6 < $doc_length &&
15651594
( 's' === $html[ $at ] || 'S' === $html[ $at ] ) &&
15661595
( 'c' === $html[ $at + 1 ] || 'C' === $html[ $at + 1 ] ) &&
15671596
( 'r' === $html[ $at + 2 ] || 'R' === $html[ $at + 2 ] ) &&
@@ -1579,9 +1608,6 @@ private function skip_script_data(): bool {
15791608
* "<script123" should not end a script region even though
15801609
* "<script" is found within the text.
15811610
*/
1582-
if ( $at + 6 >= $doc_length ) {
1583-
continue;
1584-
}
15851611
$at += 6;
15861612
$c = $html[ $at ];
15871613
if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) {
@@ -1611,8 +1637,6 @@ private function skip_script_data(): bool {
16111637
}
16121638

16131639
if ( $this->bytes_already_parsed >= $doc_length ) {
1614-
$this->parser_state = self::STATE_INCOMPLETE_INPUT;
1615-
16161640
return false;
16171641
}
16181642

tests/phpunit/tests/html-api/wpHtmlTagProcessor.php

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1981,6 +1981,49 @@ public static function data_next_tag_ignores_script_tag_contents() {
19811981
);
19821982
}
19831983

1984+
/**
1985+
* Test that script tags are parsed correctly.
1986+
*
1987+
* Script tag parsing is very complicated; see the following resources for more details:
1988+
*
1989+
* - https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
1990+
* - https://html.spec.whatwg.org/multipage/scripting.html#restrictions-for-contents-of-script-elements
1991+
*
1992+
* @ticket 63738
1993+
*
1994+
* @dataProvider data_script_tag
1995+
*/
1996+
public function test_script_tag_parsing( string $input, bool $closes ) {
1997+
$processor = new WP_HTML_Tag_Processor( $input );
1998+
1999+
if ( $closes ) {
2000+
$this->assertTrue( $processor->next_token(), 'Expected to find complete script tag.' );
2001+
$this->assertSame( 'SCRIPT', $processor->get_tag() );
2002+
return;
2003+
}
2004+
2005+
$this->assertFalse( $processor->next_token(), 'Expected to fail next_token().' );
2006+
$this->assertTrue( $processor->paused_at_incomplete_token(), 'Expected an incomplete SCRIPT tag token.' );
2007+
}
2008+
2009+
/**
2010+
* Data provider.
2011+
*/
2012+
public static function data_script_tag(): array {
2013+
return array(
2014+
'Basic script tag' => array( '<script></script>', true ),
2015+
'Script with type attribute' => array( '<script type="text/javascript"></script>', true ),
2016+
'Script data escaped' => array( '<script><!--</script>', true ),
2017+
'Script data double-escaped exit (comment)' => array( '<script><!--<script>--></script>', true ),
2018+
'Script data double-escaped exit (closed)' => array( '<script><!--<script></script></script>', true ),
2019+
'Script data double-escaped exit (closed/truncated)' => array( '<script><!--<script></script </script>', true ),
2020+
'Script data no double-escape' => array( '<script><!-- --><script></script>', true ),
2021+
2022+
'Script tag with self-close flag (ignored)' => array( '<script />', false ),
2023+
'Script data double-escaped' => array( '<script><!--<script></script>', false ),
2024+
);
2025+
}
2026+
19842027
/**
19852028
* Invalid tag names are comments on tag closers.
19862029
*

0 commit comments

Comments
 (0)