Skip to content

Commit 2ac68e9

Browse files
committed
HTML API: Improve script tag escape state processing.
Addresses some edge cases in the parsing of script tag contents:

- "<!-->" remains in the unescaped state and does not enter the escaped state.
- Contents in the escaped state that end with "<script" do not enter the double-escaped state.
- "\f" (Form Feed) was missing as a tag name terminating character.

Developed in #9397 and #9402.

Props jonsurrell, dmsnell.
See #63738.

git-svn-id: https://develop.svn.wordpress.org/trunk@60649 602fd350-edb4-49c9-b593-d223f7449a82
1 parent be2b79e commit 2ac68e9

File tree

2 files changed

+90
-27
lines changed

2 files changed

+90
-27
lines changed

src/wp-includes/html-api/class-wp-html-tag-processor.php

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1556,24 +1556,33 @@ private function skip_script_data(): bool {
15561556
}
15571557

15581558
/*
1559-
* Unlike with "-->", the "<!--" only transitions
1560-
* into the escaped mode if not already there.
1561-
*
1562-
* Inside the escaped modes it will be ignored; and
1563-
* should never break out of the double-escaped
1564-
* mode and back into the escaped mode.
1565-
*
1566-
* While this requires a mode change, it does not
1567-
* impact the parsing otherwise, so continue
1568-
* parsing after updating the state.
1559+
* "<!--" only transitions from _unescaped_ to _escaped_. This byte sequence is only
1560+
* significant in the _unescaped_ state and is ignored in any other state.
15691561
*/
15701562
if (
1563+
'unescaped' === $state &&
15711564
'!' === $html[ $at ] &&
15721565
'-' === $html[ $at + 1 ] &&
15731566
'-' === $html[ $at + 2 ]
15741567
) {
1575-
$at += 3;
1576-
$state = 'unescaped' === $state ? 'escaped' : $state;
1568+
$at += 3;
1569+
1570+
/*
1571+
* The parser is ready to enter the _escaped_ state, but may remain in the
1572+
* _unescaped_ state. This occurs when "<!--" is immediately followed by a
1573+
* sequence of 0 or more "-" followed by ">". This is similar to abruptly closed
1574+
* HTML comments like "<!-->" or "<!--->".
1575+
*
1576+
* Note that this check may advance the position significantly and requires a
1577+
* length check to prevent bad offsets on inputs like `<script><!---------`.
1578+
*/
1579+
$at += strspn( $html, '-', $at );
1580+
if ( $at < $doc_length && '>' === $html[ $at ] ) {
1581+
++$at;
1582+
continue;
1583+
}
1584+
1585+
$state = 'escaped';
15771586
continue;
15781587
}
15791588

@@ -1610,8 +1619,30 @@ private function skip_script_data(): bool {
16101619
*/
16111620
$at += 6;
16121621
$c = $html[ $at ];
1613-
if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) {
1614-
++$at;
1622+
if (
1623+
/**
1624+
* These characters trigger state transitions of interest:
1625+
*
1626+
* - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state}
1627+
* - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state}
1628+
* - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state}
1629+
* - @see {https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state}
1630+
*
1631+
* The "\r" character is not present in the above references. However, "\r" must be
1632+
* treated the same as "\n". This is because the HTML Standard requires newline
1633+
* normalization during preprocessing which applies this replacement.
1634+
*
1635+
* - @see https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream
1636+
* - @see https://infra.spec.whatwg.org/#normalize-newlines
1637+
*/
1638+
'>' !== $c &&
1639+
' ' !== $c &&
1640+
"\n" !== $c &&
1641+
'/' !== $c &&
1642+
"\t" !== $c &&
1643+
"\f" !== $c &&
1644+
"\r" !== $c
1645+
) {
16151646
continue;
16161647
}
16171648

tests/phpunit/tests/html-api/wpHtmlTagProcessor.php

Lines changed: 45 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2009,19 +2009,51 @@ public function test_script_tag_parsing( string $input, bool $closes ) {
20092009
/**
20102010
* Data provider.
20112011
*/
2012-
public static function data_script_tag(): array {
2013-
return array(
2014-
'Basic script tag' => array( '<script></script>', true ),
2015-
'Script with type attribute' => array( '<script type="text/javascript"></script>', true ),
2016-
'Script data escaped' => array( '<script><!--</script>', true ),
2017-
'Script data double-escaped exit (comment)' => array( '<script><!--<script>--></script>', true ),
2018-
'Script data double-escaped exit (closed)' => array( '<script><!--<script></script></script>', true ),
2019-
'Script data double-escaped exit (closed/truncated)' => array( '<script><!--<script></script </script>', true ),
2020-
'Script data no double-escape' => array( '<script><!-- --><script></script>', true ),
2021-
2022-
'Script tag with self-close flag (ignored)' => array( '<script />', false ),
2023-
'Script data double-escaped' => array( '<script><!--<script></script>', false ),
2024-
);
2012+
public static function data_script_tag(): Generator {
2013+
yield 'Basic script tag' => array( '<script></script>', true );
2014+
yield 'Script tag with </script> close' => array( '<script></script>', true );
2015+
yield 'Script tag with </script/> close' => array( '<script></script/>', true );
2016+
yield 'Script tag with </script > close' => array( '<script></script >', true );
2017+
yield 'Script tag with </script\n> close' => array( "<script></script\n>", true );
2018+
yield 'Script tag with </script\t> close' => array( "<script></script\t>", true );
2019+
yield 'Script tag with </script\f> close' => array( "<script></script\f>", true );
2020+
yield 'Script tag with </script\r> close' => array( "<script></script\r>", true );
2021+
yield 'Script with type attribute' => array( '<script type="text/javascript"></script>', true );
2022+
yield 'Script data escaped' => array( '<script><!--</script>', true );
2023+
yield 'Script data double-escaped exit (comment)' => array( '<script><!--<script>--></script>', true );
2024+
yield 'Script data double-escaped exit (closed ">")' => array( '<script><!--<script></script></script>', true );
2025+
yield 'Script data double-escaped exit (closed "/")' => array( '<script><!--<script></script/</script>', true );
2026+
yield 'Script data double-escaped exit (closed " ")' => array( '<script><!--<script></script </script>', true );
2027+
yield 'Script data double-escaped exit (closed "\n")' => array( "<script><!--<script></script\n</script>", true );
2028+
yield 'Script data double-escaped exit (closed "\t")' => array( "<script><!--<script></script\t</script>", true );
2029+
yield 'Script data double-escaped exit (closed "\f")' => array( "<script><!--<script></script\f</script>", true );
2030+
yield 'Script data double-escaped exit (closed "\r")' => array( "<script><!--<script></script\r</script>", true );
2031+
yield 'Script data no double-escape' => array( '<script><!-- --><script></script>', true );
2032+
yield 'Script data no double-escape (short comment)' => array( '<script><!--><script></script>', true );
2033+
yield 'Script data almost double-escaped' => array( '<script><!--<script</script>', true );
2034+
yield 'Script data with complex JavaScript' => array(
2035+
'<script>
2036+
var x = 10;
2037+
x--;
2038+
x < 0 ? x += 100 : x = (x + 1) - 1;
2039+
</script>',
2040+
true,
2041+
);
2042+
2043+
yield 'Script tag with self-close flag (ignored)' => array( '<script />', false );
2044+
yield 'Script data double-escaped' => array( '<script><!--<script></script>', false );
2045+
yield 'Unclosed script in escaped state' => array( '<script><!--------------', false );
2046+
yield 'Unclosed script in double escaped state' => array( '<script><!--<script ', false );
2047+
yield 'Document end in closer start' => array( '<script></', false );
2048+
yield 'Document end in script closer' => array( '<script></script', false );
2049+
yield 'Document end in script closer with attributes' => array( '<script></script attr="val"', false );
2050+
yield 'Script tag double-escaped with <script>' => array( '<script><!--<script></script>', false );
2051+
yield 'Script tag double-escaped with <script/' => array( '<script><!--<script/</script>', false );
2052+
yield 'Script tag double-escaped with <script ' => array( '<script><!--<script </script>', false );
2053+
yield 'Script tag double-escaped with <script\n' => array( "<script><!--<script\n</script>", false );
2054+
yield 'Script tag double-escaped with <script\t' => array( "<script><!--<script\t</script>", false );
2055+
yield 'Script tag double-escaped with <script\f' => array( "<script><!--<script\f</script>", false );
2056+
yield 'Script tag double-escaped with <script\r' => array( "<script><!--<script\r</script>", false );
20252057
}
20262058

20272059
/**

0 commit comments

Comments
 (0)