Skip to content

Commit 5828382

Browse files
committed
Merge branch 'html-api/auto-escape-javascript-json' into scripts/use-html-api-for-script-tags
2 parents 253b971 + 614916d commit 5828382

File tree

3 files changed

+125
-52
lines changed

3 files changed

+125
-52
lines changed

src/wp-includes/html-api/class-wp-html-tag-processor.php

Lines changed: 70 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -3814,15 +3814,16 @@ public function set_modifiable_text( string $plaintext_content ): bool {
38143814
/*
38153815
* SCRIPT tag contents can be dangerous.
38163816
*
3817-
* The text `</script>` could close the SCRIPT element prematurely.
3817+
* The text "</script>" could close the SCRIPT element prematurely.
38183818
*
3819-
* The text `<script>` could enter the "script data double escaped state", preventing the
3819+
* The text "<script>" could enter the script data double escaped state, preventing the
38203820
* SCRIPT element from closing as expected, for example:
38213821
*
38223822
* <script>
3823-
* // If this "<!--" then "<script>" the closing tag will not be recognized.
3823+
* // If "<!--" and "<script>" appear like this,
3824+
* // the following `</script>` close tag will not be recognized.
38243825
* </script>
3825-
* <h1>This appears inside the preceding SCRIPT element.</h1>
3826+
* <h1>This appears _inside_ the preceding SCRIPT element.</h1>
38263827
*
38273828
* The relevant state transitions happen on text like:
38283829
* 1. <
@@ -3844,23 +3845,63 @@ public function set_modifiable_text( string $plaintext_content ): bool {
38443845
false !== stripos( $plaintext_content, '<script' )
38453846
) {
38463847
/*
3847-
* JavaScript can be safely escaped.
3848+
* JavaScript can be safely escaped with a few exceptions. This is achieved by
3849+
* replacing dangerous sequences like "<script" and "</script" with a form
3850+
* using a Unicode escape sequence "<\u0073cript>" and "</\u0073cript>".
3851+
*
3852+
* `<script` and `</script` appear in JavaScript source code in limited places,
3853+
* all of which support a Unicode escape sequence on the "s" character.
3854+
* JavaScript identifiers, string literals, template literals, and RegExp
3855+
* literals all support Unicode escape sequences, meaning that the escaped form
3856+
* is indistinguishable from the unescaped form when the JavaScript
3857+
* is evaluated.
3858+
*
3859+
* There are a few exceptions where the escaped form can be detected:
3860+
*
3861+
* - The escaped form would appear in any JavaScript comments.
3862+
* - “Raw” strings via `String.raw()` or the `raw` property of the first
3863+
* argument to a tagged template literal exposes the raw form, revealing any
3864+
* escaping that has been applied.
3865+
* - The `source` property of a RegExp object reveals an escaped form the of
3866+
* the pattern.
3867+
*
3868+
* For JavaScript that needs to avoid these issues, workarounds may
3869+
* be available. For example:
3870+
*
3871+
* // Instead of:
3872+
* const rawStringWillBeEscaped = String.raw`</script>`;
3873+
*
3874+
* // This will yield the same result with no escaping required:
3875+
* const rawStringWillBePreserved = String.raw`</scr` + String.raw`ipt>`;
3876+
*
3877+
* // After the escaping has been applied and the JavaScript evaluated,
3878+
* // these are the resulting values:
3879+
* rawStringWillBeEscaped; // "</\\u0073cript>"
3880+
* rawStringWillBePreserve; // "</script>"
3881+
*
3882+
*
3883+
* Escaping is applied only where strictly necessary, reducing the likelyhood
3884+
* that observable differences manifest in the escaped JavaScript.
3885+
*
3886+
* The alternatives are to reject JavaScript that could be safely escaped in
3887+
* a majority of cases or to relax restrictions in ways that produce dangerous
3888+
* or broken HTML documents, neither are desirable.
38483889
*/
38493890
if ( $this->is_javascript_script_tag() ) {
38503891
$plaintext_content = preg_replace_callback(
38513892
/*
3852-
* This case-insensitive pattern consists of three groups:
3893+
* This case-insensitive pattern consists of three groups (in order):
38533894
*
3854-
* 1: "<" or "</"
3855-
* 2: "s"
3856-
* 3: "cript" + a trailing character that terminates a tag name.
3895+
* HEAD: "<" or "</"
3896+
* S_CHAR: "s"
3897+
* TAIL: "cript" + a trailing tag name termination character
38573898
*/
3858-
'~(</?)(s)(cript[\\t\\r\\n\\f />])~i',
3899+
'~(?P<HEAD></?)(?P<S_CHAR>s)(?P<TAIL>cript[\\t\\r\\n\\f />])~i',
38593900
static function ( $matches ) {
3860-
$escaped_s_char = 's' === $matches[2]
3901+
$escaped_s_char = 's' === $matches['S_CHAR']
38613902
? '\\u0073'
38623903
: '\\u0053';
3863-
return "{$matches[1]}{$escaped_s_char}{$matches[3]}";
3904+
return "{$matches['HEAD']}{$escaped_s_char}{$matches['TAIL']}";
38643905
},
38653906
$plaintext_content
38663907
);
@@ -3882,7 +3923,8 @@ static function ( $matches ) {
38823923
);
38833924
} else {
38843925
/*
3885-
* Other types of script tags cannot be escaped safely.
3926+
* Other types of script tags cannot be escaped safely because the type
3927+
* of comment and escaping strategy are unknown.
38863928
*/
38873929
return false;
38883930
}
@@ -3948,11 +3990,14 @@ static function ( $tag_match ) {
39483990
*
39493991
* @see https://html.spec.whatwg.org/multipage/scripting.html#prepare-the-script-element
39503992
*
3951-
* @since {WP_VERSION}
3993+
* @ignore
3994+
* @todo Consider a public API that is clear and general.
3995+
*
3996+
* @since 7.0.0
39523997
*
39533998
* @return bool True if the script tag will be evaluated as JavaScript.
39543999
*/
3955-
public function is_javascript_script_tag(): bool {
4000+
private function is_javascript_script_tag(): bool {
39564001
if ( 'SCRIPT' !== $this->get_tag() || $this->get_namespace() !== 'html' ) {
39574002
return false;
39584003
}
@@ -4059,11 +4104,14 @@ public function is_javascript_script_tag(): bool {
40594104
/**
40604105
* Indicates if the currently matched tag is a JSON script tag.
40614106
*
4062-
* @since {WP_VERSION}
4107+
* @ignore
4108+
* @todo Consider a public API that is clear and general.
4109+
*
4110+
* @since 7.0.0
40634111
*
40644112
* @return bool True if the script tag should be treated as JSON.
40654113
*/
4066-
public function is_json_script_tag(): bool {
4114+
private function is_json_script_tag(): bool {
40674115
if ( 'SCRIPT' !== $this->get_tag() || $this->get_namespace() !== 'html' ) {
40684116
return false;
40694117
}
@@ -4083,16 +4131,15 @@ public function is_json_script_tag(): bool {
40834131
* > A JSON MIME type is any MIME type whose subtype ends in "+json" or whose essence
40844132
* > is "application/json" or "text/json".
40854133
*
4086-
* The JSON subtype ending in "+json" is not currently handled due to lack
4087-
* of a MIME type parser.
4134+
* @todo The JSON MIME type handling handles some common cases but when MIME type parsing is available it should be leveraged here.
40884135
*
40894136
* @see https://mimesniff.spec.whatwg.org/#json-mime-type
40904137
*/
40914138
if (
4092-
'application/json' === $type
4093-
|| 'importmap' === $type
4094-
|| 'speculationrules' === $type
4095-
|| 'text/json' === $type
4139+
'importmap' === $type ||
4140+
'speculationrules' === $type ||
4141+
'application/json' === $type ||
4142+
'text/json' === $type
40964143
) {
40974144
return true;
40984145
}

tests/phpunit/tests/html-api/wpHtmlTagProcessorModifiableText.php

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -544,7 +544,7 @@ public static function data_script_tag_text_updates(): array {
544544
* @ticket 64419
545545
*/
546546
public function test_complex_javascript_and_json_auto_escaping() {
547-
$processor = new WP_HTML_Tag_Processor( "<script></script>\n<script></script>\n<h1>OK</h1>" );
547+
$processor = new WP_HTML_Tag_Processor( "<script></script>\n<script></script>\n<hr>" );
548548
$processor->next_tag( 'SCRIPT' );
549549
$processor->set_attribute( 'type', 'importmap' );
550550
$importmap_data = array(
@@ -558,27 +558,27 @@ public function test_complex_javascript_and_json_auto_escaping() {
558558
JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_LINE_TERMINATORS
559559
);
560560

561-
$processor->set_modifiable_text( $importmap );
561+
$processor->set_modifiable_text( "\n{$importmap}\n" );
562562
$decoded_importmap = json_decode( $processor->get_modifiable_text(), true );
563-
$this->assertSame( JSON_ERROR_NONE, json_last_error(), 'Precondition failed, JSON did not decode as expected: ' . json_last_error_msg() );
564-
$this->assertEquals(
565-
$importmap_data,
566-
$decoded_importmap,
567-
'Precondition failed: importmap JSON did not decode/encode as expected: ' . json_last_error_msg()
568-
);
569-
563+
$this->assertSame( JSON_ERROR_NONE, json_last_error(), 'JSON failed to decode correctly.' );
564+
$this->assertEquals( $importmap_data, $decoded_importmap );
570565
$processor->next_tag( 'SCRIPT' );
571566
$processor->set_attribute( 'type', 'module' );
572567
$javascript = <<<'JS'
573568
import '</SCRIPT>\\<!--\\<script>';
574569
JS;
575-
$processor->set_modifiable_text( $javascript );
570+
$processor->set_modifiable_text( "\n{$javascript}\n" );
576571

577-
$expected = <<<'HTML'
578-
<script type="importmap">{"imports":{"\u003C/SCRIPT>\\\u003C!--\\\u003Cscript>":"./script"}}</script>
579-
<script type="module">import '</\u0053CRIPT>\\<!--\\<\u0073cript>';</script>
580-
<h1>OK</h1>
572+
$expected = <<<'HTML'
573+
<script type="importmap">
574+
{"imports":{"\u003C/SCRIPT>\\\u003C!--\\\u003Cscript>":"./script"}}
575+
</script>
576+
<script type="module">
577+
import '</\u0053CRIPT>\\<!--\\<\u0073cript>';
578+
</script>
579+
<hr>
581580
HTML;
581+
582582
$updated_html = $processor->get_updated_html();
583583
$this->assertEqualHTML( $expected, $updated_html );
584584

@@ -588,11 +588,11 @@ public function test_complex_javascript_and_json_auto_escaping() {
588588
$this->assertSame( 'importmap', $processor->get_attribute( 'type' ) );
589589
$importmap_json = $processor->get_modifiable_text();
590590
$decoded_importmap = json_decode( $importmap_json, true );
591-
$this->assertSame( 'No error', json_last_error_msg() );
591+
$this->assertSame( JSON_ERROR_NONE, json_last_error(), 'Importmap JSON failed to decode.' );
592592
$this->assertEquals(
593593
$importmap_data,
594594
$decoded_importmap,
595-
'Precondition failed: re-processed importmap JSON did not decode/encode as expected: ' . json_last_error_msg()
595+
'JSON was not equal after re-processing updated HTML.'
596596
);
597597
}
598598

@@ -603,21 +603,23 @@ public function test_json_auto_escaping() {
603603
// This is not a typical JSON encoding or escaping, but it is valid.
604604
$json_text = '"Escaped BS: \\\\; Escaped BS+LT: \\\\<; Unescaped LT: <; Script closer: </script>"';
605605
$expected_decoded_json = 'Escaped BS: \\; Escaped BS+LT: \\<; Unescaped LT: <; Script closer: </script>';
606-
$decoded_json = json_decode( $json_text, true );
607-
$this->assertSame( JSON_ERROR_NONE, json_last_error(), 'Precondition failed, JSON did not decode as expected: ' . json_last_error_msg() );
606+
$decoded_json = json_decode( $json_text );
607+
$this->assertSame( JSON_ERROR_NONE, json_last_error(), 'JSON failed to decode.' );
608608
$this->assertSame(
609609
$expected_decoded_json,
610610
$decoded_json,
611-
'Precondition failed: test JSON text did not decode as expected.'
611+
'Decoded JSON did not match expected value.'
612612
);
613613

614614
$processor = new WP_HTML_Tag_Processor( '<script type="application/json"></script>' );
615615
$processor->next_tag( 'SCRIPT' );
616616

617-
$processor->set_modifiable_text( $json_text );
617+
$processor->set_modifiable_text( "\n{$json_text}\n" );
618618

619619
$expected = <<<'HTML'
620-
<script type="application/json">"Escaped BS: \\; Escaped BS+LT: \\\u003C; Unescaped LT: \u003C; Script closer: \u003C/script>"</script>
620+
<script type="application/json">
621+
"Escaped BS: \\; Escaped BS+LT: \\\u003C; Unescaped LT: \u003C; Script closer: \u003C/script>"
622+
</script>
621623
HTML;
622624

623625
$updated_html = $processor->get_updated_html();
@@ -627,7 +629,7 @@ public function test_json_auto_escaping() {
627629
$processor = new WP_HTML_Tag_Processor( $expected );
628630
$processor->next_tag( 'SCRIPT' );
629631
$decoded_json_from_html = json_decode( $processor->get_modifiable_text(), true );
630-
$this->assertSame( JSON_ERROR_NONE, json_last_error(), 'Precondition failed, JSON did not decode as expected: ' . json_last_error_msg() );
632+
$this->assertSame( JSON_ERROR_NONE, json_last_error(), 'JSON failed to decode.' );
631633
$this->assertEquals(
632634
$expected_decoded_json,
633635
$decoded_json_from_html

tests/phpunit/tests/html-api/wpHtmlTagProcessorScriptTag.php

Lines changed: 31 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,13 @@ class Tests_HtmlApi_WpHtmlTagProcessorScriptTag extends WP_UnitTestCase {
2323
public function test_is_javascript_script_tag( string $html, bool $expected_result ) {
2424
$processor = new WP_HTML_Tag_Processor( $html );
2525
$processor->next_tag();
26+
27+
$result = ( function () {
28+
return $this->is_javascript_script_tag();
29+
} )->call( $processor );
2630
$this->assertSame(
2731
$expected_result,
28-
$processor->is_javascript_script_tag(),
32+
$result,
2933
'Failed to correctly identify JavaScript script tag'
3034
);
3135
}
@@ -98,6 +102,9 @@ public static function data_is_javascript_script_tag(): array {
98102
'Script tag with language="jscript"' => array( '<script language="jscript"></script>', true ),
99103
'Script tag with language="livescript"' => array( '<script language="livescript"></script>', true ),
100104

105+
// Whitespace is not trimmed in the langauge attribute.
106+
'Script tag with language=" javascript"' => array( '<script language=" javascript"></script>', false ),
107+
101108
// Non-JavaScript script tags - should NOT be JavaScript.
102109
'Script tag with importmap type' => array( '<script type="importmap"></script>', false ),
103110
'Script tag with speculationrules type' => array( '<script type="speculationrules"></script>', false ),
@@ -122,9 +129,11 @@ public static function data_is_javascript_script_tag(): array {
122129
public function test_is_javascript_script_tag_returns_false_before_finding_tags() {
123130
$processor = new WP_HTML_Tag_Processor( 'Just some text' );
124131
$processor->next_token();
125-
132+
$result = ( function () {
133+
return $this->is_javascript_script_tag();
134+
} )->call( $processor );
126135
$this->assertFalse(
127-
$processor->is_javascript_script_tag(),
136+
$result,
128137
'Should return false when not stopped on script tag'
129138
);
130139
}
@@ -137,8 +146,13 @@ public function test_is_javascript_script_tag_returns_false_before_finding_tags(
137146
public function test_is_javascript_script_tag_returns_false_for_non_html_namespace() {
138147
$processor = new WP_HTML_Tag_Processor( '<script></script>' );
139148
$processor->change_parsing_namespace( 'svg' );
149+
$processor->next_tag();
150+
$this->assertSame( 'SCRIPT', $processor->get_tag() );
151+
$result = ( function () {
152+
return $this->is_javascript_script_tag();
153+
} )->call( $processor );
140154
$this->assertFalse(
141-
$processor->is_javascript_script_tag(),
155+
$result,
142156
'Should return false for script tags in non-HTML namespace'
143157
);
144158
}
@@ -156,9 +170,12 @@ public function test_is_javascript_script_tag_returns_false_for_non_html_namespa
156170
public function test_is_json_script_tag( string $html, bool $expected_result ) {
157171
$processor = new WP_HTML_Tag_Processor( $html );
158172
$processor->next_tag();
173+
$result = ( function () {
174+
return $this->is_json_script_tag();
175+
} )->call( $processor );
159176
$this->assertSame(
160177
$expected_result,
161-
$processor->is_json_script_tag(),
178+
$result,
162179
'Failed to correctly identify JSON script tag'
163180
);
164181
}
@@ -222,8 +239,11 @@ public static function data_is_json_script_tag(): array {
222239
public function test_is_json_script_tag_returns_false_before_finding_tags() {
223240
$processor = new WP_HTML_Tag_Processor( 'Just some text' );
224241
$processor->next_token();
242+
$result = ( function () {
243+
return $this->is_json_script_tag();
244+
} )->call( $processor );
225245
$this->assertFalse(
226-
$processor->is_json_script_tag(),
246+
$result,
227247
'Should return false when not stopped on script tag'
228248
);
229249
}
@@ -237,8 +257,12 @@ public function test_is_json_script_tag_returns_false_for_non_html_namespace() {
237257
$processor = new WP_HTML_Tag_Processor( '<script></script>' );
238258
$processor->change_parsing_namespace( 'svg' );
239259
$processor->next_tag();
260+
$this->assertSame( 'SCRIPT', $processor->get_tag() );
261+
$result = ( function () {
262+
return $this->is_json_script_tag();
263+
} )->call( $processor );
240264
$this->assertFalse(
241-
$processor->is_json_script_tag(),
265+
$result,
242266
'Should return false for script tags in non-HTML namespace'
243267
);
244268
}

0 commit comments

Comments
 (0)