Skip to content

Commit 4a8ca98

Browse files
committed
Auto-escape JavaScript and JSON script tags when necessary
1 parent eda8d9d commit 4a8ca98

File tree

2 files changed

+373
-21
lines changed

2 files changed

+373
-21
lines changed

src/wp-includes/html-api/class-wp-html-tag-processor.php

Lines changed: 230 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3811,29 +3811,83 @@ public function set_modifiable_text( string $plaintext_content ): bool {
38113811

38123812
switch ( $this->get_tag() ) {
38133813
case 'SCRIPT':
3814-
/**
3815-
* This is over-protective, but ensures the update doesn't break
3816-
* the HTML structure of the SCRIPT element.
3814+
/*
3815+
* SCRIPT tag contents can be dangerous.
3816+
*
3817+
* The text `</script>` could close the SCRIPT element prematurely.
38173818
*
3818-
* More thorough analysis could track the HTML tokenizer states
3819-
* and to ensure that the SCRIPT element closes at the expected
3820-
* SCRIPT close tag as is done in {@see ::skip_script_data()}.
3819+
* The text `<script>` could enter the "script data double escaped state", preventing the
3820+
* SCRIPT element from closing as expected, for example:
38213821
*
3822-
* A SCRIPT element could be closed prematurely by contents
3823-
* like `</script>`. A SCRIPT element could be prevented from
3824-
* closing by contents like `<!--<script>`.
3822+
* <script>
3823+
* // If this "<!--" then "<script>" the closing tag will not be recognized.
3824+
* </script>
3825+
* <h1>This appears inside the preceding SCRIPT element.</h1>
38253826
*
3826-
* The following strings are essential for dangerous content,
3827-
* although they are insufficient on their own. This trade-off
3828-
* prevents dangerous scripts from being sent to the browser.
3829-
* It is also unlikely to produce HTML that may confuse more
3830-
* basic HTML tooling.
3827+
* The relevant state transitions happen on text like:
3828+
* 1. <
3829+
* 2. / (optional)
3830+
* 3. script (case-insensitive)
3831+
* 4. One of the following characters:
3832+
* - \t
3833+
* - \n
3834+
* - \r (\r and \r\n newlines are normalized to \n in HTML pre-processing)
3835+
* - \f
3836+
* - " " (U+0020 SPACE)
3837+
* - /
3838+
* - >
3839+
*
3840+
* @see https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
38313841
*/
38323842
if (
38333843
false !== stripos( $plaintext_content, '</script' ) ||
38343844
false !== stripos( $plaintext_content, '<script' )
38353845
) {
3836-
return false;
3846+
/*
3847+
* JavaScript can be safely escaped.
3848+
*/
3849+
if ( $this->is_javascript_script_tag() ) {
3850+
$plaintext_content = preg_replace_callback(
3851+
/*
3852+
* This case-insensitive pattern consists of three groups:
3853+
*
3854+
* 1: "<" or "</"
3855+
* 2: "s"
3856+
* 3: "cript" + a trailing character that terminates a tag name.
3857+
*/
3858+
'~(</?)(s)(cript[\\t\\r\\n\\f />])~i',
3859+
static function ( $matches ) {
3860+
$escaped_s_char = 's' === $matches[2]
3861+
? '\\u0073'
3862+
: '\\u0053';
3863+
return "{$matches[1]}{$escaped_s_char}{$matches[3]}";
3864+
},
3865+
$plaintext_content
3866+
);
3867+
} elseif ( $this->is_json_script_tag() ) {
3868+
/**
3869+
* JSON can be safely escaped.
3870+
*
3871+
* The following replacement may appear insufficient: "<" is replaced
3872+
* with its JSON escape sequence "\u003C" without considering whether
3873+
* the "<" is preceded by an escaping slash. JSON does not support
3874+
* arbitrary character escaping (like JavaScript strings) so "\<"
3875+
* is invalid JSON and would have to be preceded by
3876+
* an escaped backslash: "\\<".
3877+
*
3878+
* @see https://www.json.org/json-en.html
3879+
*/
3880+
3881+
$plaintext_content = strtr(
3882+
$plaintext_content,
3883+
array( '<' => '\\u003C' )
3884+
);
3885+
} else {
3886+
/*
3887+
* Other types of script tags cannot be escaped safely.
3888+
*/
3889+
return false;
3890+
}
38373891
}
38383892

38393893
$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
@@ -3891,6 +3945,167 @@ static function ( $tag_match ) {
38913945
return false;
38923946
}
38933947

3948+
/**
3949+
* Indicates if the currently matched tag is a JavaScript script tag.
3950+
*
3951+
* @see https://html.spec.whatwg.org/multipage/scripting.html#prepare-the-script-element
3952+
*
3953+
* @since {WP_VERSION}
3954+
*
3955+
* @return bool True if the script tag will be evaluated as JavaScript.
3956+
*/
3957+
public function is_javascript_script_tag(): bool {
	// Anything other than a SCRIPT element in the HTML namespace is rejected outright.
	if ( 'SCRIPT' !== $this->get_tag() || $this->get_namespace() !== 'html' ) {
		return false;
	}

	/*
	 * > If any of the following are true:
	 * >   - el has a type attribute whose value is the empty string;
	 * >   - el has no type attribute but it has a language attribute and that attribute's
	 * >     value is the empty string; or
	 * >   - el has neither a type attribute nor a language attribute,
	 * > then let the script block's type string for this script element be "text/javascript".
	 *
	 * NOTE(review): the comparisons below treat `true` as an attribute present without
	 * a value and `null` as a missing attribute; confirm against `get_attribute()`.
	 */
	$type_attr     = $this->get_attribute( 'type' );
	$language_attr = $this->get_attribute( 'language' );

	if ( true === $type_attr || '' === $type_attr ) {
		return true;
	}

	if (
		null === $type_attr && (
			true === $language_attr ||
			'' === $language_attr ||
			null === $language_attr
		)
	) {
		return true;
	}

	/*
	 * > Otherwise, if el has a type attribute, then let the script block's type string be
	 * > the value of that attribute with leading and trailing ASCII whitespace stripped.
	 * > Otherwise, el has a non-empty language attribute; let the script block's type string
	 * > be the concatenation of "text/" and the value of el's language attribute.
	 *
	 * The decision must be based on the presence of the type attribute, not on its
	 * truthiness: the string "0" is falsy in PHP, and a truthiness test would wrongly
	 * fall through to the language attribute for `<script type="0" language="javascript">`.
	 */
	$type_string = is_string( $type_attr )
		? trim( $type_attr, " \t\f\r\n" )
		: "text/{$language_attr}";

	/*
	 * > If the script block's type string is a JavaScript MIME type essence match, then
	 * > set el's type to "classic".
	 *
	 * > A string is a JavaScript MIME type essence match if it is an ASCII case-insensitive
	 * > match for one of the JavaScript MIME type essence strings.
	 *
	 * > A JavaScript MIME type is any MIME type whose essence is one of the following:
	 * >
	 * > - application/ecmascript
	 * > - application/javascript
	 * > - application/x-ecmascript
	 * > - application/x-javascript
	 * > - text/ecmascript
	 * > - text/javascript
	 * > - text/javascript1.0
	 * > - text/javascript1.1
	 * > - text/javascript1.2
	 * > - text/javascript1.3
	 * > - text/javascript1.4
	 * > - text/javascript1.5
	 * > - text/jscript
	 * > - text/livescript
	 * > - text/x-ecmascript
	 * > - text/x-javascript
	 *
	 * @see https://mimesniff.spec.whatwg.org/#javascript-mime-type-essence-match
	 * @see https://mimesniff.spec.whatwg.org/#javascript-mime-type
	 */
	switch ( strtolower( $type_string ) ) {
		case 'application/ecmascript':
		case 'application/javascript':
		case 'application/x-ecmascript':
		case 'application/x-javascript':
		case 'text/ecmascript':
		case 'text/javascript':
		case 'text/javascript1.0':
		case 'text/javascript1.1':
		case 'text/javascript1.2':
		case 'text/javascript1.3':
		case 'text/javascript1.4':
		case 'text/javascript1.5':
		case 'text/jscript':
		case 'text/livescript':
		case 'text/x-ecmascript':
		case 'text/x-javascript':
			return true;

		/*
		 * > Otherwise, if the script block's type string is an ASCII case-insensitive match for
		 * > the string "module", then set el's type to "module".
		 *
		 * A module is evaluated as JavaScript.
		 */
		case 'module':
			return true;
	}

	/*
	 * > - Otherwise, if the script block's type string is an ASCII case-insensitive match for
	 * >   the string "importmap", then set el's type to "importmap".
	 *
	 * An importmap is JSON and not evaluated as JavaScript. This case is not handled here.
	 */

	/*
	 * > Otherwise, return. (No script is executed, and el's type is left as null.)
	 */
	return false;
}
4064+
4065+
/**
4066+
* Indicates if the currently matched tag is a JSON script tag.
4067+
*
4068+
* @since {WP_VERSION}
4069+
*
4070+
* @return bool True if the script tag should be treated as JSON.
4071+
*/
4072+
public function is_json_script_tag(): bool {
	// Reject anything that is not a SCRIPT element in the HTML namespace.
	$is_html_script = 'SCRIPT' === $this->get_tag() && 'html' === $this->get_namespace();
	if ( ! $is_html_script ) {
		return false;
	}

	// A valueless (`true`) or otherwise empty type attribute never identifies JSON.
	$raw_type = $this->get_attribute( 'type' );
	if ( true === $raw_type || ! $raw_type ) {
		return false;
	}

	// Strip surrounding ASCII whitespace and lowercase so the comparisons are case-insensitive.
	$normalized_type = strtolower( trim( $raw_type, " \t\f\r\n" ) );

	/*
	 * > …
	 * > Otherwise, if the script block's type string is an ASCII case-insensitive match for the string "importmap", then set el's type to "importmap".
	 * > Otherwise, if the script block's type string is an ASCII case-insensitive match for the string "speculationrules", then set el's type to "speculationrules".
	 * @see https://html.spec.whatwg.org/#script-processing-model
	 *
	 * > A JSON MIME type is any MIME type whose subtype ends in "+json" or whose essence
	 * > is "application/json" or "text/json".
	 *
	 * @see https://mimesniff.spec.whatwg.org/#json-mime-type
	 */
	$exact_json_types = array(
		'application/json',
		'importmap',
		'speculationrules',
		'text/json',
	);

	if ( in_array( $normalized_type, $exact_json_types, true ) ) {
		return true;
	}

	// Any remaining type counts as JSON only when it carries the "+json" suffix.
	return str_ends_with( $normalized_type, '+json' );
}
4108+
38944109
/**
38954110
* Updates or creates a new attribute on the currently matched tag with the passed value.
38964111
*

0 commit comments

Comments
 (0)