Skip to content

Commit cb6b990

Browse files
committed
Extract and document escaping functions
1 parent 2ef0bf0 commit cb6b990

File tree

1 file changed

+87
-104
lines changed

1 file changed

+87
-104
lines changed

src/wp-includes/html-api/class-wp-html-tag-processor.php

Lines changed: 87 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -3812,116 +3812,22 @@ public function set_modifiable_text( string $plaintext_content ): bool {
38123812
switch ( $this->get_tag() ) {
38133813
case 'SCRIPT':
38143814
/*
3815-
* SCRIPT tag contents can be dangerous.
3815+
* SCRIPT tag contents can be dangerous:
38163816
*
3817-
* The text "</script>" could close the SCRIPT element prematurely.
3817+
* - "</script>" could close the SCRIPT element prematurely.
3818+
* - "<script>" could enter the “script data double escaped state”, preventing the
3819+
* SCRIPT element from closing as expected.
38183820
*
3819-
* The text "<script>" could enter the “script data double escaped state”, preventing the
3820-
* SCRIPT element from closing as expected, for example:
3821-
*
3822-
* <script>
3823-
* // If "<!--" and "<script>" appear like this,
3824-
* // the following SCRIPT close tag will not be recognized.
3825-
* </script>
3826-
* <h1>This appears _inside_ the preceding SCRIPT element.</h1>
3827-
*
3828-
* The relevant state transitions happen on text like:
3829-
* 1. <
3830-
* 2. / (optional)
3831-
* 3. script (case-insensitive)
3832-
* 4. One of the following characters:
3833-
* - \t
3834-
* - \n
3835-
* - \r (\r and \r\n newlines are normalized to \n in HTML pre-processing)
3836-
* - \f
3837-
* - " " (U+0020 SPACE)
3838-
* - /
3839-
* - >
3840-
*
3841-
* @see https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
3821+
* Identify risky script contents to escape when possible or reject otherwise.
38423822
*/
3843-
if (
3823+
$needs_escaping =
38443824
false !== stripos( $plaintext_content, '</script' ) ||
3845-
false !== stripos( $plaintext_content, '<script' )
3846-
) {
3847-
/*
3848-
* JavaScript can be safely escaped with a few exceptions. This is achieved by
3849-
* replacing dangerous sequences like "<script" and "</script" with a form
3850-
* using a Unicode escape sequence "<\u0073cript>" and "</\u0073cript>".
3851-
*
3852-
* `<script` and `</script` appear in JavaScript source code in limited places,
3853-
* all of which support a Unicode escape sequence on the "s" character.
3854-
* JavaScript identifiers, string literals, template literals, and RegExp
3855-
* literals all support Unicode escape sequences, meaning that the escaped form
3856-
* is indistinguishable from the unescaped form when the JavaScript
3857-
* is evaluated.
3858-
*
3859-
* There are a few exceptions where the escaped form can be detected:
3860-
*
3861-
* - The escaped form would appear in any JavaScript comments.
3862-
* - “Raw” strings via `String.raw()` or the `raw` property of the first
3863-
* argument to a tagged template literal exposes the raw form, revealing any
3864-
* escaping that has been applied.
3865-
* - The `source` property of a RegExp object reveals an escaped form the of
3866-
* the pattern.
3867-
*
3868-
* For JavaScript that needs to avoid these issues, workarounds may
3869-
* be available. For example:
3870-
*
3871-
* // Instead of this:
3872-
* const rawStringWillBeEscaped = String.raw`</script>`;
3873-
*
3874-
* // This is a safe alternative:
3875-
* const rawStringWillBePreserved = String.raw`</scr` + String.raw`ipt>`;
3876-
*
3877-
* After escaping, the JavaScript result looks like this:
3878-
*
3879-
* const rawStringWillBeEscaped = String.raw`</\u0073cript>`;
3880-
* // Evaluates to `'</\\u0073cript>'`.
3881-
*
3882-
* const rawStringWillBePreserved = String.raw`</scr` + String.raw`ipt>`;
3883-
* // Evaluates to `'</script>'`.
3884-
*
3885-
* Escaping is applied only where strictly necessary, reducing the likelyhood
3886-
* that observable differences manifest in the escaped JavaScript.
3887-
*
3888-
* This escaping strategy strikes will make ALL JavaScript safe to embed in
3889-
* HTML in a way that is completely transparent in most cases.
3890-
*/
3825+
false !== stripos( $plaintext_content, '<script' );
3826+
if ( $needs_escaping ) {
38913827
if ( $this->is_javascript_script_tag() ) {
3892-
$plaintext_content = preg_replace_callback(
3893-
/*
3894-
* This case-insensitive pattern consists of three groups (in order):
3895-
*
3896-
* HEAD: "<" or "</"
3897-
* S_CHAR: "s"
3898-
* TAIL: "cript" + a trailing tag name termination character
3899-
*/
3900-
'~(?P<HEAD></?)(?P<S_CHAR>s)(?P<TAIL>cript[\\t\\r\\n\\f />])~i',
3901-
static function ( $matches ) {
3902-
$escaped_s_char = 's' === $matches['S_CHAR']
3903-
? '\\u0073'
3904-
: '\\u0053';
3905-
return "{$matches['HEAD']}{$escaped_s_char}{$matches['TAIL']}";
3906-
},
3907-
$plaintext_content
3908-
);
3828+
$plaintext_content = $this->escape_javascript_script_contents( $plaintext_content );
39093829
} elseif ( $this->is_json_script_tag() ) {
3910-
/**
3911-
* JSON can be safely escaped.
3912-
*
3913-
* The following replacement may appear insufficient, "<" is replaced
3914-
* with its JSON escape sequence "\u003C" without considering whether
3915-
* the "<" is preceded by an escaping backslash. JSON does not support
3916-
* arbitrary character escaping in strings (unlike JavaScript) so "\<"
3917-
* is invalid JSON and does not need to be considered.
3918-
*
3919-
* @see https://www.json.org/json-en.html
3920-
*/
3921-
$plaintext_content = strtr(
3922-
$plaintext_content,
3923-
array( '<' => '\\u003C' )
3924-
);
3830+
$plaintext_content = $this->escape_json_script_contents( $plaintext_content );
39253831
} else {
39263832
/*
39273833
* Other types of script tags cannot be escaped safely because the type
@@ -4148,6 +4054,83 @@ private function is_json_script_tag(): bool {
41484054
return false;
41494055
}
41504056

4057+
/**
4058+
* Escape JavaScript script tag contents.
4059+
*
4060+
* Prevent JavaScript text from modifying the HTML structure of a document and
4061+
* ensure that it's contained within its enclosing SCRIPT tag as intended.
4062+
*
4063+
* JavaScript can be safely escaped with a few exceptions. This is achieved by
4064+
* replacing dangerous sequences like "<script" and "</script" with a form
4065+
* using a Unicode escape sequence "<\u0073cript>" and "</\u0073cript>".
4066+
*
4067+
* This text may appear in the JavaScript in limited ways, all of which support
4068+
* the use of Unicode escape sequences on the "s" character. The escaping is safe
4069+
* to perform in all JavaScript and the modified JavaScript maintains identical
4070+
* behavior with a few exceptions:
4071+
*
4072+
* - Comments.
4073+
* - Tagged templates like `String.raw()` that access “raw” strings.
4074+
* - The `source` property of a RegExp object.
4075+
*
4076+
* For example, this input JavaScript:
4077+
*
4078+
* // A comment: "</script>"
4079+
*
4080+
* console.log( String.raw`</script>` );
4081+
*
4082+
* const regex = /<script>/;
4083+
* console.log( regex.source );
4084+
*
4085+
* Is transformed to:
4086+
*
4087+
* // A comment: "</\u0073cript>"
4088+
*
4089+
* console.log( String.raw`</\u0073cript>` );
4090+
*
4091+
* const regex = /<\u0073cript>/;
4092+
* console.log( regex.source );
4093+
*
4094+
* Note that the RegExp's matching behavior is equivalent, meaning that
4095+
* `regex.test( '<script>' ) === true` in both the unescaped and
4096+
* escaped versions.
4097+
*
4098+
* @see https://html.spec.whatwg.org/#restrictions-for-contents-of-script-elements
4099+
*/
4100+
private function escape_javascript_script_contents( string $text ): string {
4101+
return preg_replace_callback(
4102+
'~(?P<HEAD></?)(?P<S_CHAR>s)(?P<TAIL>cript[\\t\\r\\n\\f />])~i',
4103+
static function ( $matches ) {
4104+
$escaped_s_char = 's' === $matches['S_CHAR']
4105+
? '\\u0073'
4106+
: '\\u0053';
4107+
return "{$matches['HEAD']}{$escaped_s_char}{$matches['TAIL']}";
4108+
},
4109+
$text
4110+
);
4111+
}
4112+
4113+
/**
4114+
* Escape JSON script tag contents.
4115+
*
4116+
* Prevent JSON text from modifying the HTML structure of a document and
4117+
* ensure that it's contained within its enclosing SCRIPT tag as intended.
4118+
*
4119+
* JSON can be escaped simply by replacing "<" with its Unicode escape
4120+
* sequence "\u003C". "<" is not part of the JSON syntax and only appears
4121+
* in JSON strings, so it's always safe to escape. Furthermore, JSON
4122+
* does not allow backslash escaping of "<", so there's no need to
4123+
* consider whether the "<" is escaped.
4124+
*
4125+
* @see https://www.json.org/json-en.html
4126+
*/
4127+
private function escape_json_script_contents( string $text ): string {
4128+
return strtr(
4129+
$text,
4130+
array( '<' => '\\u003C' )
4131+
);
4132+
}
4133+
41514134
/**
41524135
* Updates or creates a new attribute on the currently matched tag with the passed value.
41534136
*

0 commit comments

Comments
 (0)