@@ -3812,116 +3812,22 @@ public function set_modifiable_text( string $plaintext_content ): bool {
38123812 switch ( $ this ->get_tag () ) {
38133813 case 'SCRIPT ' :
38143814 /*
3815- * SCRIPT tag contents can be dangerous.
3815+ * SCRIPT tag contents can be dangerous:
38163816 *
3817- * The text "</script>" could close the SCRIPT element prematurely.
3817+ * - "</script>" could close the SCRIPT element prematurely.
3818+ * - "<script>" could enter the “script data double escaped state”, preventing the
3819+ * SCRIPT element from closing as expected.
38183820 *
3819- * The text "<script>" could enter the “script data double escaped state”, preventing the
3820- * SCRIPT element from closing as expected, for example:
3821- *
3822- * <script>
3823- * // If "<!--" and "<script>" appear like this,
3824- * // the following SCRIPT close tag will not be recognized.
3825- * </script>
3826- * <h1>This appears _inside_ the preceding SCRIPT element.</h1>
3827- *
3828- * The relevant state transitions happen on text like:
3829- * 1. <
3830- * 2. / (optional)
3831- * 3. script (case-insensitive)
3832- * 4. One of the following characters:
3833- * - \t
3834- * - \n
3835- * - \r (\r and \r\n newlines are normalized to \n in HTML pre-processing)
3836- * - \f
3837- * - " " (U+0020 SPACE)
3838- * - /
3839- * - >
3840- *
3841- * @see https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
3821+ * Identify risky script contents to escape when possible or reject otherwise.
38423822 */
3843- if (
3823+ $ needs_escaping =
38443824 false !== stripos ( $ plaintext_content , '</script ' ) ||
3845- false !== stripos ( $ plaintext_content , '<script ' )
3846- ) {
3847- /*
3848- * JavaScript can be safely escaped with a few exceptions. This is achieved by
3849- * replacing dangerous sequences like "<script" and "</script" with a form
3850- * using a Unicode escape sequence "<\u0073cript>" and "</\u0073cript>".
3851- *
3852- * `<script` and `</script` appear in JavaScript source code in limited places,
3853- * all of which support a Unicode escape sequence on the "s" character.
3854- * JavaScript identifiers, string literals, template literals, and RegExp
3855- * literals all support Unicode escape sequences, meaning that the escaped form
3856- * is indistinguishable from the unescaped form when the JavaScript
3857- * is evaluated.
3858- *
3859- * There are a few exceptions where the escaped form can be detected:
3860- *
3861- * - The escaped form would appear in any JavaScript comments.
3862- * - “Raw” strings via `String.raw()` or the `raw` property of the first
3863- * argument to a tagged template literal exposes the raw form, revealing any
3864- * escaping that has been applied.
3865- * - The `source` property of a RegExp object reveals an escaped form the of
3866- * the pattern.
3867- *
3868- * For JavaScript that needs to avoid these issues, workarounds may
3869- * be available. For example:
3870- *
3871- * // Instead of this:
3872- * const rawStringWillBeEscaped = String.raw`</script>`;
3873- *
3874- * // This is a safe alternative:
3875- * const rawStringWillBePreserved = String.raw`</scr` + String.raw`ipt>`;
3876- *
3877- * After escaping, the JavaScript result looks like this:
3878- *
3879- * const rawStringWillBeEscaped = String.raw`</\u0073cript>`;
3880- * // Evaluates to `'</\\u0073cript>'`.
3881- *
3882- * const rawStringWillBePreserved = String.raw`</scr` + String.raw`ipt>`;
3883- * // Evaluates to `'</script>'`.
3884- *
3885- * Escaping is applied only where strictly necessary, reducing the likelyhood
3886- * that observable differences manifest in the escaped JavaScript.
3887- *
3888- * This escaping strategy strikes will make ALL JavaScript safe to embed in
3889- * HTML in a way that is completely transparent in most cases.
3890- */
3825+ false !== stripos ( $ plaintext_content , '<script ' );
3826+ if ( $ needs_escaping ) {
38913827 if ( $ this ->is_javascript_script_tag () ) {
3892- $ plaintext_content = preg_replace_callback (
3893- /*
3894- * This case-insensitive pattern consists of three groups (in order):
3895- *
3896- * HEAD: "<" or "</"
3897- * S_CHAR: "s"
3898- * TAIL: "cript" + a trailing tag name termination character
3899- */
3900- '~(?P<HEAD></?)(?P<S_CHAR>s)(?P<TAIL>cript[ \\t \\r \\n \\f />])~i ' ,
3901- static function ( $ matches ) {
3902- $ escaped_s_char = 's ' === $ matches ['S_CHAR ' ]
3903- ? '\\u0073 '
3904- : '\\u0053 ' ;
3905- return "{$ matches ['HEAD ' ]}{$ escaped_s_char }{$ matches ['TAIL ' ]}" ;
3906- },
3907- $ plaintext_content
3908- );
3828+ $ plaintext_content = $ this ->escape_javascript_script_contents ( $ plaintext_content );
39093829 } elseif ( $ this ->is_json_script_tag () ) {
3910- /**
3911- * JSON can be safely escaped.
3912- *
3913- * The following replacement may appear insufficient, "<" is replaced
3914- * with its JSON escape sequence "\u003C" without considering whether
3915- * the "<" is preceded by an escaping backslash. JSON does not support
3916- * arbitrary character escaping in strings (unlike JavaScript) so "\<"
3917- * is invalid JSON and does not need to be considered.
3918- *
3919- * @see https://www.json.org/json-en.html
3920- */
3921- $ plaintext_content = strtr (
3922- $ plaintext_content ,
3923- array ( '< ' => '\\u003C ' )
3924- );
3830+ $ plaintext_content = $ this ->escape_json_script_contents ( $ plaintext_content );
39253831 } else {
39263832 /*
39273833 * Other types of script tags cannot be escaped safely because the type
@@ -4148,6 +4054,83 @@ private function is_json_script_tag(): bool {
41484054 return false ;
41494055 }
41504056
4057+ /**
4058+ * Escape JavaScript script tag contents.
4059+ *
4060+ * Prevent JavaScript text from modifying the HTML structure of a document and
4061+ * ensure that it's contained within its enclosing SCRIPT tag as intended.
4062+ *
4063+ * JavaScript can be safely escaped with a few exceptions. This is achieved by
4064+ * replacing dangerous sequences like "<script" and "</script" with a form
4065+ * using a Unicode escape sequence: "<\u0073cript" and "</\u0073cript".
4066+ *
4067+ * This text may appear in the JavaScript in limited ways, all of which support
4068+ * the use of Unicode escape sequences on the "s" character. The escaping is safe
4069+ * to perform in all JavaScript and the modified JavaScript maintains identical
4070+ * behavior with a few exceptions:
4071+ *
4072+ * - Comments.
4073+ * - Tagged templates like `String.raw()` that access “raw” strings.
4074+ * - The `source` property of a RegExp object.
4075+ *
4076+ * For example, this input JavaScript:
4077+ *
4078+ * // A comment: "</script>"
4079+ *
4080+ * console.log( String.raw`</script>` );
4081+ *
4082+ * const regex = /<script>/;
4083+ * console.log( regex.source );
4084+ *
4085+ * Is transformed to:
4086+ *
4087+ * // A comment: "</\u0073cript>"
4088+ *
4089+ * console.log( String.raw`</\u0073cript>` );
4090+ *
4091+ * const regex = /<\u0073cript>/;
4092+ * console.log( regex.source );
4093+ *
4094+ * Note that the RegExp's matching behavior is equivalent, meaning that
4095+ * `regex.test( '<script>' ) === true` in both the unescaped and
4096+ * escaped versions.
4097+ *
4098+ * @see https://html.spec.whatwg.org/#restrictions-for-contents-of-script-elements
4099+ */
4100+ private function escape_javascript_script_contents ( string $ text ): string {
4101+ return preg_replace_callback (
4102+ '~(?P<HEAD></?)(?P<S_CHAR>s)(?P<TAIL>cript[ \\t \\r \\n \\f />])~i ' ,
4103+ static function ( $ matches ) {
4104+ $ escaped_s_char = 's ' === $ matches ['S_CHAR ' ]
4105+ ? '\\u0073 '
4106+ : '\\u0053 ' ;
4107+ return "{$ matches ['HEAD ' ]}{$ escaped_s_char }{$ matches ['TAIL ' ]}" ;
4108+ },
4109+ $ text
4110+ );
4111+ }
4112+
4113+ /**
4114+ * Escape JSON script tag contents.
4115+ *
4116+ * Prevent JSON text from modifying the HTML structure of a document and
4117+ * ensure that it's contained within its enclosing SCRIPT tag as intended.
4118+ *
4119+ * JSON can be escaped simply by replacing "<" with its Unicode escape
4120+ * sequence "\u003C". "<" is not part of the JSON syntax and only appears
4121+ * in JSON strings, so it's always safe to escape. Furthermore, JSON
4122+ * does not allow backslash escaping of "<" ("\<" is invalid JSON), so there's no need to
4123+ * consider whether the "<" is escaped.
4124+ *
4125+ * @see https://www.json.org/json-en.html
4126+ */
4127+ private function escape_json_script_contents ( string $ text ): string {
4128+ return strtr (
4129+ $ text ,
4130+ array ( '< ' => '\\u003C ' )
4131+ );
4132+ }
4133+
41514134 /**
41524135 * Updates or creates a new attribute on the currently matched tag with the passed value.
41534136 *
0 commit comments