@@ -3814,15 +3814,16 @@ public function set_modifiable_text( string $plaintext_content ): bool {
38143814 /*
38153815 * SCRIPT tag contents can be dangerous.
38163816 *
3817- * The text ` </script>` could close the SCRIPT element prematurely.
3817+ * The text " </script>" could close the SCRIPT element prematurely.
38183818 *
3819- * The text ` <script>` could enter the " script data double escaped state" , preventing the
3819+ * The text " <script>" could enter the “ script data double escaped state” , preventing the
38203820 * SCRIPT element from closing as expected, for example:
38213821 *
38223822 * <script>
3823- * // If this "<!--" then "<script>" the closing tag will not be recognized.
3823+ * // If "<!--" and "<script>" appear like this,
3824+ * // the following `</script>` close tag will not be recognized.
38243825 * </script>
3825- * <h1>This appears inside the preceding SCRIPT element.</h1>
3826+ * <h1>This appears _inside_ the preceding SCRIPT element.</h1>
38263827 *
38273828 * The relevant state transitions happen on text like:
38283829 * 1. <
@@ -3844,23 +3845,63 @@ public function set_modifiable_text( string $plaintext_content ): bool {
38443845 false !== stripos ( $ plaintext_content , '<script ' )
38453846 ) {
38463847 /*
3847- * JavaScript can be safely escaped.
3848+ * JavaScript can be safely escaped with a few exceptions. This is achieved by
3849+ * replacing dangerous sequences like "<script" and "</script" with a form
3850+ * using a Unicode escape sequence "<\u0073cript" and "</\u0073cript".
3851+ *
3852+ * `<script` and `</script` appear in JavaScript source code in limited places,
3853+ * all of which support a Unicode escape sequence on the "s" character.
3854+ * JavaScript identifiers, string literals, template literals, and RegExp
3855+ * literals all support Unicode escape sequences, meaning that the escaped form
3856+ * is indistinguishable from the unescaped form when the JavaScript
3857+ * is evaluated.
3858+ *
3859+ * There are a few exceptions where the escaped form can be detected:
3860+ *
3861+ * - The escaped form would appear in any JavaScript comments.
3862+ * - “Raw” strings via `String.raw()` or the `raw` property of the first
3863+ * argument to a tagged template literal expose the raw form, revealing any
3864+ * escaping that has been applied.
3865+ * The `source` property of a RegExp object reveals the escaped form of
3866+ * the pattern.
3867+ *
3868+ * For JavaScript that needs to avoid these issues, workarounds may
3869+ * be available. For example:
3870+ *
3871+ * // Instead of:
3872+ * const rawStringWillBeEscaped = String.raw`</script>`;
3873+ *
3874+ * // This will yield the same result with no escaping required:
3875+ * const rawStringWillBePreserved = String.raw`</scr` + String.raw`ipt>`;
3876+ *
3877+ * // After the escaping has been applied and the JavaScript evaluated,
3878+ * // these are the resulting values:
3879+ * rawStringWillBeEscaped; // "</\\u0073cript>"
3880+ * rawStringWillBePreserved; // "</script>"
3881+ *
3882+ *
3883+ * Escaping is applied only where strictly necessary, reducing the likelihood
3884+ * that observable differences manifest in the escaped JavaScript.
3885+ *
3886+ * The alternatives are to reject JavaScript that could be safely escaped in
3887+ * a majority of cases or to relax restrictions in ways that produce dangerous
3888+ * or broken HTML documents, neither of which is desirable.
38483889 */
38493890 if ( $ this ->is_javascript_script_tag () ) {
38503891 $ plaintext_content = preg_replace_callback (
38513892 /*
3852- * This case-insensitive pattern consists of three groups:
3893+ * This case-insensitive pattern consists of three groups (in order) :
38533894 *
3854- * 1: "<" or "</"
3855- * 2 : "s"
3856- * 3: "cript" + a trailing character that terminates a tag name.
3895+ * HEAD: "<" or "</"
3896+ * S_CHAR : "s"
3897+ * TAIL: "cript" + a trailing tag name termination character
38573898 */
3858- '~(< /?)(s)(cript[ \\t \\r \\n \\f />])~i ' ,
3899+ '~(?P<HEAD>< /?)(?P<S_CHAR> s)(?P<TAIL> cript[ \\t \\r \\n \\f />])~i ' ,
38593900 static function ( $ matches ) {
3860- $ escaped_s_char = 's ' === $ matches [2 ]
3901+ $ escaped_s_char = 's ' === $ matches [' S_CHAR ' ]
38613902 ? '\\u0073 '
38623903 : '\\u0053 ' ;
3863- return "{$ matches [1 ]}{$ escaped_s_char }{$ matches [3 ]}" ;
3904+ return "{$ matches [' HEAD ' ]}{$ escaped_s_char }{$ matches [' TAIL ' ]}" ;
38643905 },
38653906 $ plaintext_content
38663907 );
@@ -3882,7 +3923,8 @@ static function ( $matches ) {
38823923 );
38833924 } else {
38843925 /*
3885- * Other types of script tags cannot be escaped safely.
3926+ * Other types of script tags cannot be escaped safely because the type
3927+ * of comment and escaping strategy are unknown.
38863928 */
38873929 return false ;
38883930 }
@@ -3948,11 +3990,14 @@ static function ( $tag_match ) {
39483990 *
39493991 * @see https://html.spec.whatwg.org/multipage/scripting.html#prepare-the-script-element
39503992 *
3951- * @since {WP_VERSION}
3993+ * @ignore
3994+ * @todo Consider a public API that is clear and general.
3995+ *
3996+ * @since 7.0.0
39523997 *
39533998 * @return bool True if the script tag will be evaluated as JavaScript.
39543999 */
3955- public function is_javascript_script_tag (): bool {
4000+ private function is_javascript_script_tag (): bool {
39564001 if ( 'SCRIPT ' !== $ this ->get_tag () || $ this ->get_namespace () !== 'html ' ) {
39574002 return false ;
39584003 }
@@ -4059,11 +4104,14 @@ public function is_javascript_script_tag(): bool {
40594104 /**
40604105 * Indicates if the currently matched tag is a JSON script tag.
40614106 *
4062- * @since {WP_VERSION}
4107+ * @ignore
4108+ * @todo Consider a public API that is clear and general.
4109+ *
4110+ * @since 7.0.0
40634111 *
40644112 * @return bool True if the script tag should be treated as JSON.
40654113 */
4066- public function is_json_script_tag (): bool {
4114+ private function is_json_script_tag (): bool {
40674115 if ( 'SCRIPT ' !== $ this ->get_tag () || $ this ->get_namespace () !== 'html ' ) {
40684116 return false ;
40694117 }
@@ -4083,16 +4131,15 @@ public function is_json_script_tag(): bool {
40834131 * > A JSON MIME type is any MIME type whose subtype ends in "+json" or whose essence
40844132 * > is "application/json" or "text/json".
40854133 *
4086- * The JSON subtype ending in "+json" is not currently handled due to lack
4087- * of a MIME type parser.
4134+ * @todo The JSON MIME type handling handles some common cases but when MIME type parsing is available it should be leveraged here.
40884135 *
40894136 * @see https://mimesniff.spec.whatwg.org/#json-mime-type
40904137 */
40914138 if (
4092- 'application/json ' === $ type
4093- || ' importmap ' === $ type
4094- || ' speculationrules ' === $ type
4095- || 'text/json ' === $ type
4139+ 'importmap ' === $ type ||
4140+ ' speculationrules ' === $ type ||
4141+ ' application/json ' === $ type ||
4142+ 'text/json ' === $ type
40964143 ) {
40974144 return true ;
40984145 }
0 commit comments