@@ -3811,29 +3811,83 @@ public function set_modifiable_text( string $plaintext_content ): bool {
38113811
38123812 switch ( $ this ->get_tag () ) {
38133813 case 'SCRIPT ' :
3814- /**
3815- * This is over-protective, but ensures the update doesn't break
3816- * the HTML structure of the SCRIPT element.
3814+ /*
3815+ * SCRIPT tag contents can be dangerous.
3816+ *
3817+ * The text `</script>` could close the SCRIPT element prematurely.
38173818 *
3818- * More thorough analysis could track the HTML tokenizer states
3819- * and to ensure that the SCRIPT element closes at the expected
3820- * SCRIPT close tag as is done in {@see ::skip_script_data()}.
3819+ * The text `<script>` could enter the "script data double escaped state", preventing the
3820+ * SCRIPT element from closing as expected, for example:
38213821 *
3822- * A SCRIPT element could be closed prematurely by contents
3823- * like `</script>`. A SCRIPT element could be prevented from
3824- * closing by contents like `<!--<script>`.
3822+ * <script>
3823+ * // If this "<!--" then "<script>" the closing tag will not be recognized.
3824+ * </script>
3825+ * <h1>This appears inside the preceding SCRIPT element.</h1>
38253826 *
3826- * The following strings are essential for dangerous content,
3827- * although they are insufficient on their own. This trade-off
3828- * prevents dangerous scripts from being sent to the browser.
3829- * It is also unlikely to produce HTML that may confuse more
3830- * basic HTML tooling.
3827+ * The relevant state transitions happen on text like:
3828+ * 1. <
3829+ * 2. / (optional)
3830+ * 3. script (case-insensitive)
3831+ * 4. One of the following characters:
3832+ * - \t
3833+ * - \n
3834+ * - \r (\r and \r\n newlines are normalized to \n in HTML pre-processing)
3835+ * - \f
3836+ * - " " (U+0020 SPACE)
3837+ * - /
3838+ * - >
3839+ *
3840+ * @see https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
38313841 */
38323842 if (
38333843 false !== stripos ( $ plaintext_content , '</script ' ) ||
38343844 false !== stripos ( $ plaintext_content , '<script ' )
38353845 ) {
3836- return false ;
3846+ /*
3847+ * JavaScript can be safely escaped.
3848+ */
3849+ if ( $ this ->is_javascript_script_tag () ) {
3850+ $ plaintext_content = preg_replace_callback (
3851+ /*
3852+ * This case-insensitive pattern consists of three groups:
3853+ *
3854+ * 1: "<" or "</"
3855+ * 2: "s"
3856+ * 3: "cript" + a trailing character that terminates a tag name.
3857+ */
3858+ '~(</?)(s)(cript[ \\t \\r \\n \\f />])~i ' ,
3859+ static function ( $ matches ) {
3860+ $ escaped_s_char = 's ' === $ matches [2 ]
3861+ ? '\\u0073 '
3862+ : '\\u0053 ' ;
3863+ return "{$ matches [1 ]}{$ escaped_s_char }{$ matches [3 ]}" ;
3864+ },
3865+ $ plaintext_content
3866+ );
3867+ } elseif ( $ this ->is_json_script_tag () ) {
3868+ /**
3869+ * JSON can be safely escaped.
3870+ *
3871+ * The following replacement may appear insufficient: "<" is replaced
3872+ * with its JSON escape sequence "\u003C" without considering whether
3873+ * the "<" is preceded by an escaping slash. JSON does not support
3874+ * arbitrary character escaping (like JavaScript strings) so "\<"
3875+ * is invalid JSON and would have to be preceded by
3876+ * an escaped backslash: "\\<".
3877+ *
3878+ * @see https://www.json.org/json-en.html
3879+ */
3880+
3881+ $ plaintext_content = strtr (
3882+ $ plaintext_content ,
3883+ array ( '< ' => '\\u003C ' )
3884+ );
3885+ } else {
3886+ /*
3887+ * Other types of script tags cannot be escaped safely.
3888+ */
3889+ return false ;
3890+ }
38373891 }
38383892
38393893 $ this ->lexical_updates ['modifiable text ' ] = new WP_HTML_Text_Replacement (
@@ -3891,6 +3945,167 @@ static function ( $tag_match ) {
38913945 return false ;
38923946 }
38933947
/**
 * Indicates if the currently matched tag is a JavaScript script tag.
 *
 * The decision follows the "prepare the script element" steps of the HTML
 * Standard: the script block's type string is derived from the `type` and
 * `language` attributes and is then compared against the JavaScript MIME
 * type essence strings and the string "module".
 *
 * Only the `type` and `language` attributes of the currently matched tag
 * are inspected; the contents of the element are not examined.
 *
 * @see https://html.spec.whatwg.org/multipage/scripting.html#prepare-the-script-element
 *
 * @since {WP_VERSION}
 *
 * @return bool True if the script tag will be evaluated as JavaScript.
 */
public function is_javascript_script_tag(): bool {
	// Only a SCRIPT element in the HTML namespace can qualify.
	if ( 'SCRIPT' !== $this->get_tag() || 'html' !== $this->get_namespace() ) {
		return false;
	}

	/*
	 * > If any of the following are true:
	 * >   - el has a type attribute whose value is the empty string;
	 * >   - el has no type attribute but it has a language attribute and that attribute's
	 * >     value is the empty string; or
	 * >   - el has neither a type attribute nor a language attribute,
	 * > then let the script block's type string for this script element be "text/javascript".
	 *
	 * The attribute values are compared against `true`, `''` and `null` below;
	 * presumably `true` denotes an attribute present without a value and `null`
	 * a missing attribute (verify against `get_attribute()`).
	 */
	$type_attr     = $this->get_attribute( 'type' );
	$language_attr = $this->get_attribute( 'language' );

	if ( true === $type_attr || '' === $type_attr ) {
		return true;
	}

	if (
		null === $type_attr && (
			true === $language_attr ||
			'' === $language_attr ||
			null === $language_attr
		)
	) {
		return true;
	}

	/*
	 * > Otherwise, if el has a type attribute, then let the script block's type string be
	 * > the value of that attribute with leading and trailing ASCII whitespace stripped.
	 * > Otherwise, el has a non-empty language attribute; let the script block's type string
	 * > be the concatenation of "text/" and the value of el's language attribute.
	 *
	 * The presence of the type attribute is tested with `is_string()` rather than
	 * by truthiness: the value "0" is falsy in PHP, and a truthiness test would
	 * wrongly fall through to the language attribute for `type="0"`.
	 */
	$type_string = is_string( $type_attr )
		? trim( $type_attr, " \t\f\r\n" )
		: "text/{$language_attr}";

	/*
	 * > If the script block's type string is a JavaScript MIME type essence match, then
	 * > set el's type to "classic".
	 *
	 * > A string is a JavaScript MIME type essence match if it is an ASCII case-insensitive
	 * > match for one of the JavaScript MIME type essence strings.
	 *
	 * > A JavaScript MIME type is any MIME type whose essence is one of the following:
	 * >
	 * > - application/ecmascript
	 * > - application/javascript
	 * > - application/x-ecmascript
	 * > - application/x-javascript
	 * > - text/ecmascript
	 * > - text/javascript
	 * > - text/javascript1.0
	 * > - text/javascript1.1
	 * > - text/javascript1.2
	 * > - text/javascript1.3
	 * > - text/javascript1.4
	 * > - text/javascript1.5
	 * > - text/jscript
	 * > - text/livescript
	 * > - text/x-ecmascript
	 * > - text/x-javascript
	 *
	 * @see https://mimesniff.spec.whatwg.org/#javascript-mime-type-essence-match
	 * @see https://mimesniff.spec.whatwg.org/#javascript-mime-type
	 */
	switch ( strtolower( $type_string ) ) {
		case 'application/ecmascript':
		case 'application/javascript':
		case 'application/x-ecmascript':
		case 'application/x-javascript':
		case 'text/ecmascript':
		case 'text/javascript':
		case 'text/javascript1.0':
		case 'text/javascript1.1':
		case 'text/javascript1.2':
		case 'text/javascript1.3':
		case 'text/javascript1.4':
		case 'text/javascript1.5':
		case 'text/jscript':
		case 'text/livescript':
		case 'text/x-ecmascript':
		case 'text/x-javascript':
			return true;

		/*
		 * > Otherwise, if the script block's type string is an ASCII case-insensitive match for
		 * > the string "module", then set el's type to "module".
		 *
		 * A module is evaluated as JavaScript.
		 */
		case 'module':
			return true;
	}

	/*
	 * > - Otherwise, if the script block's type string is an ASCII case-insensitive match for
	 * >   the string "importmap", then set el's type to "importmap".
	 *
	 * An importmap is JSON and not evaluated as JavaScript. This case is not handled here.
	 */

	/*
	 * > Otherwise, return. (No script is executed, and el's type is left as null.)
	 */
	return false;
}
4064+
/**
 * Indicates if the currently matched tag is a JSON script tag.
 *
 * A SCRIPT element in the HTML namespace is treated as JSON when its `type`
 * attribute, after stripping leading and trailing ASCII whitespace and
 * lower-casing, is one of "application/json", "text/json", "importmap",
 * "speculationrules", or ends in "+json".
 *
 * Only the `type` attribute is inspected; the `language` attribute and the
 * contents of the element play no part in the decision.
 *
 * @since {WP_VERSION}
 *
 * @return bool True if the script tag should be treated as JSON.
 */
public function is_json_script_tag(): bool {
	// Only a SCRIPT element in the HTML namespace can qualify.
	if ( 'SCRIPT' !== $this->get_tag() || 'html' !== $this->get_namespace() ) {
		return false;
	}

	$type_attr = $this->get_attribute( 'type' );

	/*
	 * A non-string value (compared elsewhere against `null` and `true`) or an
	 * empty string carries no type to match against, so it cannot name a JSON type.
	 */
	if ( ! is_string( $type_attr ) || '' === $type_attr ) {
		return false;
	}

	$type_string = strtolower( trim( $type_attr, " \t\f\r\n" ) );

	/*
	 * > …
	 * > Otherwise, if the script block's type string is an ASCII case-insensitive match for
	 * > the string "importmap", then set el's type to "importmap".
	 * > Otherwise, if the script block's type string is an ASCII case-insensitive match for
	 * > the string "speculationrules", then set el's type to "speculationrules".
	 *
	 * @see https://html.spec.whatwg.org/#script-processing-model
	 *
	 * > A JSON MIME type is any MIME type whose subtype ends in "+json" or whose essence
	 * > is "application/json" or "text/json".
	 *
	 * @see https://mimesniff.spec.whatwg.org/#json-mime-type
	 *
	 * The type string is compared as a whole; MIME type parameters are not parsed.
	 */
	return (
		'application/json' === $type_string
		|| 'importmap' === $type_string
		|| 'speculationrules' === $type_string
		|| 'text/json' === $type_string
		|| str_ends_with( $type_string, '+json' )
	);
}
4108+
38944109 /**
38954110 * Updates or creates a new attribute on the currently matched tag with the passed value.
38964111 *
0 commit comments