From 6ad9951fd97873227a3ed23fa28910af72f73fc4 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 10 Jul 2025 12:28:15 +0200 Subject: [PATCH 01/11] Move script data length checks to top of loop --- .../html-api/class-wp-html-tag-processor.php | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 83c1784418248..e1a5b68104409 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1495,6 +1495,15 @@ private function skip_script_data(): bool { while ( false !== $at && $at < $doc_length ) { $at += strcspn( $html, '-<', $at ); + /* + * Ultimately a SCRIPT closer (``) must be found or this function will + * return false. + * `= $doc_length ) { + return false; + } /* * For all script states a "-->" transitions @@ -1502,7 +1511,6 @@ private function skip_script_data(): bool { * even if that's the current state. */ if ( - $at + 2 < $doc_length && '-' === $html[ $at ] && '-' === $html[ $at + 1 ] && '>' === $html[ $at + 2 ] @@ -1512,10 +1520,6 @@ private function skip_script_data(): bool { continue; } - if ( $at + 1 >= $doc_length ) { - return false; - } - /* * Everything of interest past here starts with "<". * Check this character and advance position regardless. @@ -1537,7 +1541,6 @@ private function skip_script_data(): bool { * parsing after updating the state. */ if ( - $at + 2 < $doc_length && '!' === $html[ $at ] && '-' === $html[ $at + 1 ] && '-' === $html[ $at + 2 ] @@ -1561,7 +1564,6 @@ private function skip_script_data(): bool { * proceed scanning to the next potential token in the text. */ if ( ! ( - $at + 6 < $doc_length && ( 's' === $html[ $at ] || 'S' === $html[ $at ] ) && ( 'c' === $html[ $at + 1 ] || 'C' === $html[ $at + 1 ] ) && ( 'r' === $html[ $at + 2 ] || 'R' === $html[ $at + 2 ] ) && From b3b3177e27ce1dfbc8c73c629392b7af6b6e3f1b Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 10 Jul 2025 12:28:54 +0200 Subject: [PATCH 02/11] Remove parser_state change in skip_script_data The parser state is managed externally and should not be changed in skip_script_data --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index e1a5b68104409..b24084688f30a 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1613,8 +1613,6 @@ private function skip_script_data(): bool { } if ( $this->bytes_already_parsed >= $doc_length ) { - $this->parser_state = self::STATE_INCOMPLETE_INPUT; - return false; } From ca16e0e916377c390513c4266a716ba45b48a8ec Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 10 Jul 2025 13:34:46 +0200 Subject: [PATCH 03/11] Remove more length checks --- src/wp-includes/html-api/class-wp-html-tag-processor.php | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index b24084688f30a..ef9932f7d4e17 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1581,9 +1581,6 @@ private function skip_script_data(): bool { * "= $doc_length ) { - continue; - } $at += 6; $c = $html[ $at ]; if ( ' ' !== $c && "\t" !== $c && "\r" !== $c && "\n" !== $c && '/' !== $c && '>' !== $c ) { From 0456be7cfbdab44c5747045b9aba9552af13967d Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 10 Jul 2025 13:49:30 +0200 Subject: [PATCH 04/11] Improve documentation --- .../html-api/class-wp-html-tag-processor.php | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index ef9932f7d4e17..a266b4da42745 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1495,11 +1495,17 @@ private function skip_script_data(): bool { while ( false !== $at && $at < $doc_length ) { $at += strcspn( $html, '-<', $at ); + /* * Ultimately a SCRIPT closer (``) must be found or this function will - * return false. - * ` + * ╰──┬───╯ + * $at + 8 additional characters is the minimum length required to skip script data. */ if ( $at + 8 >= $doc_length ) { return false; From ea6f7d3c7df39df05c7e86ee5596f74a60fd7917 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 31 Jul 2025 09:23:39 +0200 Subject: [PATCH 05/11] Improve comment explaining early return logic --- .../html-api/class-wp-html-tag-processor.php | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index a266b4da42745..2abb80a4747e6 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1497,15 +1497,15 @@ private function skip_script_data(): bool { $at += strcspn( $html, '-<', $at ); /* - * Ultimately a SCRIPT closer (``) must be found or this function will - * return false. This removes the need for additional length checks and allows - * for an early return if it's impossible to find a closer. + * A SCRIPT close tag `` must be found or this function will + * return false. If a close tag would not fit in the remaining string, + * no further work is necessary. * - * $at is potentially here - * ↓ - * - * ╰──┬───╯ - * $at + 8 additional characters is the minimum length required to skip script data. + * $at is potentially here + * ↓ + * + * ╰──┬───╯ + * $at + 8 additional characters is the minimum length required to skip script data. */ if ( $at + 8 >= $doc_length ) { return false; From 4be62b9ea492cddf8dbafef2cb8a51d75b8ad435 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 6 Aug 2025 15:23:17 +0200 Subject: [PATCH 06/11] Improve loop comment --- .../html-api/class-wp-html-tag-processor.php | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/wp-includes/html-api/class-wp-html-tag-processor.php b/src/wp-includes/html-api/class-wp-html-tag-processor.php index 2abb80a4747e6..1e82d2fff0ab3 100644 --- a/src/wp-includes/html-api/class-wp-html-tag-processor.php +++ b/src/wp-includes/html-api/class-wp-html-tag-processor.php @@ -1497,15 +1497,27 @@ private function skip_script_data(): bool { $at += strcspn( $html, '-<', $at ); /* - * A SCRIPT close tag `` must be found or this function will - * return false. If a close tag would not fit in the remaining string, - * no further work is necessary. + * *IMPORTANT:* Any changes to this loop *must* ensure the conditions described in this + * comment remain valid. * - * $at is potentially here + * The rest of this loop matches different byte sequences. If a script close tag is not + * found, the function will return false. The script close tag is the longest byte + * sequenced to match. Therefore, a single length check for at least 8 additional + * bytes allows for an early `false` return OR subsequent matches without length checks. + * + * $at may be here. * ↓ * * ╰──┬───╯ - * $at + 8 additional characters is the minimum length required to skip script data. + * $at + 8 additional bytes are required for a non-false return value. + * + * The length of shorter matches is already satisfied: + * + * $at may be here. + * ↓ + * --> + * ├╯ + * $at + 2 additional characters does not require an additional length check. */ if ( $at + 8 >= $doc_length ) { return false; From df2affa7bae25fb27f637842d1dcf80a10d426a5 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Wed, 6 Aug 2025 17:29:34 +0200 Subject: [PATCH 07/11] Add script tag processing tests --- .../tests/html-api/wpHtmlTagProcessor.php | 45 +++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php index cd8faee4ed6a4..bb360cb212d44 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php @@ -3046,4 +3046,49 @@ public static function data_alphabet_by_characters_uppercase() { yield strtoupper( $data[0] ) => array( strtoupper( $data[0] ) ); } } + + /** + * Test that script tags are parsed correctly. + * + * Script tag parsing is very complicated, see the following resources for more details: + * + * - https://html.spec.whatwg.org/multipage/parsing.html#script-data-state + * - https://html.spec.whatwg.org/multipage/scripting.html#restrictions-for-contents-of-script-elements + * + * @ticket 63738 + * + * @dataProvider data_script_tag + */ + public function test_script_tag_parsing( string $input, bool $closes ) { + $processor = new WP_HTML_Tag_Processor( $input ); + + if ( $closes ) { + $this->assertTrue( $processor->next_token(), 'Expected to find complete script tag.' ); + $this->assertSame( 'SCRIPT', $processor->get_tag() ); + return; + } + + $this->assertFalse( $processor->next_token(), 'Expected to fail next_token().' ); + $this->assertTrue( $processor->paused_at_incomplete_token(), 'Expected an incomplete SCRIPT tag token.' ); + } + + /** + * Data provider. + */ + public static function data_script_tag(): array { + return array( + 'Basic script tag' => array( '', true ), + 'Script with type attribute' => array( '', true ), + 'Script data escaped' => array( '', true ), + 'Script data double-escaped exit (comment)' => array( '', true ), + 'Script data double-escaped exit (closed)' => array( '', true ), + 'Script data double-escaped exit (closed/truncated)' => array( '', true ), + 'Script data no double-escape' => array( '', true ), + 'Script data no double-escape (short comment)' => array( '', true ), + 'Script data almost double-escaped' => array( '', true ), + + 'Script tag with self-close flag (ignored)' => array( '', true ), - 'Script data double-escaped exit (closed)' => array( '', true ), + 'Basic script tag' => array( '', true ), + 'Script with type attribute' => array( '', true ), + 'Script data escaped' => array( '', true ), + 'Script data double-escaped exit (comment)' => array( '', true ), + 'Script data double-escaped exit (closed)' => array( '', true ), 'Script data double-escaped exit (closed/truncated)' => array( '', true ), - 'Script data no double-escape' => array( '', true ), - 'Script data no double-escape (short comment)' => array( '', true ), - 'Script data almost double-escaped' => array( '', true ), + 'Script data no double-escape' => array( '', true ), - 'Script tag with self-close flag (ignored)' => array( ' - * ╰──┬───╯ + * ↓ + * ... + * ╰──┬───╯ * $at + 8 additional bytes are required for a non-false return value. * - * The length of shorter matches is already satisfied: + * This single check eliminates the need to check lengths for the shorter spans. + * For example, when leaving the script escaped state back into script data state. * - * $at may be here. - * ↓ - * --> - * ├╯ - * $at + 2 additional characters does not require an additional length check. + * $at may be here. + * ↓ + * + * ├╯ + * $at + 2 additional characters does not + * require an additional length check. */ if ( $at + 8 >= $doc_length ) { return false; From f7d4b8e1c41f09dca74a1349602d25f0e3eaf723 Mon Sep 17 00:00:00 2001 From: Jon Surrell Date: Thu, 7 Aug 2025 08:36:38 +0200 Subject: [PATCH 10/11] Move script parsing adjacent to other script parsing test --- .../tests/html-api/wpHtmlTagProcessor.php | 86 +++++++++---------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php index 6e7d4561bcaa0..511586f99327c 100644 --- a/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php +++ b/tests/phpunit/tests/html-api/wpHtmlTagProcessor.php @@ -1981,6 +1981,49 @@ public static function data_next_tag_ignores_script_tag_contents() { ); } + /** + * Test that script tags are parsed correctly. + * + * Script tag parsing is very complicated, see the following resources for more details: + * + * - https://html.spec.whatwg.org/multipage/parsing.html#script-data-state + * - https://html.spec.whatwg.org/multipage/scripting.html#restrictions-for-contents-of-script-elements + * + * @ticket 63738 + * + * @dataProvider data_script_tag + */ + public function test_script_tag_parsing( string $input, bool $closes ) { + $processor = new WP_HTML_Tag_Processor( $input ); + + if ( $closes ) { + $this->assertTrue( $processor->next_token(), 'Expected to find complete script tag.' ); + $this->assertSame( 'SCRIPT', $processor->get_tag() ); + return; + } + + $this->assertFalse( $processor->next_token(), 'Expected to fail next_token().' ); + $this->assertTrue( $processor->paused_at_incomplete_token(), 'Expected an incomplete SCRIPT tag token.' ); + } + + /** + * Data provider. + */ + public static function data_script_tag(): array { + return array( + 'Basic script tag' => array( '', true ), + 'Script with type attribute' => array( '', true ), + 'Script data escaped' => array( '', true ), + 'Script data double-escaped exit (comment)' => array( '', true ), + 'Script data double-escaped exit (closed)' => array( '', true ), + 'Script data double-escaped exit (closed/truncated)' => array( '', true ), + 'Script data no double-escape' => array( '', true ), + + 'Script tag with self-close flag (ignored)' => array( '', true ), - 'Script data double-escaped exit (closed)' => array( '', true ), - 'Script data double-escaped exit (closed/truncated)' => array( '', true ), - 'Script data no double-escape' => array( '', true ), - - 'Script tag with self-close flag (ignored)' => array( ' + * ├╯ + * $at + 2 additional characters does not require a length check. + * + * The transition from "escaped" to "unescaped" is not relevant if the document ends: + * + * $at may be here. + * ↓ + *