Charset: Introduce UTF-8 scanning pipeline.

dmsnell · dmsnell · commit c9166919cce1 · 2025-09-16T12:35:01.000Z
This is the third in a series of patches to modernize and standardize UTF-8 handling. When the fallback UTF-8 validation code was added it was placed inside formatting.php; however, that validation logic can be reused for a number of related UTF-8 functions. To facilitate this it was moved into a new location and loaded early. This patch is a follow-up to that first half, whereby the UTF-8 scanning logic forms its own new `_wp_scan_utf8()` function. This new UTF-8 scanner is a low-level function which forms a shared spec-compliant processing core to power multiple fallback functions and some new functionality as well. Developed in #9830. Discussed in https://core.trac.wordpress.org/ticket/63863 Follow-up to: [60743]. See #63863. git-svn-id: https://develop.svn.wordpress.org/trunk@60768 602fd350-edb4-49c9-b593-d223f7449a82
diff --git a/phpcs.xml.dist b/phpcs.xml.dist
@@ -255,14 +255,19 @@
 		<exclude-pattern>/wp-tests-config-sample\.php</exclude-pattern>
 	</rule>
 
-	<!-- Exclude forbidding goto in the HTML Processor, which mimics algorithms that are written
-	     this way in the HTML specification, and these particular algorithms are complex and
-	     highly imperative. Avoiding the goto introduces a number of risks that could make it
-	     more difficult to maintain the relationship to the standard, lead to subtle differences
-	     in the parsing, and distance the code from its standard. -->
 	<rule ref="Generic.PHP.DiscourageGoto.Found">
+		<!-- Exclude forbidding goto in the HTML Processor, which mimics algorithms that are written
+			 this way in the HTML specification, and these particular algorithms are complex and
+			 highly imperative. Avoiding the goto introduces a number of risks that could make it
+			 more difficult to maintain the relationship to the standard, lead to subtle differences
+			 in the parsing, and distance the code from its standard. -->
 		<exclude-pattern>/wp-includes/html-api/class-wp-html-processor\.php</exclude-pattern>
 		<exclude-pattern>/wp-includes/html-api/class-wp-html-doctype-info\.php</exclude-pattern>
+
+		<!-- Goto is an effective way to handle errors in decoders which expect valid bytes
+		     without impacting the fast path while avoiding bloating the code with redundant
+		     and risky handling code. Exclude forbidding goto in UTF-8 fallback code. -->
+		<exclude-pattern>/wp-includes/compat-utf8\.php</exclude-pattern>
 	</rule>
 
 	<!-- Exclude sample config from modernization to prevent breaking CI workflows based on WP-CLI scaffold.
diff --git a/src/wp-includes/compat-utf8.php b/src/wp-includes/compat-utf8.php
@@ -1,52 +1,103 @@
 <?php
 
 /**
- * Fallback mechanism for safely validating UTF-8 bytes.
+ * Finds spans of valid and invalid UTF-8 bytes in a given string.
+ *
+ * This is a low-level tool to power various UTF-8 functionality.
+ * It scans through a string until it finds invalid byte spans.
+ * When it does this, it does three things:
+ *
+ *  - Assigns `$at` to the position after the last successful code point.
+ *  - Assigns `$invalid_length` to the length of the maximal subpart of
+ *    the invalid bytes starting at `$at`.
+ *  - Returns how many code points were successfully scanned.
+ *
+ * This information is enough to build a number of useful UTF-8 functions.
+ *
+ * Example:
+ *
+ *     // ñ is U+F1, which in `ISO-8859-1`/`latin1`/`Windows-1252`/`cp1252` is 0xF1.
+ *     "Pi\xF1a" === $pineapple = mb_convert_encoding( "Piña", 'Windows-1252', 'UTF-8' );
+ *     $at = $invalid_length = 0;
+ *
+ *     // The first step finds the invalid 0xF1 byte.
+ *     2 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
+ *     $at === 2; $invalid_length === 1;
  *
- * By implementing a raw method here the code will behave in the same way on
- * all installed systems, regardless of what extensions are installed.
+ *     // After `$at += $invalid_length` skips that byte, the second step reaches the end of the string.
+ *     1 === _wp_scan_utf8( $pineapple, $at, $invalid_length );
+ *     $at === 4; $invalid_length === 0;
  *
- * @see wp_is_valid_utf8
+ * Note! This function’s many arguments are passed without an “options”
+ * array. This choice is based on the fact that this is a low-level function
+ * and there’s no need to create an array of items on every invocation.
  *
  * @since 6.9.0
  * @access private
  *
- * @param string $bytes String which might contain text encoded as UTF-8.
- * @return bool Whether the provided bytes can decode as valid UTF-8.
+ * @param string   $bytes           UTF-8 encoded string which might include invalid spans of bytes.
+ * @param int      $at              Where to start scanning.
+ * @param int      $invalid_length  Will be set to how many bytes are to be ignored after `$at`.
+ * @param int|null $max_bytes       Stop scanning after this many bytes have been seen.
+ * @param int|null $max_code_points Stop scanning after this many code points have been seen.
+ * @return int How many code points were successfully scanned.
  */
-function _wp_is_valid_utf8_fallback( string $bytes ): bool {
-	$end = strlen( $bytes );
-
-	for ( $i = 0; $i < $end; $i++ ) {
+function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null ): int {
+	$byte_length       = strlen( $bytes );
+	$end               = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
+	$invalid_length    = 0;
+	$count             = 0;
+	$max_count         = $max_code_points ?? PHP_INT_MAX;
+
+	for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
 		/*
 		 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
 		 *
 		 * This optimization step improves the speed from 10x to 100x
 		 * depending on whether the JIT has optimized the function.
 		 */
-		$i += strspn(
+		$ascii_byte_count = strspn(
 			$bytes,
 			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
 			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
 			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
-			$i
+			$i,
+			$end - $i
 		);
+
+		if ( $count + $ascii_byte_count >= $max_count ) {
+			$at    = $i + ( $max_count - $count );
+			$count = $max_count;
+			return $count;
+		}
+
+		$count += $ascii_byte_count;
+		$i     += $ascii_byte_count;
+
 		if ( $i >= $end ) {
-			break;
+			$at = $end;
+			return $count;
 		}
 
 		/**
 		 * The above fast-track handled all single-byte UTF-8 characters. What
 		 * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.
 		 *
 		 * Therefore everything past here is checking those multibyte sequences.
+		 *
+		 * It may look like there’s a need to check against the max bytes here,
+		 * but since each match of a single character returns, this function will
+		 * bail already if crossing the max-bytes threshold. This function SHALL
+		 * NOT return in the middle of a multi-byte character, so if a character
+		 * falls on each side of the max bytes, the entire character will be scanned.
+		 *
 		 * Because it’s possible that there are truncated characters, the use of
 		 * the null-coalescing operator with "\xC0" is a convenience for skipping
 		 * length checks on every continuation bytes. This works because 0xC0 is
 		 * always invalid in a UTF-8 string, meaning that if the string has been
 		 * truncated, it will find 0xC0 and reject as invalid UTF-8.
 		 *
-		 *  > [The following table] lists all of the byte sequences that are well-formed
+		 * > [The following table] lists all of the byte sequences that are well-formed
 		 * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
 		 * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
 		 * > outside of the ranges listed is ill-formed.
@@ -66,29 +117,24 @@ function _wp_is_valid_utf8_fallback( string $bytes ): bool {
 		 *  │ U+100000..U+10FFFF  │ F4         │ 80..8F       │ 80..BF      │ 80..BF       │
 		 *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
 		 *
-		 * Notice that all valid third and forth bytes are in the range 80..BF. This
-		 * validator takes advantage of that to only check the range of those bytes once.
-		 *
-		 * @see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
 		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
 		 */
 
+		// Valid two-byte code points.
 		$b1 = ord( $bytes[ $i ] );
 		$b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
 
-		// Valid two-byte code points.
-
 		if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
+			++$count;
 			++$i;
 			continue;
 		}
 
-		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
-
 		// Valid three-byte code points.
+		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
 
 		if ( $b3 < 0x80 || $b3 > 0xBF ) {
-			return false;
+			goto invalid_utf8;
 		}
 
 		if (
@@ -97,31 +143,108 @@ function _wp_is_valid_utf8_fallback( string $bytes ): bool {
 			( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
 			( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
 		) {
+			++$count;
 			$i += 2;
 			continue;
 		}
 
-		$b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );
-
 		// Valid four-byte code points.
+		$b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );
 
 		if ( $b4 < 0x80 || $b4 > 0xBF ) {
-			return false;
+			goto invalid_utf8;
 		}
 
 		if (
 			( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
 			( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
 			( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
 		) {
+			++$count;
 			$i += 3;
 			continue;
 		}
 
-		// Any other sequence is invalid.
-		return false;
+		/**
+		 * When encountering invalid byte sequences, Unicode suggests finding the
+		 * maximal subpart of a text and replacing that subpart with a single
+		 * replacement character.
+		 *
+		 * > This practice is more secure because it does not result in the
+		 * > conversion consuming parts of valid sequences as though they were
+		 * > invalid. It also guarantees at least one replacement character will
+		 * > occur for each instance of an invalid sequence in the original text.
+		 * > Furthermore, this practice can be defined consistently for better
+		 * > interoperability between different implementations of conversion.
+		 *
+		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-5/#G40630
+		 */
+		invalid_utf8:
+		$at             = $i;
+		$invalid_length = 1;
+
+		// Single-byte and two-byte characters.
+		if ( ( 0x00 === ( $b1 & 0x80 ) ) || ( 0xC0 === ( $b1 & 0xE0 ) ) ) {
+			return $count;
+		}
+
+		$b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
+		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
+
+		// Find the maximal subpart and skip past it.
+		if ( 0xE0 === ( $b1 & 0xF0 ) ) {
+			// Three-byte characters.
+			$b2_valid = (
+				( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
+				( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+				( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
+				( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
+			);
+
+			$invalid_length = min( $end - $i, $b2_valid ? 2 : 1 );
+			return $count;
+		} elseif ( 0xF0 === ( $b1 & 0xF8 ) ) {
+			// Four-byte characters.
+			$b2_valid = (
+				( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
+				( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+				( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
+			);
+
+			$b3_valid = $b3 >= 0x80 && $b3 <= 0xBF;
+
+			$invalid_length = min( $end - $i, $b2_valid ? ( $b3_valid ? 3 : 2 ) : 1 );
+			return $count;
+		}
+
+		return $count;
+	}
+
+	$at = $i;
+	return $count;
+}
+
+/**
+ * Fallback mechanism for safely validating UTF-8 bytes.
+ *
+ * @see wp_is_valid_utf8()
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @param string $bytes String which might contain text encoded as UTF-8.
+ * @return bool Whether the provided bytes can decode as valid UTF-8.
+ */
+function _wp_is_valid_utf8_fallback( string $bytes ): bool {
+	$bytes_length = strlen( $bytes );
+	if ( 0 === $bytes_length ) {
+		return true;
 	}
 
-	// Reaching the end implies validating every byte.
-	return true;
+	$next_byte_at   = 0;
+	$invalid_length = 0;
+
+	_wp_scan_utf8( $bytes, $next_byte_at, $invalid_length );
+
+	return $bytes_length === $next_byte_at && 0 === $invalid_length;
 }