Charset: Rely on new UTF-8 pipeline for mb_substr() fallback.

dmsnell · dmsnell · commit 8ec91a47203a · 2025-10-18T04:34:02.000Z
The existing polyfill for `mb_substr()` contains a number of issues leaving plenty of opportunity for improvement. Specifically, the following are all deficiencies: it relies on Unicode PCRE support, assumes input strings are valid UTF-8, splits input strings into an array of characters (1,000 at a time, iterating until complete), and re-joins them at the end.

This patch provides an updated polyfill which will reliably parse UTF-8 strings even in the presence of invalid bytes. It computes boundaries for the substring extraction with zero allocations and then returns a single `substr()` call at the end.

This change improves the reliability of UTF-8 string handling and removes behavioral variability based on the runtime system.

Developed in #9829
Discussed in https://core.trac.wordpress.org/ticket/63863

See #63863.

git-svn-id: https://develop.svn.wordpress.org/trunk@60969 602fd350-edb4-49c9-b593-d223f7449a82
diff --git a/src/wp-includes/compat-utf8.php b/src/wp-includes/compat-utf8.php
@@ -338,6 +338,48 @@ function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $ma
 	return $count;
 }
 
+/**
+ * Given a starting offset within a string and a maximum number of code points,
+ * return how many bytes are occupied by the span of characters.
+ *
+ * Invalid spans of bytes count as a single code point according to the maximal
+ * subpart rule. This function is a fallback method for calling
+ * `strlen( mb_substr( substr( $text, $byte_offset ), 0, $max_code_points ) )`.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @param string $text              Count bytes of span in this text.
+ * @param int    $byte_offset       Start counting at this byte offset.
+ * @param int    $max_code_points   Stop counting after this many code points have been seen,
+ *                                  or at the end of the string.
+ * @param ?int   $found_code_points Optional. Will be set to number of found code points in
+ *                                  span, as this might be smaller than the maximum count if
+ *                                  the string is not long enough.
+ * @return int Number of bytes spanned by the code points.
+ */
+function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int {
+	$was_at            = $byte_offset;
+	$invalid_length    = 0;
+	$end               = strlen( $text );
+	$found_code_points = 0;
+
+	while ( $byte_offset < $end && $found_code_points < $max_code_points ) {
+		$needed      = $max_code_points - $found_code_points;
+		$chunk_count = _wp_scan_utf8( $text, $byte_offset, $invalid_length, null, $needed );
+
+		$found_code_points += $chunk_count;
+
+		// Invalid spans only convey one code point count regardless of how long they are.
+		if ( 0 !== $invalid_length && $found_code_points < $max_code_points ) {
+			++$found_code_points;
+			$byte_offset += $invalid_length;
+		}
+	}
+
+	return $byte_offset - $was_at;
+}
+
 /**
  * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
  * with the deprecated function from the PHP standard library.
diff --git a/src/wp-includes/compat.php b/src/wp-includes/compat.php
@@ -33,44 +33,42 @@ function _( $message ) {
  *
  * @ignore
  * @since 4.2.2
+ * @since 6.9.0 Deprecated the `$set` argument.
  * @access private
  *
- * @param bool $set - Used for testing only
- *             null   : default - get PCRE/u capability
- *             false  : Used for testing - return false for future calls to this function
- *             'reset': Used for testing - restore default behavior of this function
+ * @param bool $set Deprecated. This argument is no longer used for testing purposes.
  */
 function _wp_can_use_pcre_u( $set = null ) {
-	static $utf8_pcre = 'reset';
+	static $utf8_pcre = null;
 
-	if ( null !== $set ) {
-		$utf8_pcre = $set;
+	if ( isset( $set ) ) {
+		_deprecated_argument( __FUNCTION__, '6.9.0' );
 	}
 
-	if ( 'reset' === $utf8_pcre ) {
-		$utf8_pcre = true;
-
-		set_error_handler(
-			function ( $errno, $errstr ) use ( &$utf8_pcre ) {
-				if ( str_starts_with( $errstr, 'preg_match():' ) ) {
-					$utf8_pcre = false;
-					return true;
-				}
+	if ( isset( $utf8_pcre ) ) {
+		return $utf8_pcre;
+	}
 
-				return false;
-			},
-			E_WARNING
-		);
+	$utf8_pcre = true;
+	set_error_handler(
+		function ( $errno, $errstr ) use ( &$utf8_pcre ) {
+			if ( str_starts_with( $errstr, 'preg_match():' ) ) {
+				$utf8_pcre = false;
+				return true;
+			}
 
-		/*
-		 * Attempt to compile a PCRE pattern with the PCRE_UTF8 flag. For
-		 * systems lacking Unicode support this will trigger a warning
-		 * during compilation, which the error handler will intercept.
-		 */
-		preg_match( '//u', '' );
+			return false;
+		},
+		E_WARNING
+	);
 
-		restore_error_handler();
-	}
+	/*
+	 * Attempt to compile a PCRE pattern with the PCRE_UTF8 flag. For
+	 * systems lacking Unicode support this will trigger a warning
+	 * during compilation, which the error handler will intercept.
+	 */
+	preg_match( '//u', '' );
+	restore_error_handler();
 
 	return $utf8_pcre;
 }
@@ -136,15 +134,15 @@ function mb_substr( $string, $start, $length = null, $encoding = null ) { // php
 /**
  * Internal compat function to mimic mb_substr().
  *
- * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit.
- * For `$encoding === UTF-8`, the `$str` input is expected to be a valid UTF-8 byte
- * sequence. The behavior of this function for invalid inputs is undefined.
+ * Only supports UTF-8 and non-shifting single-byte encodings. For all other encodings
+ * expect the substrings to be misaligned. When the given encoding (or the `blog_charset`
+ * if none is provided) isn’t UTF-8 then the function returns the output of {@see \substr()}.
  *
  * @ignore
  * @since 3.2.0
  *
  * @param string      $str      The string to extract the substring from.
- * @param int         $start    Position to being extraction from in `$str`.
+ * @param int         $start    Character offset at which to start the substring extraction.
  * @param int|null    $length   Optional. Maximum number of characters to extract from `$str`.
  *                              Default null.
  * @param string|null $encoding Optional. Character encoding to use. Default null.
@@ -155,56 +153,39 @@ function _mb_substr( $str, $start, $length = null, $encoding = null ) {
 		return '';
 	}
 
-	if ( null === $encoding ) {
-		$encoding = get_option( 'blog_charset' );
-	}
-
-	/*
-	 * The solution below works only for UTF-8, so in case of a different
-	 * charset just use built-in substr().
-	 */
-	if ( ! _is_utf8_charset( $encoding ) ) {
+	// The solution below works only for UTF-8; treat all other encodings as byte streams.
+	if ( ! _is_utf8_charset( $encoding ?? get_option( 'blog_charset' ) ) ) {
 		return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length );
 	}
 
-	if ( _wp_can_use_pcre_u() ) {
-		// Use the regex unicode support to separate the UTF-8 characters into an array.
-		preg_match_all( '/./us', $str, $match );
-		$chars = is_null( $length ) ? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length );
-		return implode( '', $chars );
-	}
+	$total_length = ( $start < 0 || $length < 0 )
+		? _wp_utf8_codepoint_count( $str )
+		: 0;
 
-	$regex = '/(
-		[\x00-\x7F]                  # single-byte sequences   0xxxxxxx
-		| [\xC2-\xDF][\x80-\xBF]       # double-byte sequences   110xxxxx 10xxxxxx
-		| \xE0[\xA0-\xBF][\x80-\xBF]   # triple-byte sequences   1110xxxx 10xxxxxx * 2
-		| [\xE1-\xEC][\x80-\xBF]{2}
-		| \xED[\x80-\x9F][\x80-\xBF]
-		| [\xEE-\xEF][\x80-\xBF]{2}
-		| \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences   11110xxx 10xxxxxx * 3
-		| [\xF1-\xF3][\x80-\xBF]{3}
-		| \xF4[\x80-\x8F][\x80-\xBF]{2}
-	)/x';
-
-	// Start with 1 element instead of 0 since the first thing we do is pop.
-	$chars = array( '' );
-
-	do {
-		// We had some string left over from the last round, but we counted it in that last round.
-		array_pop( $chars );
-
-		/*
-		 * Split by UTF-8 character, limit to 1000 characters (last array element will contain
-		 * the rest of the string).
-		 */
-		$pieces = preg_split( $regex, $str, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
+	$normalized_start = $start < 0
+		? max( 0, $total_length + $start )
+		: $start;
 
-		$chars = array_merge( $chars, $pieces );
+	/*
+	 * The starting offset is provided as characters, which means this needs to
+	 * find how many bytes that many characters occupies at the start of the string.
+	 */
+	$starting_byte_offset = _wp_utf8_codepoint_span( $str, 0, $normalized_start );
+
+	$normalized_length = $length < 0
+		? max( 0, $total_length - $normalized_start + $length )
+		: $length;
 
-		// If there's anything left over, repeat the loop.
-	} while ( count( $pieces ) > 1 && $str = array_pop( $pieces ) );
+	/*
+	 * This is the main step. It finds how many bytes the given length of code points
+	 * occupies in the input, starting at the byte offset calculated above.
+	 */
+	$byte_length = isset( $normalized_length )
+		? _wp_utf8_codepoint_span( $str, $starting_byte_offset, $normalized_length )
+		: ( strlen( $str ) - $starting_byte_offset );
 
-	return implode( '', array_slice( $chars, $start, $length ) );
+	// The result is a normal byte-level substring using the computed ranges.
+	return substr( $str, $starting_byte_offset, $byte_length );
 }
 
 if ( ! function_exists( 'mb_strlen' ) ) :
diff --git a/tests/phpunit/tests/compat/mbSubstr.php b/tests/phpunit/tests/compat/mbSubstr.php
@@ -13,88 +13,51 @@ class Tests_Compat_mbSubstr extends WP_UnitTestCase {
 	 * Test that mb_substr() is always available (either from PHP or WP).
 	 */
 	public function test_mb_substr_availability() {
-		$this->assertTrue( function_exists( 'mb_substr' ) );
-	}
-
-	/**
-	 * @dataProvider data_utf8_substrings
-	 */
-	public function test_mb_substr( $input_string, $start, $length, $expected_character_substring ) {
-		$this->assertSame( $expected_character_substring, _mb_substr( $input_string, $start, $length, 'UTF-8' ) );
+		$this->assertTrue(
+			in_array( 'mb_substr', get_defined_functions()['internal'], true ),
+			'Test runner should have `mbstring` extension active but doesn’t.'
+		);
 	}
 
 	/**
 	 * @dataProvider data_utf8_substrings
 	 */
-	public function test_mb_substr_via_regex( $input_string, $start, $length, $expected_character_substring ) {
-		_wp_can_use_pcre_u( false );
-		$this->assertSame( $expected_character_substring, _mb_substr( $input_string, $start, $length, 'UTF-8' ) );
-		_wp_can_use_pcre_u( 'reset' );
+	public function test_mb_substr( $input_string, $start, $length ) {
+		$this->assertSame(
+			mb_substr( $input_string, $start, $length, 'UTF-8' ),
+			_mb_substr( $input_string, $start, $length, 'UTF-8' )
+		);
 	}
 
 	/**
 	 * @dataProvider data_utf8_substrings
 	 */
-	public function test_8bit_mb_substr( $input_string, $start, $length, $expected_character_substring, $expected_byte_substring ) {
-		$this->assertSame( $expected_byte_substring, _mb_substr( $input_string, $start, $length, '8bit' ) );
+	public function test_8bit_mb_substr( $input_string, $start, $length ) {
+		$this->assertSame(
+			mb_substr( $input_string, $start, $length, '8bit' ),
+			_mb_substr( $input_string, $start, $length, '8bit' )
+		);
 	}
 
 	/**
 	 * Data provider.
 	 *
-	 * @return array
+	 * @return array[]
 	 */
 	public function data_utf8_substrings() {
 		return array(
-			array(
-				'input_string'                 => 'баба',
-				'start'                        => 0,
-				'length'                       => 3,
-				'expected_character_substring' => 'баб',
-				'expected_byte_substring'      => "б\xD0",
-			),
-			array(
-				'input_string'                 => 'баба',
-				'start'                        => 0,
-				'length'                       => -1,
-				'expected_character_substring' => 'баб',
-				'expected_byte_substring'      => "баб\xD0",
-			),
-			array(
-				'input_string'                 => 'баба',
-				'start'                        => 1,
-				'length'                       => null,
-				'expected_character_substring' => 'аба',
-				'expected_byte_substring'      => "\xB1аба",
-			),
-			array(
-				'input_string'                 => 'баба',
-				'start'                        => -3,
-				'length'                       => null,
-				'expected_character_substring' => 'аба',
-				'expected_byte_substring'      => "\xB1а",
-			),
-			array(
-				'input_string'                 => 'баба',
-				'start'                        => -3,
-				'length'                       => 2,
-				'expected_character_substring' => 'аб',
-				'expected_byte_substring'      => "\xB1\xD0",
-			),
-			array(
-				'input_string'                 => 'баба',
-				'start'                        => -1,
-				'length'                       => 2,
-				'expected_character_substring' => 'а',
-				'expected_byte_substring'      => "\xB0",
-			),
-			array(
-				'input_string'                 => 'I am your баба',
-				'start'                        => 0,
-				'length'                       => 11,
-				'expected_character_substring' => 'I am your б',
-				'expected_byte_substring'      => "I am your \xD0",
-			),
+			'баба [0, 3]'            => array( 'баба', 0, 3 ),
+			'баба [0, -1]'           => array( 'баба', 0, -1 ),
+			'баба [1, null]'         => array( 'баба', 1, null ),
+			'баба [-3, null]'        => array( 'баба', -3, null ),
+			'баба [-3, 2]'           => array( 'баба', -3, 2 ),
+			'баба [-2, 1]'           => array( 'баба', -2, 1 ),
+			'баба [30, 1]'           => array( 'баба', 30, 1 ),
+			'баба [15, -30]'         => array( 'баба', 15, -30 ),
+			'баба [-5, -5]'          => array( 'баба', -5, -5 ),
+			'баба [5, -3]'           => array( 'баба', 5, -3 ),
+			'баба [-3, 5]'           => array( 'баба', -3, 5 ),
+			'I am your баба [0, 11]' => array( 'I am your баба', 0, 11 ),
 		);
 	}
 
@@ -103,7 +66,7 @@ public function data_utf8_substrings() {
 	 */
 	public function test_mb_substr_phpcore_basic() {
 		$string_ascii = 'ABCDEF';
-		$string_mb    = base64_decode( '5pel5pys6Kqe44OG44Kt44K544OI44Gn44GZ44CCMDEyMzTvvJXvvJbvvJfvvJjvvJnjgII=' );
+		$string_mb    = '日本語テキストです。01234５６７８９。';
 
 		$this->assertSame(
 			'DEF',
@@ -118,13 +81,13 @@ public function test_mb_substr_phpcore_basic() {
 
 		// Specific latin-1 as that is the default the core PHP test operates under.
 		$this->assertSame(
-			'peacrOiqng==',
-			base64_encode( _mb_substr( $string_mb, 2, 7, 'latin-1' ) ),
+			"\xA5本語",
+			_mb_substr( $string_mb, 2, 7, 'latin-1' ),
 			'Substring does not match expected for offset 2, length 7, with latin-1 charset'
 		);
 		$this->assertSame(
-			'6Kqe44OG44Kt44K544OI44Gn44GZ',
-			base64_encode( _mb_substr( $string_mb, 2, 7, 'utf-8' ) ),
+			'語テキストです',
+			_mb_substr( $string_mb, 2, 7, 'utf-8' ),
 			'Substring does not match expected for offset 2, length 7, with utf-8 charset'
 		);
 	}