Charset: Rely on new UTF-8 pipeline for mb_strlen() fallback.

dmsnell · dmsnell · commit 8508427bca90 · 2025-10-16T20:58:26.000Z
The existing polyfill for `mb_strlen()` contains a number of issues leaving plenty of opportunity for improvement. Specifically, the following are all deficiencies: it relies on Unicode PCRE support, assumes input strings are valid UTF-8, splits input strings into an array of characters to count them (1,000 at a time, iterating until complete), and entirely gives up when the Unicode support is missing. This patch provides an updated polyfill which will reliably count code points in a UTF-8 string, even in the presence of sequences of invalid bytes. It scans through the input with zero allocations. Additionally, the underlying fallback extends the behavior of `mb_strlen()` to provide character counts for substrings within a larger input without extracting the substring (it can count the characters within a given byte offset and length of a larger string). This change improves the reliability of UTF-8 string length calculations and removes behavioral variability based on the runtime system. Developed in #9828. Discussed in https://core.trac.wordpress.org/ticket/63863. See #63863. git-svn-id: https://develop.svn.wordpress.org/trunk@60949 602fd350-edb4-49c9-b593-d223f7449a82
diff --git a/src/wp-includes/compat-utf8.php b/src/wp-includes/compat-utf8.php
@@ -291,3 +291,49 @@ function _wp_scrub_utf8_fallback( string $bytes ): string {
 
 	return $scrubbed;
 }
+
+/**
+ * Returns how many code points are found in the given UTF-8 string.
+ *
+ * Invalid spans of bytes count as a single code point according
+ * to the maximal subpart rule. This function is a fallback method
+ * for calling `mb_strlen( $text, 'UTF-8' )`.
+ *
+ * Negative byte offsets or lengths always report zero code points,
+ * while a `null` offset or length selects that argument's default.
+ *
+ * Example:
+ *
+ *     4  === _wp_utf8_codepoint_count( 'text' );
+ *     // Groups are 'test', "\x90" as '�', 'wp', "\xE2\x80" as '�', "\xC0" as '�', and 'test'.
+ *     13 === _wp_utf8_codepoint_count( "test\x90wp\xE2\x80\xC0test" );
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @param string $text            Count code points in this string.
+ * @param ?int   $byte_offset     Optional. Start counting after this many bytes in `$text`. Default 0.
+ * @param ?int   $max_byte_length Optional. Stop counting after having scanned past this many bytes.
+ *                                Default is to scan until the end of the string.
+ * @return int How many code points were found.
+ */
+function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
+	if ( $byte_offset < 0 ) {
+		return 0;
+	}
+
+	// `null` selects each default; `$stop_at` can never point past the end of `$text`.
+	$count          = 0;
+	$at             = $byte_offset ?? 0;
+	$end            = strlen( $text );
+	$invalid_length = 0;
+	$stop_at        = $at + min( $end - $at, $max_byte_length ?? PHP_INT_MAX );
+
+	while ( $at < $stop_at ) {
+		$count += _wp_scan_utf8( $text, $at, $invalid_length, $stop_at - $at );
+		$count += $invalid_length > 0 ? 1 : 0;
+		$at    += $invalid_length;
+	}
+
+	return $count;
+}
diff --git a/src/wp-includes/compat.php b/src/wp-includes/compat.php
@@ -228,69 +228,23 @@ function mb_strlen( $string, $encoding = null ) { // phpcs:ignore Universal.Nami
 /**
  * Internal compat function to mimic mb_strlen().
  *
- * Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit.
- * For `$encoding === UTF-8`, the `$str` input is expected to be a valid UTF-8 byte
- * sequence. The behavior of this function for invalid inputs is undefined.
+ * Only supports UTF-8 and non-shifting single-byte encodings; for all other
+ * encodings expect the counts to be wrong. When the given encoding (or the
+ * `blog_charset` option if none is provided) isn’t UTF-8, this returns the
+ * byte length of the provided string instead of counting its code points.
  *
  * @ignore
  * @since 4.2.0
  *
  * @param string      $str      The string to retrieve the character length from.
- * @param string|null $encoding Optional. Character encoding to use. Default null.
- * @return int String length of `$str`.
+ * @param string|null $encoding Optional. Count characters according to this encoding.
+ *                              Default null, which consults the `blog_charset` option.
+ * @return int Count of code points if UTF-8, byte length otherwise.
  */
 function _mb_strlen( $str, $encoding = null ) {
-	if ( null === $encoding ) {
-		$encoding = get_option( 'blog_charset' );
-	}
-
-	/*
-	 * The solution below works only for UTF-8, so in case of a different charset
-	 * just use built-in strlen().
-	 */
-	if ( ! _is_utf8_charset( $encoding ) ) {
-		return strlen( $str );
-	}
-
-	if ( _wp_can_use_pcre_u() ) {
-		// Use the regex unicode support to separate the UTF-8 characters into an array.
-		preg_match_all( '/./us', $str, $match );
-		return count( $match[0] );
-	}
-
-	$regex = '/(?:
-		[\x00-\x7F]                  # single-byte sequences   0xxxxxxx
-		| [\xC2-\xDF][\x80-\xBF]       # double-byte sequences   110xxxxx 10xxxxxx
-		| \xE0[\xA0-\xBF][\x80-\xBF]   # triple-byte sequences   1110xxxx 10xxxxxx * 2
-		| [\xE1-\xEC][\x80-\xBF]{2}
-		| \xED[\x80-\x9F][\x80-\xBF]
-		| [\xEE-\xEF][\x80-\xBF]{2}
-		| \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences   11110xxx 10xxxxxx * 3
-		| [\xF1-\xF3][\x80-\xBF]{3}
-		| \xF4[\x80-\x8F][\x80-\xBF]{2}
-	)/x';
-
-	// Start at 1 instead of 0 since the first thing we do is decrement.
-	$count = 1;
-
-	do {
-		// We had some string left over from the last round, but we counted it in that last round.
-		--$count;
-
-		/*
-		 * Split by UTF-8 character, limit to 1000 characters (last array element will contain
-		 * the rest of the string).
-		 */
-		$pieces = preg_split( $regex, $str, 1000 );
-
-		// Increment.
-		$count += count( $pieces );
-
-		// If there's anything left over, repeat the loop.
-	} while ( $str = array_pop( $pieces ) );
-
-	// Fencepost: preg_split() always returns one extra item in the array.
-	return --$count;
+	return _is_utf8_charset( $encoding ?? get_option( 'blog_charset' ) )
+		? _wp_utf8_codepoint_count( $str )
+		: strlen( $str );
 }
 
 // sodium_crypto_box() was introduced in PHP 7.2.
diff --git a/tests/phpunit/tests/compat/mbStrlen.php b/tests/phpunit/tests/compat/mbStrlen.php
@@ -10,82 +10,60 @@
 class Tests_Compat_mbStrlen extends WP_UnitTestCase {
 
 	/**
-	 * Test that mb_strlen() is always available (either from PHP or WP).
+	 * Test that the native mb_strlen() is available, as the remaining tests compare against it.
 	 */
 	public function test_mb_strlen_availability() {
-		$this->assertTrue( function_exists( 'mb_strlen' ) );
+		$this->assertTrue(
+			in_array( 'mb_strlen', get_defined_functions()['internal'], true ),
+			'Test runner should have `mbstring` extension active but doesn’t.'
+		);
 	}
 
 	/**
-	 * @dataProvider data_utf8_string_lengths
+	 * @dataProvider data_utf8_strings
 	 */
-	public function test_mb_strlen( $input_string, $expected_character_length ) {
-		$this->assertSame( $expected_character_length, _mb_strlen( $input_string, 'UTF-8' ) );
+	public function test_mb_strlen( $input_string ) {
+		$this->assertSame(
+			mb_strlen( $input_string, 'UTF-8' ),
+			_mb_strlen( $input_string, 'UTF-8' )
+		);
 	}
 
 	/**
-	 * @dataProvider data_utf8_string_lengths
+	 * @dataProvider data_utf8_strings
 	 */
-	public function test_mb_strlen_via_regex( $input_string, $expected_character_length ) {
-		_wp_can_use_pcre_u( false );
-		$this->assertSame( $expected_character_length, _mb_strlen( $input_string, 'UTF-8' ) );
-		_wp_can_use_pcre_u( 'reset' );
+	public function test_mb_strlen_via_regex( $input_string ) {
+		$this->assertSame(
+			mb_strlen( $input_string, 'UTF-8' ),
+			_mb_strlen( $input_string, 'UTF-8' )
+		);
 	}
 
 	/**
-	 * @dataProvider data_utf8_string_lengths
+	 * @dataProvider data_utf8_strings
 	 */
-	public function test_8bit_mb_strlen( $input_string, $expected_character_length, $expected_byte_length ) {
-		$this->assertSame( $expected_byte_length, _mb_strlen( $input_string, '8bit' ) );
+	public function test_8bit_mb_strlen( $input_string ) {
+		$this->assertSame(
+			mb_strlen( $input_string, '8bit' ),
+			_mb_strlen( $input_string, '8bit' )
+		);
 	}
 
 	/**
 	 * Data provider.
 	 *
 	 * @return array
 	 */
-	public function data_utf8_string_lengths() {
+	public function data_utf8_strings() {
 		return array(
-			array(
-				'input_string'              => 'баба',
-				'expected_character_length' => 4,
-				'expected_byte_length'      => 8,
-			),
-			array(
-				'input_string'              => 'баб',
-				'expected_character_length' => 3,
-				'expected_byte_length'      => 6,
-			),
-			array(
-				'input_string'              => 'I am your б',
-				'expected_character_length' => 11,
-				'expected_byte_length'      => 12,
-			),
-			array(
-				'input_string'              => '1111111111',
-				'expected_character_length' => 10,
-				'expected_byte_length'      => 10,
-			),
-			array(
-				'input_string'              => '²²²²²²²²²²',
-				'expected_character_length' => 10,
-				'expected_byte_length'      => 20,
-			),
-			array(
-				'input_string'              => '３３３３３３３３３３',
-				'expected_character_length' => 10,
-				'expected_byte_length'      => 30,
-			),
-			array(
-				'input_string'              => '𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜',
-				'expected_character_length' => 10,
-				'expected_byte_length'      => 40,
-			),
-			array(
-				'input_string'              => '1²３𝟜1²３𝟜1²３𝟜',
-				'expected_character_length' => 12,
-				'expected_byte_length'      => 30,
-			),
+			array( 'баба' ),
+			array( 'баб' ),
+			array( 'I am your б' ),
+			array( '1111111111' ),
+			array( '²²²²²²²²²²' ),
+			array( '３３３３３３３３３３' ),
+			array( '𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜' ),
+			array( '1²３𝟜1²３𝟜1²３𝟜' ),
 		);
 	}
 }
diff --git a/tests/phpunit/tests/compat/wpCodePointCount.php b/tests/phpunit/tests/compat/wpCodePointCount.php
@@ -0,0 +1,107 @@
+<?php
+/**
+ * Unit tests covering fallback UTF-8 code-point counting.
+ *
+ * @package    WordPress
+ * @subpackage Charset
+ *
+ * @since      6.9.0
+ *
+ * @group      compat
+ *
+ * @covers ::_wp_utf8_codepoint_count
+ */
+class Tests_Compat_wpUtf8CodePointCount extends WP_UnitTestCase {
+	/**
+	 * Ensures that there are zero code points reported when starting before the text.
+	 *
+	 * @ticket 63863
+	 */
+	public function test_rejects_negative_byte_offsets() {
+		$this->assertSame(
+			0,
+			_wp_utf8_codepoint_count( 'any old text', -5, 3 ),
+			'Should have indicated that there are zero code points before the start of the text.'
+		);
+
+		$this->assertSame(
+			0,
+			_wp_utf8_codepoint_count( 'any old text', -5, 5 + 12 ),
+			'Should have found no code points before the start of the text, even if the length overlaps the text.'
+		);
+	}
+
+	/**
+	 * Ensures that there are zero code points reported when scanning a negative length.
+	 *
+	 * @ticket 63863
+	 */
+	public function test_rejects_negative_byte_lengths() {
+		$this->assertSame(
+			0,
+			_wp_utf8_codepoint_count( 'any old text', 2, -5 ),
+			'Should have indicated that there are zero code points in a span of negative length.'
+		);
+	}
+
+	/**
+	 * Ensures that code points are counted properly across different byte offsets
+	 * and lengths, equivalent to counting code points for an equivalent substring.
+	 *
+	 * @ticket 63863
+	 *
+	 * @dataProvider data_strings_and_substring_offsets
+	 *
+	 * @param string $text        Full text handed to the function under test.
+	 * @param int    $byte_offset Byte index at which counting starts.
+	 * @param int    $byte_length Maximum number of bytes to scan from the offset.
+	 * @return void
+	 */
+	public function test_counts_within_appropriate_offsets( string $text, int $byte_offset, int $byte_length ) {
+		$substring = substr( $text, $byte_offset, $byte_length );
+
+		if (
+			! mb_check_encoding( $substring, 'UTF-8' ) &&
+			// Miscounting bug fixed by removal of “fast path” php/php-src@cca4ca6d3dda8c2e1c5c1b053550f94b3d6fb6bf
+			version_compare( PHP_VERSION, '8.3.0', '<' )
+		) {
+			$this->markTestSkipped( 'Prior to PHP 8.3.0, mb_strlen() misreported lengths of invalid inputs.' );
+		}
+
+		$this->assertSame(
+			mb_strlen( $substring, 'UTF-8' ),
+			_wp_utf8_codepoint_count( $text, $byte_offset, $byte_length ),
+			"Miscounted code points from {$byte_length} bytes starting at {$byte_offset} in '{$text}'"
+		);
+	}
+
+	/**
+	 * Data provider: ASCII spans, a four-byte character cut at every length, and invalid bytes.
+	 *
+	 * @return array[] Each dataset is `array( $text, $byte_offset, $byte_length )`.
+	 */
+	public static function data_strings_and_substring_offsets() {
+		return array(
+			array( 'zero length', 0, 0 ),
+			array( 'zero length (in middle)', 5, 0 ),
+			array( 'full text', 0, 9 ),
+			array( 'prefix', 0, 2 ),
+			array( 'middle span', 2, 4 ),
+			array( 'suffix', 3, 3 ),
+			array( 'overlong', 4, 8 ),
+
+			array( "emoji \u{1F170} partial", 6, 1 ),
+			array( "emoji \u{1F170} partial", 6, 2 ),
+			array( "emoji \u{1F170} partial", 6, 3 ),
+			array( "emoji \u{1F170} full", 6, 4 ),
+
+			array( "invalid \xF0\x9F before", 8, 5 ),
+			array( "invalid \xF0\x9F before", 9, 5 ),
+			array( "invalid \x95 whole", 8, 1 ),
+			array( "invalid \x95 beyond", 8, 5 ),
+			array( "invalid \x85\xB0 after", 8, 4 ),
+			array( "invalid \x85\xB0 after", 9, 3 ),
+			array( "invalid \x85\xB0\xC0\xF0\x9F subparts", 8, 7 ),
+		);
+	}
+}