Charset: Conditionally polyfill utf8_encode() and utf8_decode().

dmsnell · dmsnell · commit cd61711b864f · 2025-10-16T23:17:14.000Z
The `utf8_encode()` and `utf8_decode()` functions were deprecated in PHP 8.2.0 and will be removed in PHP 9.0. When that happens, any existing code which calls them will trigger a crash. This patch introduces polyfills for those functions when they aren’t already present. The polyfill functions maintain backwards compatibility, including a deprecation notice. Any code calling either of these functions ought to be refactored to avoid using them; there are better options which don’t carry the issues these functions do, and any code calling them is likely calling them inappropriately. Developed in #10011 Discussed in https://core.trac.wordpress.org/ticket/55603 Discussed in https://core.trac.wordpress.org/ticket/63863 See #63863. git-svn-id: https://develop.svn.wordpress.org/trunk@60950 602fd350-edb4-49c9-b593-d223f7449a82
diff --git a/src/wp-includes/compat-utf8.php b/src/wp-includes/compat-utf8.php
@@ -337,3 +337,142 @@ function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $ma
 
 	return $count;
 }
+
+/**
+ * Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
+ * with the deprecated function from the PHP standard library. Each byte 0x80–0xFF becomes two UTF-8 bytes.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @see \utf8_encode()
+ *
+ * @param string $iso_8859_1_text Text treated as ISO-8859-1 (latin1) bytes; cast to string before use.
+ * @return string Text converted into UTF-8; the input itself is returned when no byte needed converting.
+ */
+function _wp_utf8_encode_fallback( $iso_8859_1_text ) {
+	$iso_8859_1_text = (string) $iso_8859_1_text;
+	$at              = 0;
+	$was_at          = 0;
+	$end             = strlen( $iso_8859_1_text );
+	$utf8            = '';
+
+	while ( $at < $end ) {
+		// US-ASCII bytes (0x00–0x7F) are identical in ISO-8859-1 and UTF-8: skip the whole run, it is copied in bulk later.
+		$ascii_byte_count = strspn(
+			$iso_8859_1_text,
+			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
+			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
+			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
+			$at
+		);
+
+		if ( $ascii_byte_count > 0 ) {
+			$at += $ascii_byte_count;
+			continue;
+		}
+
+		// Every remaining byte (0x80–0xFF) becomes a two-byte sequence: lead 0xC0 | top two bits, then 0x80 | low six bits.
+		$code_point = ord( $iso_8859_1_text[ $at ] );
+		$byte1      = chr( 0xC0 | ( $code_point >> 6 ) );
+		$byte2      = chr( 0x80 | ( $code_point & 0x3F ) );
+
+		$utf8 .= substr( $iso_8859_1_text, $was_at, $at - $was_at );
+		$utf8 .= "{$byte1}{$byte2}";
+
+		++$at;
+		$was_at = $at;
+	}
+
+	if ( 0 === $was_at ) {
+		return $iso_8859_1_text;
+	}
+
+	$utf8 .= substr( $iso_8859_1_text, $was_at );
+	return $utf8;
+}
+
+/**
+ * Converts a string from UTF-8 to ISO-8859-1, maintaining backwards compatibility
+ * with the deprecated function from the PHP standard library. Input that cannot be converted becomes '?'.
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @see \utf8_decode()
+ *
+ * @param string $utf8_text Text treated as UTF-8 bytes; cast to string before use.
+ * @return string Text converted into ISO-8859-1; the input itself is returned when nothing was converted or replaced.
+ */
+function _wp_utf8_decode_fallback( $utf8_text ) {
+	$utf8_text       = (string) $utf8_text;
+	$at              = 0;
+	$was_at          = 0;
+	$end             = strlen( $utf8_text );
+	$iso_8859_1_text = '';
+
+	while ( $at < $end ) {
+		// US-ASCII bytes (0x00–0x7F) are identical in ISO-8859-1 and UTF-8: skip the whole run, it is copied in bulk later.
+		$ascii_byte_count = strspn(
+			$utf8_text,
+			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
+			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
+			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
+			$at
+		);
+
+		if ( $ascii_byte_count > 0 ) {
+			$at += $ascii_byte_count;
+			continue;
+		}
+
+		$next_at        = $at;
+		$invalid_length = 0;
+		$found          = _wp_scan_utf8( $utf8_text, $next_at, $invalid_length, null, 1 ); // NOTE(review): presumably scans at most one code point and reports any invalid span that follows — confirm against _wp_scan_utf8().
+		$span_length    = $next_at - $at;
+		$next_byte      = '?'; // Replacement emitted unless the code point proves convertible below.
+
+		if ( 1 !== $found ) {
+			if ( $invalid_length > 0 ) {
+				$next_byte = ''; // Nothing convertible here; the '?' for the invalid span is appended after the flush.
+				goto flush_sub_part;
+			}
+
+			break;
+		}
+
+		// All convertible code points are two bytes long (lead byte 110xxxxx); anything else keeps the '?' replacement.
+		$byte1 = ord( $utf8_text[ $at ] );
+		if ( 0xC0 !== ( $byte1 & 0xE0 ) ) {
+			goto flush_sub_part;
+		}
+
+		// All convertible code points are not greater than U+FF; larger two-byte code points keep the '?' replacement.
+		$byte2 = ord( $utf8_text[ $at + 1 ] );
+		$code_point = ( ( $byte1 & 0x1F ) << 6 ) | ( ( $byte2 & 0x3F ) );
+		if ( $code_point > 0xFF ) {
+			goto flush_sub_part;
+		}
+
+		$next_byte = chr( $code_point );
+
+		flush_sub_part:
+		$iso_8859_1_text .= substr( $utf8_text, $was_at, $at - $was_at );
+		$iso_8859_1_text .= $next_byte;
+		$at              += $span_length;
+		$was_at           = $at;
+
+		if ( $invalid_length > 0 ) { // The whole reported invalid span is replaced by a single '?'.
+			$iso_8859_1_text .= '?';
+			$at              += $invalid_length;
+			$was_at           = $at;
+		}
+	}
+
+	if ( 0 === $was_at ) {
+		return $utf8_text;
+	}
+
+	$iso_8859_1_text .= substr( $utf8_text, $was_at );
+	return $iso_8859_1_text;
+}
diff --git a/src/wp-includes/compat.php b/src/wp-includes/compat.php
@@ -247,6 +247,74 @@ function _mb_strlen( $str, $encoding = null ) {
 		: strlen( $str );
 }
 
+if ( ! function_exists( 'utf8_encode' ) ) :
+	if ( extension_loaded( 'mbstring' ) ) :
+		/**
+		 * Converts a string from ISO-8859-1 to UTF-8. Polyfill defined only when PHP does not provide the function; this variant delegates to mbstring.
+		 *
+		 * @deprecated Use {@see \mb_convert_encoding()} instead.
+		 *
+		 * @since 6.9.0
+		 *
+		 * @param string $iso_8859_1_text Text treated as ISO-8859-1 (latin1) bytes.
+		 * @return string Text converted into UTF-8.
+		 */
+		function utf8_encode( $iso_8859_1_text ): string {
+			_deprecated_function( __FUNCTION__, '6.9.0', 'mb_convert_encoding' );
+
+			return mb_convert_encoding( $iso_8859_1_text, 'UTF-8', 'ISO-8859-1' );
+		}
+
+	else : // Without the mbstring extension, define the polyfill on top of the pure-PHP converter instead.
+		/**
+		 * @ignore
+		 * @private
+		 *
+		 * @since 6.9.0
+		 */
+		function utf8_encode( $iso_8859_1_text ): string {
+			_deprecated_function( __FUNCTION__, '6.9.0', 'mb_convert_encoding' );
+
+			return _wp_utf8_encode_fallback( $iso_8859_1_text );
+		}
+
+	endif;
+endif;
+
+if ( ! function_exists( 'utf8_decode' ) ) :
+	if ( extension_loaded( 'mbstring' ) ) :
+		/**
+		 * Converts a string from UTF-8 to ISO-8859-1. Polyfill defined only when PHP does not provide the function; this variant delegates to mbstring.
+		 *
+		 * @deprecated Use {@see \mb_convert_encoding()} instead.
+		 *
+		 * @since 6.9.0
+		 *
+		 * @param string $utf8_text Text treated as UTF-8.
+		 * @return string Text converted into ISO-8859-1.
+		 */
+		function utf8_decode( $utf8_text ): string {
+			_deprecated_function( __FUNCTION__, '6.9.0', 'mb_convert_encoding' );
+
+			// NOTE(review): unconvertible characters follow the configured mb_substitute_character() (default '?') — confirm this matches the native function.
+			return mb_convert_encoding( $utf8_text, 'ISO-8859-1', 'UTF-8' );
+		}
+
+	else : // Without the mbstring extension, define the polyfill on top of the pure-PHP converter instead.
+		/**
+		 * @ignore
+		 * @private
+		 *
+		 * @since 6.9.0
+		 */
+		function utf8_decode( $utf8_text ): string {
+			_deprecated_function( __FUNCTION__, '6.9.0', 'mb_convert_encoding' );
+
+			return _wp_utf8_decode_fallback( $utf8_text );
+		}
+
+	endif;
+endif;
+
 // sodium_crypto_box() was introduced in PHP 7.2.
 if ( ! function_exists( 'sodium_crypto_box' ) ) {
 	require ABSPATH . WPINC . '/sodium_compat/autoload.php';
diff --git a/tests/phpunit/tests/formatting/deprecatedUtfEncodeDecode.php b/tests/phpunit/tests/formatting/deprecatedUtfEncodeDecode.php
@@ -0,0 +1,114 @@
+<?php
+
+/**
+ * @group formatting
+ */
+class Tests_DeprecatedUtf8EncodeDecodeTest extends WP_UnitTestCase {
+	/**
+	 * Ensures that the fallback for {@see \utf8_encode()} maps every ISO-8859-1 byte (0x00–0xFF) exactly as mbstring does.
+	 *
+	 * @ticket 63863
+	 */
+	public function test_utf8_encode_characters() {
+		for ( $i = 0; $i <= 0xFF; $i++ ) {
+			$c     = chr( $i );
+			$hex_i = strtoupper( str_pad( dechex( $i ), 2, '0', STR_PAD_LEFT ) );
+
+			$this->assertSame(
+				bin2hex( mb_convert_encoding( $c, 'UTF-8', 'ISO-8859-1' ) ),
+				bin2hex( _wp_utf8_encode_fallback( $c ) ),
+				"Failed to convert U+{$hex_i} properly."
+			);
+		}
+	}
+
+	/**
+	 * Ensures that the fallback for {@see \utf8_encode()} properly
+	 * matches the legacy behavior for a given set of test cases.
+	 *
+	 * @ticket 63863
+	 *
+	 * @dataProvider data_utf8_strings
+	 */
+	public function test_utf8_encode_cases( $input ) {
+		$this->assertSame(
+			mb_convert_encoding( $input, 'UTF-8', 'ISO-8859-1' ),
+			_wp_utf8_encode_fallback( $input ),
+			'Failed to properly convert.'
+		);
+	}
+
+	/**
+	 * Data provider: UTF-8 encoded strings, both well-formed and deliberately truncated or malformed.
+	 *
+	 * @return array[]
+	 */
+	public static function data_utf8_strings() {
+		return array(
+			'Basic valid string' => array( 'Dan eats cinnamon toast.' ),
+			'Valid with Emoji'   => array( 'The best Emoji is 🅰.' ),
+			'Truncated bytes'    => array( substr( 'England has 🏴󠁧󠁢󠁥󠁮󠁧󠁿', 0, -1 ) ),
+			'Minimal subpart'    => array( "One \xC0, two \xE2\x80, three \xF0\x95\x85." ),
+		);
+	}
+
+	/**
+	 * Ensures that the fallback for {@see \utf8_decode()} maps the UTF-8 characters properly.
+	 *
+	 * @ticket 63863
+	 */
+	public function test_utf8_decode_characters() {
+		for ( $i = 0; $i <= 0x10FFFF; $i++ ) {
+			$hex_i = strtoupper( str_pad( dechex( $i ), 2, '0', STR_PAD_LEFT ) );
+
+			if ( $i < 0xD800 || $i > 0xDFFF ) {
+				$c = mb_chr( $i );
+			} else {
+				/*
+				 * The UTF-16 surrogate halves (U+D800–U+DFFF) are not valid Unicode characters, so they
+				 * are built by hand as three raw bytes with chr(), which yields invalid UTF-8.
+				 */
+				$byte1 = chr( 0xE0 | ( $i >> 12 ) );
+				$byte2 = chr( 0x80 | ( ( $i >> 6 ) & 0x3F ) );
+				$byte3 = chr( 0x80 | ( $i & 0x3F ) );
+
+				$c = "{$byte1}{$byte2}{$byte3}";
+			}
+
+			$this->assertSame(
+				bin2hex( mb_convert_encoding( $c, 'ISO-8859-1', 'UTF-8' ) ),
+				bin2hex( _wp_utf8_decode_fallback( $c ) ),
+				"Failed to convert U+{$hex_i} properly."
+			);
+		}
+	}
+
+	/**
+	 * Ensures that the fallback for {@see \utf8_decode()} properly
+	 * matches the legacy behavior for a given set of test cases.
+	 *
+	 * @ticket 63863
+	 *
+	 * @dataProvider data_iso_8859_1_strings
+	 */
+	public function test_utf8_decode_cases( $input ) {
+		$this->assertSame(
+			mb_convert_encoding( $input, 'ISO-8859-1', 'UTF-8' ),
+			_wp_utf8_decode_fallback( $input ),
+			'Failed to properly convert.'
+		);
+	}
+
+	/**
+	 * Data provider: strings holding raw ISO-8859-1 bytes (double-quoted so the \x escapes become real bytes).
+	 *
+	 * @return array[]
+	 */
+	public static function data_iso_8859_1_strings() {
+		return array(
+			'Basic valid string'     => array( 'Dan eats cinnamon toast' ),
+			'Latin1 supplement'      => array( "Pi\xF1a is another name for Pineapple." ),
+			'Bytes as invalid UTF-8' => array( "The \x95 is invalid UTF-8." ),
+		);
+	}
+}