Normalize UTF-8 charset slug detection.

dmsnell · dmsnell · commit d4967a361925 · 2024-05-14T18:03:43.000Z
There are several existing places in Core that attempt to detect if a blog charset is UTF-8. Each place attempts to perform the same check, except the logic is spread throughout and there's no single method provided to make this determination in a consistent way. The `_canonical_charset()` method exists, but is marked as private. In this patch the new `unicode` module provides `is_utf8_charset()` as a method taking an optional charset slug and indicating if it represents UTF-8, examining all of the allowable variants of that slug. Associated code is updated to use this new function, including `_canonical_charset()`. If no slug is provided, it will look up the current `get_option( 'blog_charset' )`. Finally, the test functions governing `_canonical_charset()` have been rewritten as a single test with a data provider instead of as separate test functions. Developed in WordPress#6535 Discussed in https://core.trac.wordpress.org/ticket/61182 Fixes #61182. Props dmsnell, jonsurrell. git-svn-id: https://develop.svn.wordpress.org/trunk@58147 602fd350-edb4-49c9-b593-d223f7449a82
diff --git a/src/wp-admin/options-reading.php b/src/wp-admin/options-reading.php
@@ -64,7 +64,7 @@
 <?php
 settings_fields( 'reading' );
 
-if ( ! in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
+if ( ! is_utf8_charset() ) {
 	add_settings_field( 'blog_charset', __( 'Encoding for pages and feeds' ), 'options_reading_blog_charset', 'reading', 'default', array( 'label_for' => 'blog_charset' ) );
 }
 ?>
diff --git a/src/wp-admin/options.php b/src/wp-admin/options.php
@@ -160,7 +160,7 @@
 
 $mail_options = array( 'mailserver_url', 'mailserver_port', 'mailserver_login', 'mailserver_pass' );
 
-if ( ! in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
+if ( ! is_utf8_charset() ) {
 	$allowed_options['reading'][] = 'blog_charset';
 }
 
diff --git a/src/wp-includes/compat.php b/src/wp-includes/compat.php
@@ -91,7 +91,7 @@ function _mb_substr( $str, $start, $length = null, $encoding = null ) {
 	 * The solution below works only for UTF-8, so in case of a different
 	 * charset just use built-in substr().
 	 */
-	if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
+	if ( ! is_utf8_charset( $encoding ) ) {
 		return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length );
 	}
 
@@ -176,7 +176,7 @@ function _mb_strlen( $str, $encoding = null ) {
 	 * The solution below works only for UTF-8, so in case of a different charset
 	 * just use built-in strlen().
 	 */
-	if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
+	if ( ! is_utf8_charset( $encoding ) ) {
 		return strlen( $str );
 	}
 
diff --git a/src/wp-includes/formatting.php b/src/wp-includes/formatting.php
@@ -960,19 +960,7 @@ function _wp_specialchars( $text, $quote_style = ENT_NOQUOTES, $charset = false,
 		$quote_style = ENT_QUOTES;
 	}
 
-	// Store the site charset as a static to avoid multiple calls to wp_load_alloptions().
-	if ( ! $charset ) {
-		static $_charset = null;
-		if ( ! isset( $_charset ) ) {
-			$alloptions = wp_load_alloptions();
-			$_charset   = isset( $alloptions['blog_charset'] ) ? $alloptions['blog_charset'] : '';
-		}
-		$charset = $_charset;
-	}
-
-	if ( in_array( $charset, array( 'utf8', 'utf-8', 'UTF8' ), true ) ) {
-		$charset = 'UTF-8';
-	}
+	$charset = _canonical_charset( $charset ? $charset : get_option( 'blog_charset' ) );
 
 	$_quote_style = $quote_style;
 
@@ -1114,7 +1102,7 @@ function wp_check_invalid_utf8( $text, $strip = false ) {
 	// Store the site charset as a static to avoid multiple calls to get_option().
 	static $is_utf8 = null;
 	if ( ! isset( $is_utf8 ) ) {
-		$is_utf8 = in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true );
+		$is_utf8 = is_utf8_charset();
 	}
 	if ( ! $is_utf8 ) {
 		return $text;
diff --git a/src/wp-includes/functions.php b/src/wp-includes/functions.php
@@ -7474,17 +7474,27 @@ function get_tag_regex( $tag ) {
  *
  * @see https://core.trac.wordpress.org/ticket/23688
  *
- * @param string $charset A charset name.
+ * @param string $charset A charset name, e.g. "UTF-8", "Windows-1252", "SJIS".
  * @return string The canonical form of the charset.
  */
 function _canonical_charset( $charset ) {
-	if ( 'utf-8' === strtolower( $charset ) || 'utf8' === strtolower( $charset ) ) {
-
+	if ( is_utf8_charset( $charset ) ) {
 		return 'UTF-8';
 	}
 
-	if ( 'iso-8859-1' === strtolower( $charset ) || 'iso8859-1' === strtolower( $charset ) ) {
-
+	/*
+	 * Normalize the ISO-8859-1 family of languages.
+	 *
+	 * This is not required for htmlspecialchars(), as it properly recognizes all of
+	 * the input character sets that here are transformed into "ISO-8859-1".
+	 *
+	 * @todo Should this entire check be removed since it's not required for the stated purpose?
+	 * @todo Should WordPress transform other potential charset equivalents, such as "latin1"?
+	 */
+	if (
+		( 0 === strcasecmp( 'iso-8859-1', $charset ) ) ||
+		( 0 === strcasecmp( 'iso8859-1', $charset ) )
+	) {
 		return 'ISO-8859-1';
 	}
 
diff --git a/src/wp-settings.php b/src/wp-settings.php
@@ -106,6 +106,7 @@
 wp_set_lang_dir();
 
 // Load early WordPress files.
+require ABSPATH . WPINC . '/unicode.php';
 require ABSPATH . WPINC . '/class-wp-list-util.php';
 require ABSPATH . WPINC . '/formatting.php';
 require ABSPATH . WPINC . '/meta.php';
diff --git a/tests/phpunit/tests/functions/canonicalCharset.php b/tests/phpunit/tests/functions/canonicalCharset.php
@@ -10,45 +10,54 @@
  * @covers ::_canonical_charset
  */
 class Tests_Functions_CanonicalCharset extends WP_UnitTestCase {
-
-	public function test_utf_8_lower() {
-		$this->assertSame( 'UTF-8', _canonical_charset( 'utf-8' ) );
-	}
-
-	public function test_utf_8_upper() {
-		$this->assertSame( 'UTF-8', _canonical_charset( 'UTF-8' ) );
-	}
-
-	public function test_utf_8_mixxed() {
-		$this->assertSame( 'UTF-8', _canonical_charset( 'Utf-8' ) );
-	}
-
-	public function test_utf_8() {
-		$this->assertSame( 'UTF-8', _canonical_charset( 'UTF8' ) );
-	}
-
-	public function test_iso_lower() {
-		$this->assertSame( 'ISO-8859-1', _canonical_charset( 'iso-8859-1' ) );
-	}
-
-	public function test_iso_upper() {
-		$this->assertSame( 'ISO-8859-1', _canonical_charset( 'ISO-8859-1' ) );
-	}
-
-	public function test_iso_mixxed() {
-		$this->assertSame( 'ISO-8859-1', _canonical_charset( 'Iso8859-1' ) );
-	}
-
-	public function test_iso() {
-		$this->assertSame( 'ISO-8859-1', _canonical_charset( 'ISO8859-1' ) );
-	}
-
-	public function test_random() {
-		$this->assertSame( 'random', _canonical_charset( 'random' ) );
+	/**
+	 * Ensures that charset variants for common encodings normalize to the expected form.
+	 *
+	 * @ticket 61182
+	 *
+	 * @dataProvider data_charset_normalizations
+	 *
+	 * @param string $given_charset      Potential charset provided by user.
+	 * @param string $normalized_charset Expected normalized form of charset.
+	 */
+	public function test_properly_normalizes_charset_variants( $given_charset, $normalized_charset ) {
+		$this->assertSame(
+			$normalized_charset,
+			_canonical_charset( $given_charset ),
+			'Did not properly transform the provided charset into its normalized form.'
+		);
 	}
 
-	public function test_empty() {
-		$this->assertSame( '', _canonical_charset( '' ) );
+	/**
+	 * Data provider: pairs of a given charset slug and its expected normalized form.
+	 *
+	 * @return array[]
+	 */
+	public static function data_charset_normalizations() {
+		return array(
+			// UTF-8 family.
+			array( 'UTF-8', 'UTF-8' ),
+			array( 'utf-8', 'UTF-8' ),
+			array( 'Utf-8', 'UTF-8' ),
+			array( 'UTF8', 'UTF-8' ),
+
+			// Almost UTF-8.
+			array( 'UTF-8*', 'UTF-8*' ),
+			array( 'UTF.8', 'UTF.8' ),
+			array( 'UTF88', 'UTF88' ),
+			array( 'UTF-7', 'UTF-7' ),
+			array( 'X-UTF-8', 'X-UTF-8' ),
+
+			// ISO-8859-1 family.
+			array( 'iso-8859-1', 'ISO-8859-1' ),
+			array( 'ISO-8859-1', 'ISO-8859-1' ),
+			array( 'Iso-8859-1', 'ISO-8859-1' ),
+			array( 'ISO8859-1', 'ISO-8859-1' ),
+
+			// Other charset slugs should not be adjusted.
+			array( 'random', 'random' ),
+			array( '', '' ),
+		);
 	}
 
 	/**

Original file line number	Diff line number	Diff line change
`@@ -64,7 +64,7 @@`
`64`	`64`	`<?php`
`65`	`65`	`settings_fields( 'reading' );`
`66`	`66`
`67`		`-if ( ! in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {`
	`67`	`+if ( ! is_utf8_charset() ) {`
`68`	`68`	`add_settings_field( 'blog_charset', __( 'Encoding for pages and feeds' ), 'options_reading_blog_charset', 'reading', 'default', array( 'label_for' => 'blog_charset' ) );`
`69`	`69`	`}`
`70`	`70`	`?>`
Original file line number	Diff line number	Diff line change
`@@ -160,7 +160,7 @@`
`160`	`160`
`161`	`161`	`$mail_options = array( 'mailserver_url', 'mailserver_port', 'mailserver_login', 'mailserver_pass' );`
`162`	`162`
`163`		`-if ( ! in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {`
	`163`	`+if ( ! is_utf8_charset() ) {`
`164`	`164`	`$allowed_options['reading'][] = 'blog_charset';`
`165`	`165`	`}`
`166`	`166`