Skip to content

Commit d4967a3

Browse files
committed
Normalize UTF-8 charset slug detection.
There are several existing places in Core that attempt to detect whether a blog charset is UTF-8. Each place attempts to perform the same check, but the logic is spread throughout and there's no single function provided to make this determination in a consistent way. The `_canonical_charset()` function exists, but is marked as private. In this patch the new `unicode` module provides `is_utf8_charset()` as a function taking an optional charset slug and indicating whether it represents UTF-8, examining all of the allowable variants of that slug. Associated code is updated to use this new function, including `_canonical_charset()`. If no slug is provided, it will look up the current `get_option( 'blog_charset' )`. Finally, the test functions governing `_canonical_charset()` have been rewritten as a single test with a data provider instead of as separate test functions. Developed in WordPress#6535. Discussed in https://core.trac.wordpress.org/ticket/61182 Fixes #61182. Props dmsnell, jonsurrell. git-svn-id: https://develop.svn.wordpress.org/trunk@58147 602fd350-edb4-49c9-b593-d223f7449a82
1 parent c3a4e8b commit d4967a3

File tree

7 files changed

+68
-60
lines changed

7 files changed

+68
-60
lines changed

src/wp-admin/options-reading.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@
6464
<?php
6565
settings_fields( 'reading' );
6666

67-
if ( ! in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
67+
if ( ! is_utf8_charset() ) {
6868
add_settings_field( 'blog_charset', __( 'Encoding for pages and feeds' ), 'options_reading_blog_charset', 'reading', 'default', array( 'label_for' => 'blog_charset' ) );
6969
}
7070
?>

src/wp-admin/options.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@
160160

161161
$mail_options = array( 'mailserver_url', 'mailserver_port', 'mailserver_login', 'mailserver_pass' );
162162

163-
if ( ! in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
163+
if ( ! is_utf8_charset() ) {
164164
$allowed_options['reading'][] = 'blog_charset';
165165
}
166166

src/wp-includes/compat.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ function _mb_substr( $str, $start, $length = null, $encoding = null ) {
9191
* The solution below works only for UTF-8, so in case of a different
9292
* charset just use built-in substr().
9393
*/
94-
if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
94+
if ( ! is_utf8_charset( $encoding ) ) {
9595
return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length );
9696
}
9797

@@ -176,7 +176,7 @@ function _mb_strlen( $str, $encoding = null ) {
176176
* The solution below works only for UTF-8, so in case of a different charset
177177
* just use built-in strlen().
178178
*/
179-
if ( ! in_array( $encoding, array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true ) ) {
179+
if ( ! is_utf8_charset( $encoding ) ) {
180180
return strlen( $str );
181181
}
182182

src/wp-includes/formatting.php

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -960,19 +960,7 @@ function _wp_specialchars( $text, $quote_style = ENT_NOQUOTES, $charset = false,
960960
$quote_style = ENT_QUOTES;
961961
}
962962

963-
// Store the site charset as a static to avoid multiple calls to wp_load_alloptions().
964-
if ( ! $charset ) {
965-
static $_charset = null;
966-
if ( ! isset( $_charset ) ) {
967-
$alloptions = wp_load_alloptions();
968-
$_charset = isset( $alloptions['blog_charset'] ) ? $alloptions['blog_charset'] : '';
969-
}
970-
$charset = $_charset;
971-
}
972-
973-
if ( in_array( $charset, array( 'utf8', 'utf-8', 'UTF8' ), true ) ) {
974-
$charset = 'UTF-8';
975-
}
963+
$charset = _canonical_charset( $charset ? $charset : get_option( 'blog_charset' ) );
976964

977965
$_quote_style = $quote_style;
978966

@@ -1114,7 +1102,7 @@ function wp_check_invalid_utf8( $text, $strip = false ) {
11141102
// Store the site charset as a static to avoid multiple calls to get_option().
11151103
static $is_utf8 = null;
11161104
if ( ! isset( $is_utf8 ) ) {
1117-
$is_utf8 = in_array( get_option( 'blog_charset' ), array( 'utf8', 'utf-8', 'UTF8', 'UTF-8' ), true );
1105+
$is_utf8 = is_utf8_charset();
11181106
}
11191107
if ( ! $is_utf8 ) {
11201108
return $text;

src/wp-includes/functions.php

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7474,17 +7474,27 @@ function get_tag_regex( $tag ) {
74747474
*
74757475
* @see https://core.trac.wordpress.org/ticket/23688
74767476
*
7477-
* @param string $charset A charset name.
7477+
* @param string $charset A charset name, e.g. "UTF-8", "Windows-1252", "SJIS".
74787478
* @return string The canonical form of the charset.
74797479
*/
74807480
function _canonical_charset( $charset ) {
7481-
if ( 'utf-8' === strtolower( $charset ) || 'utf8' === strtolower( $charset ) ) {
7482-
7481+
if ( is_utf8_charset( $charset ) ) {
74837482
return 'UTF-8';
74847483
}
74857484

7486-
if ( 'iso-8859-1' === strtolower( $charset ) || 'iso8859-1' === strtolower( $charset ) ) {
7487-
7485+
/*
7486+
* Normalize the ISO-8859-1 family of languages.
7487+
*
7488+
* This is not required for htmlspecialchars(), as it properly recognizes all of
7489+
* the input character sets that here are transformed into "ISO-8859-1".
7490+
*
7491+
* @todo Should this entire check be removed since it's not required for the stated purpose?
7492+
* @todo Should WordPress transform other potential charset equivalents, such as "latin1"?
7493+
*/
7494+
if (
7495+
( 0 === strcasecmp( 'iso-8859-1', $charset ) ) ||
7496+
( 0 === strcasecmp( 'iso8859-1', $charset ) )
7497+
) {
74887498
return 'ISO-8859-1';
74897499
}
74907500

src/wp-settings.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,7 @@
106106
wp_set_lang_dir();
107107

108108
// Load early WordPress files.
109+
require ABSPATH . WPINC . '/unicode.php';
109110
require ABSPATH . WPINC . '/class-wp-list-util.php';
110111
require ABSPATH . WPINC . '/formatting.php';
111112
require ABSPATH . WPINC . '/meta.php';

tests/phpunit/tests/functions/canonicalCharset.php

Lines changed: 46 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -10,45 +10,54 @@
1010
* @covers ::_canonical_charset
1111
*/
1212
class Tests_Functions_CanonicalCharset extends WP_UnitTestCase {
13-
14-
public function test_utf_8_lower() {
15-
$this->assertSame( 'UTF-8', _canonical_charset( 'utf-8' ) );
16-
}
17-
18-
public function test_utf_8_upper() {
19-
$this->assertSame( 'UTF-8', _canonical_charset( 'UTF-8' ) );
20-
}
21-
22-
public function test_utf_8_mixxed() {
23-
$this->assertSame( 'UTF-8', _canonical_charset( 'Utf-8' ) );
24-
}
25-
26-
public function test_utf_8() {
27-
$this->assertSame( 'UTF-8', _canonical_charset( 'UTF8' ) );
28-
}
29-
30-
public function test_iso_lower() {
31-
$this->assertSame( 'ISO-8859-1', _canonical_charset( 'iso-8859-1' ) );
32-
}
33-
34-
public function test_iso_upper() {
35-
$this->assertSame( 'ISO-8859-1', _canonical_charset( 'ISO-8859-1' ) );
36-
}
37-
38-
public function test_iso_mixxed() {
39-
$this->assertSame( 'ISO-8859-1', _canonical_charset( 'Iso8859-1' ) );
40-
}
41-
42-
public function test_iso() {
43-
$this->assertSame( 'ISO-8859-1', _canonical_charset( 'ISO8859-1' ) );
44-
}
45-
46-
public function test_random() {
47-
$this->assertSame( 'random', _canonical_charset( 'random' ) );
13+
/**
14+
* Ensures that charset variants for common encodings normalize to the expected form.
15+
*
16+
* @ticket 61182
17+
*
18+
* @dataProvider data_charset_normalizations
19+
*
20+
* @param string $given_charset Potential charset provided by user.
21+
* @param string $normalized_charset Expected normalized form of charset.
22+
*/
23+
public function test_properly_normalizes_charset_variants( $given_charset, $normalized_charset ) {
24+
$this->assertSame(
25+
$normalized_charset,
26+
_canonical_charset( $given_charset ),
27+
'Did not properly transform the provided charset into its normalized form.'
28+
);
4829
}
4930

50-
public function test_empty() {
51-
$this->assertSame( '', _canonical_charset( '' ) );
31+
/**
32+
* Data provider.
33+
*
34+
* @return array[].
35+
*/
36+
public static function data_charset_normalizations() {
37+
return array(
38+
// UTF-8 family.
39+
array( 'UTF-8', 'UTF-8' ),
40+
array( 'Utf-8', 'UTF-8' ),
41+
array( 'Utf-8', 'UTF-8' ),
42+
array( 'UTF8', 'UTF-8' ),
43+
44+
// Almost UTF-8.
45+
array( 'UTF-8*', 'UTF-8*' ),
46+
array( 'UTF.8', 'UTF.8' ),
47+
array( 'UTF88', 'UTF88' ),
48+
array( 'UTF-7', 'UTF-7' ),
49+
array( 'X-UTF-8', 'X-UTF-8' ),
50+
51+
// ISO-8859-1 family.
52+
array( 'iso-8859-1', 'ISO-8859-1' ),
53+
array( 'ISO-8859-1', 'ISO-8859-1' ),
54+
array( 'Iso-8859-1', 'ISO-8859-1' ),
55+
array( 'ISO8859-1', 'ISO-8859-1' ),
56+
57+
// Other charset slugs should not be adjusted.
58+
array( 'random', 'random' ),
59+
array( '', '' ),
60+
);
5261
}
5362

5463
/**

0 commit comments

Comments
 (0)