Skip to content

Commit cd61711

Browse files
committed
Charset: Conditionally polyfill utf8_encode() and utf8_decode().
The `utf8_encode()` and `utf8_decode()` functions were deprecated in PHP 8.2.0 and will be removed in PHP 9.0. When that happens, any existing code which calls them will trigger a crash. This patch introduces polyfills for those functions when they aren’t already present. The polyfill functions maintain backwards compatibility, including a deprecation notice. Any code calling either of these functions ought to be refactored to avoid using them; there are better options which don’t carry the issues these functions do, and any code calling them is likely calling them inappropriately. Developed in #10011 Discussed in https://core.trac.wordpress.org/ticket/55603 Discussed in https://core.trac.wordpress.org/ticket/63863 See #63863. git-svn-id: https://develop.svn.wordpress.org/trunk@60950 602fd350-edb4-49c9-b593-d223f7449a82
1 parent 8508427 commit cd61711

File tree

3 files changed

+321
-0
lines changed

3 files changed

+321
-0
lines changed

src/wp-includes/compat-utf8.php

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,3 +337,142 @@ function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $ma
337337

338338
return $count;
339339
}
340+
341+
/**
 * Re-encodes ISO-8859-1 (latin1) bytes as UTF-8 without relying on the
 * deprecated PHP standard library function.
 *
 * Bytes 0x00–0x7F are copied through unchanged; every byte from 0x80–0xFF
 * is expanded into the two-byte UTF-8 sequence for the code point of the
 * same value.
 *
 * @since 6.9.0
 * @access private
 *
 * @see \utf8_encode()
 *
 * @param string $iso_8859_1_text Text treated as ISO-8859-1 (latin1) bytes.
 * @return string Text converted into UTF-8.
 */
function _wp_utf8_encode_fallback( $iso_8859_1_text ) {
	$iso_8859_1_text = (string) $iso_8859_1_text;
	$length          = strlen( $iso_8859_1_text );
	$offset          = 0;
	$copied_up_to    = 0;
	$output          = '';

	// The complete US-ASCII range (0x00–0x7F), which is encoded identically in both charsets.
	$ascii_bytes =
		"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
		"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
		" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f";

	while ( $offset < $length ) {
		// Jump over any run of ASCII; it is copied in bulk once a non-ASCII byte is found.
		$offset += strspn( $iso_8859_1_text, $ascii_bytes, $offset );
		if ( $offset >= $length ) {
			break;
		}

		// The byte at the cursor is in 0x80–0xFF and becomes a two-byte sequence.
		$latin1_byte = ord( $iso_8859_1_text[ $offset ] );
		$lead_byte   = chr( 0xC0 | ( $latin1_byte >> 6 ) );
		$trail_byte  = chr( 0x80 | ( $latin1_byte & 0x3F ) );

		// Flush the untouched ASCII stretch preceding this byte, then the converted pair.
		$output .= substr( $iso_8859_1_text, $copied_up_to, $offset - $copied_up_to );
		$output .= "{$lead_byte}{$trail_byte}";

		$copied_up_to = ++$offset;
	}

	// Nothing was converted: the input was entirely ASCII (or empty).
	if ( 0 === $copied_up_to ) {
		return $iso_8859_1_text;
	}

	// Append whatever ASCII remains after the last converted byte.
	$output .= substr( $iso_8859_1_text, $copied_up_to );
	return $output;
}
394+
395+
/**
 * Re-encodes UTF-8 text as ISO-8859-1 (latin1) without relying on the
 * deprecated PHP standard library function.
 *
 * ASCII bytes are copied through unchanged. A scanned code point is emitted
 * as a single byte when it is a two-byte sequence whose value is no greater
 * than U+FF; any other scanned code point, and any span reported as invalid
 * by the scanner, is replaced with `?`.
 *
 * @since 6.9.0
 * @access private
 *
 * @see \utf8_decode()
 *
 * @param string $utf8_text Text treated as UTF-8 bytes.
 * @return string Text converted into ISO-8859-1.
 */
function _wp_utf8_decode_fallback( $utf8_text ) {
	$utf8_text    = (string) $utf8_text;
	$length       = strlen( $utf8_text );
	$offset       = 0;
	$copied_up_to = 0;
	$latin1       = '';

	// The complete US-ASCII range (0x00–0x7F), which is encoded identically in both charsets.
	$ascii_bytes =
		"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
		"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
		" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f";

	while ( $offset < $length ) {
		// Jump over any run of ASCII; it is copied in bulk at the next flush.
		$ascii_run = strspn( $utf8_text, $ascii_bytes, $offset );
		if ( $ascii_run > 0 ) {
			$offset += $ascii_run;
			continue;
		}

		/*
		 * Ask the scanner about what follows the cursor. It is expected to
		 * advance $scan_end past what it accepted and to report the size of a
		 * following invalid span through $invalid_length.
		 * NOTE(review): the trailing `null, 1` arguments presumably limit the
		 * scan to one code point; confirm against _wp_scan_utf8().
		 */
		$scan_end       = $offset;
		$invalid_length = 0;
		$found          = _wp_scan_utf8( $utf8_text, $scan_end, $invalid_length, null, 1 );
		$span_length    = $scan_end - $offset;

		// Neither a code point nor an invalid span was reported: stop scanning.
		if ( 1 !== $found && ! ( $invalid_length > 0 ) ) {
			break;
		}

		if ( 1 !== $found ) {
			// Only an invalid span was reported; its `?` is added below.
			$replacement = '';
		} else {
			// Assume the code point is not representable until shown otherwise.
			$replacement = '?';
			$lead_byte   = ord( $utf8_text[ $offset ] );

			// All convertible code points are two bytes long…
			if ( 0xC0 === ( $lead_byte & 0xE0 ) ) {
				$trail_byte = ord( $utf8_text[ $offset + 1 ] );
				$code_point = ( ( $lead_byte & 0x1F ) << 6 ) | ( ( $trail_byte & 0x3F ) );

				// …and are not greater than U+FF.
				if ( $code_point <= 0xFF ) {
					$replacement = chr( $code_point );
				}
			}
		}

		// Flush the pending ASCII stretch, then the replacement for the scanned span.
		$latin1      .= substr( $utf8_text, $copied_up_to, $offset - $copied_up_to );
		$latin1      .= $replacement;
		$offset      += $span_length;
		$copied_up_to = $offset;

		// A reported invalid span collapses into a single `?`.
		if ( $invalid_length > 0 ) {
			$latin1      .= '?';
			$offset      += $invalid_length;
			$copied_up_to = $offset;
		}
	}

	// Nothing was flushed: the input can be returned untouched.
	if ( 0 === $copied_up_to ) {
		return $utf8_text;
	}

	// Append whatever remains after the last flushed position.
	$latin1 .= substr( $utf8_text, $copied_up_to );
	return $latin1;
}

src/wp-includes/compat.php

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,74 @@ function _mb_strlen( $str, $encoding = null ) {
247247
: strlen( $str );
248248
}
249249

250+
// Provide utf8_encode() only when PHP itself no longer does.
if ( ! function_exists( 'utf8_encode' ) ) {
	if ( ! extension_loaded( 'mbstring' ) ) {
		/**
		 * Variant used when the mbstring extension is unavailable.
		 *
		 * @ignore
		 * @private
		 *
		 * @since 6.9.0
		 */
		function utf8_encode( $iso_8859_1_text ): string {
			_deprecated_function( __FUNCTION__, '6.9.0', 'mb_convert_encoding' );

			return _wp_utf8_encode_fallback( $iso_8859_1_text );
		}
	} else {
		/**
		 * Converts a string from ISO-8859-1 to UTF-8.
		 *
		 * Reports a deprecation and delegates the conversion to mbstring.
		 *
		 * @deprecated Use {@see \mb_convert_encoding()} instead.
		 *
		 * @since 6.9.0
		 *
		 * @param string $iso_8859_1_text Text treated as ISO-8859-1 (latin1) bytes.
		 * @return string Text converted into UTF-8.
		 */
		function utf8_encode( $iso_8859_1_text ): string {
			_deprecated_function( __FUNCTION__, '6.9.0', 'mb_convert_encoding' );

			return mb_convert_encoding( $iso_8859_1_text, 'UTF-8', 'ISO-8859-1' );
		}
	}
}
283+
284+
if ( ! function_exists( 'utf8_decode' ) ) :
	if ( extension_loaded( 'mbstring' ) ) :
		/**
		 * Converts a string from UTF-8 to ISO-8859-1.
		 *
		 * Input which cannot be represented in ISO-8859-1 is replaced with `?`.
		 * The mbstring substitute character is a process-wide setting, so it is
		 * pinned to `?` for the duration of the conversion and restored afterwards;
		 * this keeps the output the same as the non-mbstring variant regardless of
		 * how the setting has been configured elsewhere.
		 *
		 * @deprecated Use {@see \mb_convert_encoding()} instead.
		 *
		 * @since 6.9.0
		 *
		 * @param string $utf8_text Text treated as UTF-8.
		 * @return string Text converted into ISO-8859-1.
		 */
		function utf8_decode( $utf8_text ): string {
			_deprecated_function( __FUNCTION__, '6.9.0', 'mb_convert_encoding' );

			// Remember the caller's setting: "none", "long", "entity", or a code point.
			$previous_substitute = mb_substitute_character();
			mb_substitute_character( 0x3F );

			try {
				return mb_convert_encoding( $utf8_text, 'ISO-8859-1', 'UTF-8' );
			} finally {
				// Always restore the setting, even if the conversion throws.
				mb_substitute_character( $previous_substitute );
			}
		}

	else :
		/**
		 * Variant used when the mbstring extension is unavailable.
		 *
		 * @ignore
		 * @private
		 *
		 * @since 6.9.0
		 */
		function utf8_decode( $utf8_text ): string {
			_deprecated_function( __FUNCTION__, '6.9.0', 'mb_convert_encoding' );

			return _wp_utf8_decode_fallback( $utf8_text );
		}

	endif;
endif;
317+
250318
// sodium_crypto_box() was introduced in PHP 7.2.
251319
if ( ! function_exists( 'sodium_crypto_box' ) ) {
252320
require ABSPATH . WPINC . '/sodium_compat/autoload.php';
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
<?php
2+
3+
/**
 * Verifies that the pure-PHP fallbacks for the deprecated `utf8_encode()` and
 * `utf8_decode()` functions produce the same output as `mb_convert_encoding()`.
 *
 * @group formatting
 */
class Tests_DeprecatedUtf8EncodeDecodeTest extends WP_UnitTestCase {
	/**
	 * Ensures that the fallback for {@see \utf8_encode()} maps the ISO-8859-1 characters properly.
	 *
	 * @ticket 63863
	 */
	public function test_utf8_encode_characters() {
		for ( $i = 0; $i <= 0xFF; $i++ ) {
			$c     = chr( $i );
			$hex_i = strtoupper( str_pad( dechex( $i ), 2, '0', STR_PAD_LEFT ) );

			$this->assertSame(
				bin2hex( mb_convert_encoding( $c, 'UTF-8', 'ISO-8859-1' ) ),
				bin2hex( _wp_utf8_encode_fallback( $c ) ),
				"Failed to convert U+{$hex_i} properly."
			);
		}
	}

	/**
	 * Ensures that the fallback for {@see \utf8_encode()} properly
	 * matches the legacy behavior for a given set of test cases.
	 *
	 * @ticket 63863
	 *
	 * @dataProvider data_utf8_strings
	 *
	 * @param string $input Bytes to convert as though they were ISO-8859-1.
	 */
	public function test_utf8_encode_cases( $input ) {
		$this->assertSame(
			mb_convert_encoding( $input, 'UTF-8', 'ISO-8859-1' ),
			_wp_utf8_encode_fallback( $input ),
			'Failed to properly convert.'
		);
	}

	/**
	 * Data provider.
	 *
	 * @return array[]
	 */
	public static function data_utf8_strings() {
		return array(
			'Basic valid string' => array( 'Dan eats cinnamon toast.' ),
			'Valid with Emoji'   => array( 'The best Emoji is 🅰.' ),
			'Truncated bytes'    => array( substr( 'England has 🏴󠁧󠁢󠁥󠁮󠁧󠁿', 0, -1 ) ),
			'Minimal subpart'    => array( "One \xC0, two \xE2\x80, three \xF0\x95\x85." ),
		);
	}

	/**
	 * Ensures that the fallback for {@see \utf8_decode()} maps the UTF-8 characters properly.
	 *
	 * @ticket 63863
	 */
	public function test_utf8_decode_characters() {
		for ( $i = 0; $i <= 0x10FFFF; $i++ ) {
			$hex_i = strtoupper( str_pad( dechex( $i ), 2, '0', STR_PAD_LEFT ) );

			// The surrogate halves occupy U+D800–U+DFFF; U+E000 is an ordinary code point.
			if ( $i < 0xD800 || $i > 0xDFFF ) {
				$c = mb_chr( $i );
			} else {
				/*
				 * Since the UTF-16 surrogate halves are not valid Unicode characters,
				 * these have to be manually constructed as invalid UTF-8.
				 */
				$byte1 = 0xE0 | ( $i >> 12 );
				$byte2 = 0x80 | ( ( $i >> 6 ) & 0x3F );
				$byte3 = 0x80 | ( $i & 0x3F );

				// Convert each value into a raw byte; interpolating the integers would yield decimal digits.
				$c = chr( $byte1 ) . chr( $byte2 ) . chr( $byte3 );
			}

			$this->assertSame(
				bin2hex( mb_convert_encoding( $c, 'ISO-8859-1', 'UTF-8' ) ),
				bin2hex( _wp_utf8_decode_fallback( $c ) ),
				"Failed to convert U+{$hex_i} properly."
			);
		}
	}

	/**
	 * Ensures that the fallback for {@see \utf8_decode()} properly
	 * matches the legacy behavior for a given set of test cases.
	 *
	 * @ticket 63863
	 *
	 * @dataProvider data_iso_8859_1_strings
	 *
	 * @param string $input Bytes to convert as though they were UTF-8.
	 */
	public function test_utf8_decode_cases( $input ) {
		$this->assertSame(
			mb_convert_encoding( $input, 'ISO-8859-1', 'UTF-8' ),
			_wp_utf8_decode_fallback( $input ),
			'Failed to properly convert.'
		);
	}

	/**
	 * Data provider.
	 *
	 * The escapes must live in double-quoted strings; PHP does not interpret
	 * `\x` sequences inside single quotes, which would leave the literal
	 * characters in place of the intended bytes.
	 *
	 * @return array[]
	 */
	public static function data_iso_8859_1_strings() {
		return array(
			'Basic valid string'     => array( 'Dan eats cinnamon toast' ),
			'Latin1 supplement'      => array( "Pi\xF1a is another name for Pineapple." ),
			'Bytes as invalid UTF-8' => array( "The \x95 is invalid UTF-8." ),
		);
	}
}

0 commit comments

Comments
 (0)