Skip to content

Commit 8ec91a4

Browse files
committed
Charset: Rely on new UTF-8 pipeline for mb_substr() fallback.
The existing polyfill for `mb_substr()` contains a number of issues leaving plenty of opportunity for improvement. Specifically, the following are all deficiencies: it relies on Unicode PCRE support, assumes input strings are valid UTF-8, splits input strings into an array of characters (1,000 at a time, iterating until complete), and re-joins them at the end. This patch provides an updated polyfill which will reliably parse UTF-8 strings even in the presence of invalid bytes. It computes boundaries for the substring extraction with zero allocations and then returns a single `substr()` call at the end. This change improves the reliability of UTF-8 string handling and removes behavioral variability based on the runtime system. Developed in #9829. Discussed in https://core.trac.wordpress.org/ticket/63863. See #63863. git-svn-id: https://develop.svn.wordpress.org/trunk@60969 602fd350-edb4-49c9-b593-d223f7449a82
1 parent e37c4b5 commit 8ec91a4

File tree

3 files changed

+130
-144
lines changed

3 files changed

+130
-144
lines changed

src/wp-includes/compat-utf8.php

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,48 @@ function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $ma
338338
return $count;
339339
}
340340

341+
/**
 * Measures how many bytes a run of code points occupies inside a string.
 *
 * Starting at a byte offset, walks forward over at most `$max_code_points`
 * code points (or until the end of the string) and reports the number of
 * bytes that were covered. A span of invalid bytes is counted as exactly one
 * code point, following the maximal subpart rule. This serves as a fallback for
 * `strlen( mb_substr( substr( $text, $at ), 0, $max_code_points ) )`.
 *
 * @since 6.9.0
 * @access private
 *
 * @param string $text              Text in which to measure the span.
 * @param int    $byte_offset       Byte offset at which measuring begins.
 * @param int    $max_code_points   Upper bound on the number of code points to cover.
 * @param ?int   $found_code_points Optional. Receives the number of code points actually
 *                                  covered, which is smaller than the maximum when the
 *                                  string ends first.
 * @return int Number of bytes covered by the counted code points.
 */
function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_points, ?int &$found_code_points = 0 ): int {
	$span_start        = $byte_offset;
	$text_length       = strlen( $text );
	$invalid_length    = 0;
	$found_code_points = 0;

	while ( $byte_offset < $text_length && $found_code_points < $max_code_points ) {
		/*
		 * Ask the scanner for no more than the code points still missing.
		 * Presumably `_wp_scan_utf8()` advances `$byte_offset` and reports any
		 * invalid span through `$invalid_length` by reference; confirm against
		 * its definition.
		 */
		$remaining          = $max_code_points - $found_code_points;
		$scanned            = _wp_scan_utf8( $text, $byte_offset, $invalid_length, null, $remaining );
		$found_code_points += $scanned;

		// Nothing invalid was reported, or the quota is already filled: re-check the loop condition.
		if ( 0 === $invalid_length || $found_code_points >= $max_code_points ) {
			continue;
		}

		// However long the invalid span is, it contributes a single code point.
		$byte_offset += $invalid_length;
		++$found_code_points;
	}

	return $byte_offset - $span_start;
}
382+
341383
/**
342384
* Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
343385
* with the deprecated function from the PHP standard library.

src/wp-includes/compat.php

Lines changed: 56 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -33,44 +33,42 @@ function _( $message ) {
3333
*
3434
* @ignore
3535
* @since 4.2.2
36+
* @since 6.9.0 Deprecated the `$set` argument.
3637
* @access private
3738
*
38-
* @param bool $set - Used for testing only
39-
* null : default - get PCRE/u capability
40-
* false : Used for testing - return false for future calls to this function
41-
* 'reset': Used for testing - restore default behavior of this function
39+
* @param bool $set Deprecated. This argument is no longer used for testing purposes.
4240
*/
4341
function _wp_can_use_pcre_u( $set = null ) {
	// Result of the probe, remembered for the rest of the request once computed.
	static $utf8_pcre = null;

	// The argument no longer influences the result; passing it only raises a deprecation notice.
	if ( null !== $set ) {
		_deprecated_argument( __FUNCTION__, '6.9.0' );
	}

	if ( null === $utf8_pcre ) {
		$is_supported = true;

		// Swallow only warnings raised by preg_match(); let every other warning pass through.
		$on_warning = function ( $errno, $errstr ) use ( &$is_supported ) {
			if ( ! str_starts_with( $errstr, 'preg_match():' ) ) {
				return false;
			}

			$is_supported = false;
			return true;
		};

		set_error_handler( $on_warning, E_WARNING );

		/*
		 * Compiling a pattern with the PCRE_UTF8 flag triggers a warning on
		 * systems lacking Unicode support, which the handler above intercepts.
		 */
		preg_match( '//u', '' );
		restore_error_handler();

		$utf8_pcre = $is_supported;
	}

	return $utf8_pcre;
}
@@ -136,15 +134,15 @@ function mb_substr( $string, $start, $length = null, $encoding = null ) { // php
136134
/**
137135
* Internal compat function to mimic mb_substr().
138136
*
139-
* Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit.
140-
* For `$encoding === UTF-8`, the `$str` input is expected to be a valid UTF-8 byte
141-
* sequence. The behavior of this function for invalid inputs is undefined.
137+
* Only supports UTF-8 and non-shifting single-byte encodings. For all other encodings
138+
* expect the substrings to be misaligned. When the given encoding (or the `blog_charset`
139+
* if none is provided) isn’t UTF-8 then the function returns the output of {@see \substr()}.
142140
*
143141
* @ignore
144142
* @since 3.2.0
145143
*
146144
* @param string $str The string to extract the substring from.
147-
* @param int $start Position to being extraction from in `$str`.
145+
* @param int $start Character offset at which to start the substring extraction.
148146
* @param int|null $length Optional. Maximum number of characters to extract from `$str`.
149147
* Default null.
150148
* @param string|null $encoding Optional. Character encoding to use. Default null.
@@ -155,56 +153,39 @@ function _mb_substr( $str, $start, $length = null, $encoding = null ) {
155153
return '';
156154
}
157155

158-
if ( null === $encoding ) {
159-
$encoding = get_option( 'blog_charset' );
160-
}
161-
162-
/*
163-
* The solution below works only for UTF-8, so in case of a different
164-
* charset just use built-in substr().
165-
*/
166-
if ( ! _is_utf8_charset( $encoding ) ) {
156+
// The solution below works only for UTF-8; treat all other encodings as byte streams.
157+
if ( ! _is_utf8_charset( $encoding ?? get_option( 'blog_charset' ) ) ) {
167158
return is_null( $length ) ? substr( $str, $start ) : substr( $str, $start, $length );
168159
}
169160

170-
if ( _wp_can_use_pcre_u() ) {
171-
// Use the regex unicode support to separate the UTF-8 characters into an array.
172-
preg_match_all( '/./us', $str, $match );
173-
$chars = is_null( $length ) ? array_slice( $match[0], $start ) : array_slice( $match[0], $start, $length );
174-
return implode( '', $chars );
175-
}
161+
$total_length = ( $start < 0 || $length < 0 )
162+
? _wp_utf8_codepoint_count( $str )
163+
: 0;
176164

177-
$regex = '/(
178-
[\x00-\x7F] # single-byte sequences 0xxxxxxx
179-
| [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx
180-
| \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2
181-
| [\xE1-\xEC][\x80-\xBF]{2}
182-
| \xED[\x80-\x9F][\x80-\xBF]
183-
| [\xEE-\xEF][\x80-\xBF]{2}
184-
| \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3
185-
| [\xF1-\xF3][\x80-\xBF]{3}
186-
| \xF4[\x80-\x8F][\x80-\xBF]{2}
187-
)/x';
188-
189-
// Start with 1 element instead of 0 since the first thing we do is pop.
190-
$chars = array( '' );
191-
192-
do {
193-
// We had some string left over from the last round, but we counted it in that last round.
194-
array_pop( $chars );
195-
196-
/*
197-
* Split by UTF-8 character, limit to 1000 characters (last array element will contain
198-
* the rest of the string).
199-
*/
200-
$pieces = preg_split( $regex, $str, 1000, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY );
165+
$normalized_start = $start < 0
166+
? max( 0, $total_length + $start )
167+
: $start;
201168

202-
$chars = array_merge( $chars, $pieces );
169+
/*
170+
* The starting offset is provided as characters, which means this needs to
171+
* find how many bytes that many characters occupy at the start of the string.
172+
*/
173+
$starting_byte_offset = _wp_utf8_codepoint_span( $str, 0, $normalized_start );
174+
175+
$normalized_length = $length < 0
176+
? max( 0, $total_length - $normalized_start + $length )
177+
: $length;
203178

204-
// If there's anything left over, repeat the loop.
205-
} while ( count( $pieces ) > 1 && $str = array_pop( $pieces ) );
179+
/*
180+
* This is the main step. It finds how many bytes the given length of code points
181+
* occupies in the input, starting at the byte offset calculated above.
182+
*/
183+
$byte_length = isset( $normalized_length )
184+
? _wp_utf8_codepoint_span( $str, $starting_byte_offset, $normalized_length )
185+
: ( strlen( $str ) - $starting_byte_offset );
206186

207-
return implode( '', array_slice( $chars, $start, $length ) );
187+
// The result is a normal byte-level substring using the computed ranges.
188+
return substr( $str, $starting_byte_offset, $byte_length );
208189
}
209190

210191
if ( ! function_exists( 'mb_strlen' ) ) :

tests/phpunit/tests/compat/mbSubstr.php

Lines changed: 32 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -13,88 +13,51 @@ class Tests_Compat_mbSubstr extends WP_UnitTestCase {
1313
* Test that mb_substr() is always available (either from PHP or WP).
1414
*/
1515
public function test_mb_substr_availability() {
	// The comparison tests in this class need PHP's own mb_substr(), so require the internal function.
	$internal_functions = get_defined_functions()['internal'];
	$has_native         = in_array( 'mb_substr', $internal_functions, true );

	$this->assertTrue(
		$has_native,
		'Test runner should have `mbstring` extension active but doesn’t.'
	);
}
2521

2622
/**
 * Ensures the polyfill returns the same UTF-8 substring as mb_substr().
 *
 * @dataProvider data_utf8_substrings
 *
 * @param string   $input_string Text to extract the substring from.
 * @param int      $start        Character offset passed to both implementations.
 * @param int|null $length       Character length passed to both implementations.
 */
public function test_mb_substr( $input_string, $start, $length ) {
	$expected = mb_substr( $input_string, $start, $length, 'UTF-8' );
	$actual   = _mb_substr( $input_string, $start, $length, 'UTF-8' );

	$this->assertSame( $expected, $actual );
}
3431

3532
/**
 * Ensures the polyfill returns the same substring as mb_substr() for the `8bit` encoding.
 *
 * @dataProvider data_utf8_substrings
 *
 * @param string   $input_string Text to extract the substring from.
 * @param int      $start        Offset passed to both implementations.
 * @param int|null $length       Length passed to both implementations.
 */
public function test_8bit_mb_substr( $input_string, $start, $length ) {
	$expected = mb_substr( $input_string, $start, $length, '8bit' );
	$actual   = _mb_substr( $input_string, $start, $length, '8bit' );

	$this->assertSame( $expected, $actual );
}
4141

4242
/**
 * Data provider.
 *
 * Every dataset needs a unique name: PHP keeps only the last value assigned to
 * a repeated array key, so datasets sharing a key silently replace each other
 * and never run.
 *
 * NOTE(review): giving the datasets unique names makes cases run that were
 * previously overwritten. Some of them use out-of-range offsets, for which
 * `substr()` is documented to return `false` before PHP 8.0; confirm the
 * `8bit` comparison test still passes on every supported PHP version.
 *
 * @return array[] {
 *     @type string   $0 Input string.
 *     @type int      $1 Start offset.
 *     @type int|null $2 Length, or null to extract through the end of the string.
 * }
 */
public function data_utf8_substrings() {
	return array(
		'баба: start 0, length 3'            => array( 'баба', 0, 3 ),
		'баба: start 0, length -1'           => array( 'баба', 0, -1 ),
		'баба: start 1, length null'         => array( 'баба', 1, null ),
		'баба: start -3, length null'        => array( 'баба', -3, null ),
		'баба: start -3, length 2'           => array( 'баба', -3, 2 ),
		'баба: start -2, length 1'           => array( 'баба', -2, 1 ),
		'баба: start 30, length 1'           => array( 'баба', 30, 1 ),
		'баба: start 15, length -30'         => array( 'баба', 15, -30 ),
		'баба: start -5, length -5'          => array( 'баба', -5, -5 ),
		'баба: start 5, length -3'           => array( 'баба', 5, -3 ),
		'баба: start -3, length 5'           => array( 'баба', -3, 5 ),
		'I am your баба: start 0, length 11' => array( 'I am your баба', 0, 11 ),
	);
}
10063

@@ -103,7 +66,7 @@ public function data_utf8_substrings() {
10366
*/
10467
public function test_mb_substr_phpcore_basic() {
10568
$string_ascii = 'ABCDEF';
106-
$string_mb = base64_decode( '5pel5pys6Kqe44OG44Kt44K544OI44Gn44GZ44CCMDEyMzTvvJXvvJbvvJfvvJjvvJnjgII=' );
69+
$string_mb = '日本語テキストです。0123456789。';
10770

10871
$this->assertSame(
10972
'DEF',
@@ -118,13 +81,13 @@ public function test_mb_substr_phpcore_basic() {
11881

11982
// Specific latin-1 as that is the default the core PHP test operates under.
12083
$this->assertSame(
121-
'peacrOiqng==',
122-
base64_encode( _mb_substr( $string_mb, 2, 7, 'latin-1' ) ),
84+
"\xA5本語",
85+
_mb_substr( $string_mb, 2, 7, 'latin-1' ),
12386
'Substring does not match expected for offset 2, length 7, with latin-1 charset'
12487
);
12588
$this->assertSame(
126-
'6Kqe44OG44Kt44K544OI44Gn44GZ',
127-
base64_encode( _mb_substr( $string_mb, 2, 7, 'utf-8' ) ),
89+
'語テキストです',
90+
_mb_substr( $string_mb, 2, 7, 'utf-8' ),
12891
'Substring does not match expected for offset 2, length 7, with utf-8 charset'
12992
);
13093
}

0 commit comments

Comments
 (0)