Skip to content

Commit 8508427

Browse files
committed
Charset: Rely on new UTF-8 pipeline for mb_strlen() fallback.
The existing polyfill for `mb_strlen()` contains a number of issues leaving plenty of opportunity for improvement. Specifically, the following are all deficiencies: it relies on Unicode PCRE support, assumes input strings are valid UTF-8, splits input strings into an array of characters to count them (1,000 at a time, iterating until complete), and entirely gives up when the Unicode support is missing. This patch provides an updated polyfill which will reliably count code points in a UTF-8 string, even in the presence of sequences of invalid bytes. It scans through the input with zero allocations. Additionally, the underlying fallback extends the behavior of `mb_strlen()` to provide character counts for substrings within a larger input without extracting the substring (it can count characters within a byte offset and length of a larger string). This change improves the reliability of UTF-8 string length calculations and removes behavioral variability based on the runtime system. Developed in #9828 Discussed in https://core.trac.wordpress.org/ticket/63863 See #63863. git-svn-id: https://develop.svn.wordpress.org/trunk@60949 602fd350-edb4-49c9-b593-d223f7449a82
1 parent 49d1ded commit 8508427

File tree

4 files changed

+195
-110
lines changed

4 files changed

+195
-110
lines changed

src/wp-includes/compat-utf8.php

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -291,3 +291,49 @@ function _wp_scrub_utf8_fallback( string $bytes ): string {
291291

292292
return $scrubbed;
293293
}
294+
295+
/**
 * Returns how many code points are found in the given UTF-8 string.
 *
 * Invalid spans of bytes count as a single code point according
 * to the maximal subpart rule. This function is a fallback method
 * for calling `mb_strlen( $text, 'UTF-8' )`.
 *
 * When a negative value is provided for the byte offset or the byte
 * length, this will always report zero code points. A `null` byte
 * offset is treated as `0` and a `null` byte length is treated as
 * "scan until the end of the string," matching the declared defaults.
 *
 * Example:
 *
 *     4 === _wp_utf8_codepoint_count( 'text' );
 *
 *     // Groups are 'test', "\x90" as '�', 'wp', "\xE2\x80" as '�', "\xC0" as '�', and 'test'.
 *     13 === _wp_utf8_codepoint_count( "test\x90wp\xE2\x80\xC0test" );
 *
 * @since 6.9.0
 * @access private
 *
 * @param string $text            Count code points in this string.
 * @param ?int   $byte_offset     Optional. Start counting after this many bytes in `$text`.
 *                                Must not be negative. Default 0.
 * @param ?int   $max_byte_length Optional. Stop counting after having scanned past this many bytes.
 *                                Default is to scan until the end of the string. Must not be negative.
 * @return int How many code points were found.
 */
function _wp_utf8_codepoint_count( string $text, ?int $byte_offset = 0, ?int $max_byte_length = PHP_INT_MAX ): int {
	/*
	 * Both parameters are nullable, so map `null` onto the declared defaults.
	 * Left as-is, `min( $n, null )` evaluates to `null` and the loop guard
	 * `0 < null` is false, which would silently report zero code points.
	 */
	$byte_offset     = $byte_offset ?? 0;
	$max_byte_length = $max_byte_length ?? PHP_INT_MAX;

	// Nothing can be counted before the start of the text or within an empty/negative span.
	if ( $byte_offset < 0 || $max_byte_length <= 0 ) {
		return 0;
	}

	$count          = 0;
	$at             = $byte_offset;
	$end            = strlen( $text );
	$invalid_length = 0;

	// Clamp the span to the bytes remaining; this is negative when the offset lies past the end.
	$max_byte_length = min( $end - $at, $max_byte_length );

	while ( $at < $end && ( $at - $byte_offset ) < $max_byte_length ) {
		/*
		 * `_wp_scan_utf8()` is defined elsewhere in this file. NOTE(review): it is
		 * presumed to advance `$at` past the valid bytes it counted and to set
		 * `$invalid_length` to the size of any invalid span it stopped at — confirm
		 * against its definition. The last argument is the remaining byte budget.
		 */
		$count += _wp_scan_utf8( $text, $at, $invalid_length, $max_byte_length - ( $at - $byte_offset ) );

		// An invalid span counts as exactly one code point, then gets skipped.
		$count += $invalid_length > 0 ? 1 : 0;
		$at    += $invalid_length;
	}

	return $count;
}

src/wp-includes/compat.php

Lines changed: 10 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -228,69 +228,23 @@ function mb_strlen( $string, $encoding = null ) { // phpcs:ignore Universal.Nami
228228
/**
229229
* Internal compat function to mimic mb_strlen().
230230
*
231-
* Only understands UTF-8 and 8bit. All other character sets will be treated as 8bit.
232-
* For `$encoding === UTF-8`, the `$str` input is expected to be a valid UTF-8 byte
233-
* sequence. The behavior of this function for invalid inputs is undefined.
231+
* Only supports UTF-8 and non-shifting single-byte encodings. For all other
232+
* encodings expect the counts to be wrong. When the given encoding (or the
233+
* `blog_charset` if none is provided) isn’t UTF-8 then the function returns
234+
* the byte-count of the provided string.
234235
*
235236
* @ignore
236237
* @since 4.2.0
237238
*
238239
* @param string $str The string to retrieve the character length from.
239-
* @param string|null $encoding Optional. Character encoding to use. Default null.
240-
* @return int String length of `$str`.
240+
* @param string|null $encoding Optional. Count characters according to this encoding.
241+
* Default is to consult `blog_charset`.
242+
* @return int Count of code points if UTF-8, byte length otherwise.
241243
*/
242244
function _mb_strlen( $str, $encoding = null ) {
243-
if ( null === $encoding ) {
244-
$encoding = get_option( 'blog_charset' );
245-
}
246-
247-
/*
248-
* The solution below works only for UTF-8, so in case of a different charset
249-
* just use built-in strlen().
250-
*/
251-
if ( ! _is_utf8_charset( $encoding ) ) {
252-
return strlen( $str );
253-
}
254-
255-
if ( _wp_can_use_pcre_u() ) {
256-
// Use the regex unicode support to separate the UTF-8 characters into an array.
257-
preg_match_all( '/./us', $str, $match );
258-
return count( $match[0] );
259-
}
260-
261-
$regex = '/(?:
262-
[\x00-\x7F] # single-byte sequences 0xxxxxxx
263-
| [\xC2-\xDF][\x80-\xBF] # double-byte sequences 110xxxxx 10xxxxxx
264-
| \xE0[\xA0-\xBF][\x80-\xBF] # triple-byte sequences 1110xxxx 10xxxxxx * 2
265-
| [\xE1-\xEC][\x80-\xBF]{2}
266-
| \xED[\x80-\x9F][\x80-\xBF]
267-
| [\xEE-\xEF][\x80-\xBF]{2}
268-
| \xF0[\x90-\xBF][\x80-\xBF]{2} # four-byte sequences 11110xxx 10xxxxxx * 3
269-
| [\xF1-\xF3][\x80-\xBF]{3}
270-
| \xF4[\x80-\x8F][\x80-\xBF]{2}
271-
)/x';
272-
273-
// Start at 1 instead of 0 since the first thing we do is decrement.
274-
$count = 1;
275-
276-
do {
277-
// We had some string left over from the last round, but we counted it in that last round.
278-
--$count;
279-
280-
/*
281-
* Split by UTF-8 character, limit to 1000 characters (last array element will contain
282-
* the rest of the string).
283-
*/
284-
$pieces = preg_split( $regex, $str, 1000 );
285-
286-
// Increment.
287-
$count += count( $pieces );
288-
289-
// If there's anything left over, repeat the loop.
290-
} while ( $str = array_pop( $pieces ) );
291-
292-
// Fencepost: preg_split() always returns one extra item in the array.
293-
return --$count;
245+
return _is_utf8_charset( $encoding ?? get_option( 'blog_charset' ) )
246+
? _wp_utf8_codepoint_count( $str )
247+
: strlen( $str );
294248
}
295249

296250
// sodium_crypto_box() was introduced in PHP 7.2.

tests/phpunit/tests/compat/mbStrlen.php

Lines changed: 32 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -10,82 +10,60 @@
1010
class Tests_Compat_mbStrlen extends WP_UnitTestCase {
1111

1212
/**
13-
* Test that mb_strlen() is always available (either from PHP or WP).
13+
* Test that the native mb_strlen() is available.
1414
*/
1515
public function test_mb_strlen_availability() {
16-
$this->assertTrue( function_exists( 'mb_strlen' ) );
16+
$this->assertTrue(
17+
in_array( 'mb_strlen', get_defined_functions()['internal'], true ),
18+
'Test runner should have `mbstring` extension active but doesn’t.'
19+
);
1720
}
1821

1922
/**
20-
* @dataProvider data_utf8_string_lengths
23+
* @dataProvider data_utf8_strings
2124
*/
22-
public function test_mb_strlen( $input_string, $expected_character_length ) {
23-
$this->assertSame( $expected_character_length, _mb_strlen( $input_string, 'UTF-8' ) );
25+
public function test_mb_strlen( $input_string ) {
26+
$this->assertSame(
27+
mb_strlen( $input_string, 'UTF-8' ),
28+
_mb_strlen( $input_string, 'UTF-8' )
29+
);
2430
}
2531

2632
/**
27-
* @dataProvider data_utf8_string_lengths
33+
* @dataProvider data_utf8_strings
2834
*/
29-
public function test_mb_strlen_via_regex( $input_string, $expected_character_length ) {
30-
_wp_can_use_pcre_u( false );
31-
$this->assertSame( $expected_character_length, _mb_strlen( $input_string, 'UTF-8' ) );
32-
_wp_can_use_pcre_u( 'reset' );
35+
public function test_mb_strlen_via_regex( $input_string ) {
36+
$this->assertSame(
37+
mb_strlen( $input_string, 'UTF-8' ),
38+
_mb_strlen( $input_string, 'UTF-8' )
39+
);
3340
}
3441

3542
/**
36-
* @dataProvider data_utf8_string_lengths
43+
* @dataProvider data_utf8_strings
3744
*/
38-
public function test_8bit_mb_strlen( $input_string, $expected_character_length, $expected_byte_length ) {
39-
$this->assertSame( $expected_byte_length, _mb_strlen( $input_string, '8bit' ) );
45+
public function test_8bit_mb_strlen( $input_string ) {
46+
$this->assertSame(
47+
mb_strlen( $input_string, '8bit' ),
48+
_mb_strlen( $input_string, '8bit' )
49+
);
4050
}
4151

4252
/**
4353
* Data provider.
4454
*
4555
* @return array
4656
*/
47-
public function data_utf8_string_lengths() {
57+
public function data_utf8_strings() {
4858
return array(
49-
array(
50-
'input_string' => 'баба',
51-
'expected_character_length' => 4,
52-
'expected_byte_length' => 8,
53-
),
54-
array(
55-
'input_string' => 'баб',
56-
'expected_character_length' => 3,
57-
'expected_byte_length' => 6,
58-
),
59-
array(
60-
'input_string' => 'I am your б',
61-
'expected_character_length' => 11,
62-
'expected_byte_length' => 12,
63-
),
64-
array(
65-
'input_string' => '1111111111',
66-
'expected_character_length' => 10,
67-
'expected_byte_length' => 10,
68-
),
69-
array(
70-
'input_string' => '²²²²²²²²²²',
71-
'expected_character_length' => 10,
72-
'expected_byte_length' => 20,
73-
),
74-
array(
75-
'input_string' => '3333333333',
76-
'expected_character_length' => 10,
77-
'expected_byte_length' => 30,
78-
),
79-
array(
80-
'input_string' => '𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜',
81-
'expected_character_length' => 10,
82-
'expected_byte_length' => 40,
83-
),
84-
array(
85-
'input_string' => '1²3𝟜1²3𝟜1²3𝟜',
86-
'expected_character_length' => 12,
87-
'expected_byte_length' => 30,
88-
),
59+
array( 'баба' ),
60+
array( 'баб' ),
61+
array( 'I am your б' ),
62+
array( '1111111111' ),
63+
array( '²²²²²²²²²²' ),
64+
array( '3333333333' ),
65+
array( '𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜𝟜' ),
66+
array( '1²3𝟜1²3𝟜1²3𝟜' ),
8967
);
9068
}
9169
}
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
<?php
2+
/**
 * Unit tests covering fallback UTF-8 code-point counting.
 *
 * @package WordPress
 * @subpackage Charset
 *
 * @since 6.9.0
 *
 * @group compat
 *
 * @covers ::_wp_utf8_codepoint_count()
 */
class Tests_Compat_wpUtf8CodePointCount extends WP_UnitTestCase {
	/**
	 * Ensures that there are zero code points reported when starting before the text.
	 *
	 * @ticket 63863
	 */
	public function test_rejects_negative_byte_offsets() {
		$this->assertSame(
			0,
			_wp_utf8_codepoint_count( 'any old text', -5, 3 ),
			'Should have indicated that there are zero code points before the start of the text.'
		);

		// The span [-5, 12) overlaps the whole text, yet a negative start must still yield zero.
		$this->assertSame(
			0,
			_wp_utf8_codepoint_count( 'any old text', -5, 5 + 12 ),
			'Should have found no code points before the start of the text, even if the length overlaps the text.'
		);
	}

	/**
	 * Ensures that there are zero code points reported when scanning a negative length.
	 *
	 * @ticket 63863
	 */
	public function test_rejects_negative_byte_lengths() {
		$this->assertSame(
			0,
			_wp_utf8_codepoint_count( 'any old text', 2, -5 ),
			'Should have indicated that there are zero code points in a span of negative length.'
		);
	}

	/**
	 * Ensures that code points are counted properly across different byte offsets
	 * and lengths, equivalent to counting code points for an equivalent substring.
	 *
	 * The native `mb_strlen()` applied to the extracted substring serves as the oracle.
	 *
	 * @ticket 63863
	 *
	 * @dataProvider data_strings_and_substring_offsets
	 *
	 * @param string $text        Full text in which to count.
	 * @param int    $byte_offset Byte offset at which counting starts.
	 * @param int    $byte_length Number of bytes to scan from the offset.
	 * @return void
	 */
	public function test_counts_within_appropriate_offsets( string $text, int $byte_offset, int $byte_length ) {
		$substring = substr( $text, $byte_offset, $byte_length );

		if (
			! mb_check_encoding( $substring, 'UTF-8' ) &&
			// Miscounting bug fixed by removal of “fast path” php/php-src@cca4ca6d3dda8c2e1c5c1b053550f94b3d6fb6bf
			version_compare( PHP_VERSION, '8.3.0', '<' )
		) {
			$this->markTestSkipped( 'Prior to PHP 8.3.0, mb_strlen() misreported lengths of invalid inputs.' );
		}

		$this->assertSame(
			mb_strlen( $substring, 'UTF-8' ),
			_wp_utf8_codepoint_count( $text, $byte_offset, $byte_length ),
			"Miscounted code points from {$byte_length} bytes starting at {$byte_offset} in '{$text}'"
		);
	}

	/**
	 * Data provider.
	 *
	 * Each dataset is `array( $text, $byte_offset, $byte_length )`.
	 *
	 * @return array[]
	 */
	public static function data_strings_and_substring_offsets() {
		return array(
			array( 'zero length', 0, 0 ),
			array( 'zero length (in middle)', 5, 0 ),
			array( 'full text', 0, 9 ),
			array( 'prefix', 0, 2 ),
			array( 'middle span', 2, 4 ),
			array( 'suffix', 3, 3 ),
			array( 'overlong', 4, 8 ),

			// U+1F170 is four bytes long (F0 9F 85 B0) and starts at byte offset 6.
			array( "emoji \u{1F170} partial", 6, 1 ),
			array( "emoji \u{1F170} partial", 6, 2 ),
			array( "emoji \u{1F170} partial", 6, 3 ),
			array( "emoji \u{1F170} full", 6, 4 ),
			array( "emoji \u{1F170} beyond", 6, 5 ),

			array( "invalid \xF0\x9F before", 8, 5 ),
			array( "invalid \xF0\x9F before", 9, 5 ),
			array( "invalid \x95 whole", 8, 1 ),
			array( "invalid \x95 beyond", 8, 5 ),
			array( "invalid \x85\xB0 after", 8, 4 ),
			array( "invalid \x85\xB0 after", 9, 3 ),
			array( "invalid \x85\xB0\xC0\xF0\x9F subparts", 8, 7 ),
		);
	}
}

0 commit comments

Comments
 (0)