Skip to content

Commit cfab276

Browse files
committed
Charset: wp_has_noncharacters() for more-specific Unicode handling.
Noncharacters are code points that are permanently reserved in the Unicode Standard for internal use. They are not recommended for use in open interchange of Unicode text data. However, they are valid code points and will not cause a string to be reported as invalid. Still, HTML and XML both impose semantic rules on their use, and it may be important for code to know whether they are present in a string. This patch introduces a new function, `wp_has_noncharacters()`, which answers this question. See https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612 for the Unicode definition. Developed in #9827. Discussed in https://core.trac.wordpress.org/ticket/63863 on Trac. See #63863. git-svn-id: https://develop.svn.wordpress.org/trunk@61000 602fd350-edb4-49c9-b593-d223f7449a82
1 parent 25420f0 commit cfab276

File tree

3 files changed

+285
-11
lines changed

3 files changed

+285
-11
lines changed

src/wp-includes/compat-utf8.php

Lines changed: 55 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -35,19 +35,21 @@
3535
* @since 6.9.0
3636
* @access private
3737
*
38-
* @param string $bytes UTF-8 encoded string which might include invalid spans of bytes.
39-
* @param int $at Where to start scanning.
40-
* @param int $invalid_length Will be set to how many bytes are to be ignored after `$at`.
41-
* @param int|null $max_bytes Stop scanning after this many bytes have been seen.
42-
* @param int|null $max_code_points Stop scanning after this many code points have been seen.
38+
* @param string $bytes UTF-8 encoded string which might include invalid spans of bytes.
39+
* @param int $at Where to start scanning.
40+
* @param int $invalid_length Will be set to how many bytes are to be ignored after `$at`.
41+
* @param int|null $max_bytes Stop scanning after this many bytes have been seen.
42+
* @param int|null $max_code_points Stop scanning after this many code points have been seen.
43+
* @param bool $has_noncharacters Set to indicate if scanned string contained noncharacters.
4344
* @return int How many code points were successfully scanned.
4445
*/
45-
function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null ): int {
46-
$byte_length = strlen( $bytes );
47-
$end = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
48-
$invalid_length = 0;
49-
$count = 0;
50-
$max_count = $max_code_points ?? PHP_INT_MAX;
46+
function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
47+
$byte_length = strlen( $bytes );
48+
$end = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
49+
$invalid_length = 0;
50+
$count = 0;
51+
$max_count = $max_code_points ?? PHP_INT_MAX;
52+
$has_noncharacters = false;
5153

5254
for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
5355
/*
@@ -145,6 +147,15 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max
145147
) {
146148
++$count;
147149
$i += 2;
150+
151+
// Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF.
152+
if ( 0xEF === $b1 ) {
153+
$has_noncharacters |= (
154+
( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) ||
155+
( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) )
156+
);
157+
}
158+
148159
continue;
149160
}
150161

@@ -162,6 +173,14 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max
162173
) {
163174
++$count;
164175
$i += 3;
176+
177+
// Covers U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF.
178+
$has_noncharacters |= (
179+
( 0x0F === ( $b2 & 0x0F ) ) &&
180+
0xBF === $b3 &&
181+
( 0xBE === $b4 || 0xBF === $b4 )
182+
);
183+
165184
continue;
166185
}
167186

@@ -380,6 +399,31 @@ function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_
380399
return $byte_offset - $was_at;
381400
}
382401

402+
/**
 * Scans a string for Unicode noncharacters without relying on PCRE Unicode support.
 *
 * The text is walked with `_wp_scan_utf8()`, which receives the scan offset, the
 * invalid-span length, and the noncharacter flag by reference. After every scan the
 * offset is moved past the span of bytes the scanner reported as invalid, and the
 * walk stops as soon as a noncharacter is reported or the end of the text is reached.
 *
 * @since 6.9.0
 * @access private
 *
 * @see \wp_has_noncharacters()
 *
 * @param string $text Text to inspect for noncharacters.
 * @return bool Whether noncharacters were found in the string.
 */
function _wp_has_noncharacters_fallback( string $text ): bool {
	$byte_length = strlen( $text );
	$offset      = 0;
	$skip_length = 0;
	$found       = false;

	// The scanner updates `$offset`, `$skip_length`, and `$found` by reference on each pass.
	for ( ; $offset < $byte_length && ! $found; $offset += $skip_length ) {
		_wp_scan_utf8( $text, $offset, $skip_length, null, null, $found );
	}

	return $found;
}
426+
383427
/**
384428
* Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
385429
* with the deprecated function from the PHP standard library.

src/wp-includes/utf8.php

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,3 +133,45 @@ function wp_scrub_utf8( $text ) {
133133
return _wp_scrub_utf8_fallback( $text );
134134
}
135135
endif;
136+
137+
if ( _wp_can_use_pcre_u() ) :
138+
/**
 * Returns whether the given string contains Unicode noncharacters.
 *
 * XML recommends against using noncharacters and HTML forbids their
 * use in attribute names. Unicode recommends that they not be used
 * in open exchange of data.
 *
 * Noncharacters are code points within the following ranges:
 *  - U+FDD0–U+FDEF
 *  - U+FFFE–U+FFFF
 *  - U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF
 *
 * Text containing invalid UTF-8 is still inspected: noncharacters which
 * appear alongside invalid byte sequences are reported, matching the
 * behavior of the non-PCRE fallback implementation.
 *
 * @see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612
 * @see https://www.w3.org/TR/xml/#charsets
 * @see https://html.spec.whatwg.org/#attributes-2
 *
 * @since 6.9.0
 *
 * @param string $text Are there noncharacters in this string?
 * @return bool Whether noncharacters were found in the string.
 */
function wp_has_noncharacters( string $text ): bool {
	$match_result = preg_match(
		'/[\x{FDD0}-\x{FDEF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}\x{10FFFE}\x{10FFFF}]/u',
		$text
	);

	if ( false !== $match_result ) {
		return 1 === $match_result;
	}

	/*
	 * With the `u` flag, `preg_match()` fails outright (returns `false`) when
	 * the subject is not valid UTF-8, which would otherwise be reported as
	 * "no noncharacters" even if some are present next to the invalid bytes.
	 *
	 * Retry with a byte-level pattern that needs no Unicode support. Each
	 * alternative is the complete UTF-8 encoding of a noncharacter and begins
	 * with a lead byte (0xEF–0xF4), which can never be a continuation byte of
	 * a preceding sequence, so a match is always a genuine noncharacter:
	 *
	 *  - EF B7 90..AF     → U+FDD0–U+FDEF
	 *  - EF BF BE|BF      → U+FFFE, U+FFFF
	 *  - F0..F4 xF BF BE|BF → U+nFFFE, U+nFFFF for planes 1 through 16.
	 */
	return 1 === preg_match(
		'/\xEF(?:\xB7[\x90-\xAF]|\xBF[\xBE\xBF])|(?:\xF0[\x9F\xAF\xBF]|[\xF1-\xF3][\x8F\x9F\xAF\xBF]|\xF4\x8F)\xBF[\xBE\xBF]/',
		$text
	);
}
165+
else :
166+
/**
 * Detects noncharacters in a text when the PCRE-based implementation is unavailable.
 *
 * Defers entirely to `_wp_has_noncharacters_fallback()`.
 *
 * @ignore
 * @private
 *
 * @since 6.9.0
 *
 * @param string $text Are there noncharacters in this string?
 * @return bool Whether noncharacters were found in the string.
 */
function wp_has_noncharacters( string $text ): bool {
	$has_noncharacters = _wp_has_noncharacters_fallback( $text );

	return $has_noncharacters;
}
177+
endif;
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
<?php
2+
/**
3+
* Unit tests covering WordPress’ UTF-8 handling: noncharacter detection.
4+
*
5+
* @package WordPress
6+
* @group unicode
7+
*/
8+
9+
class Tests_WpHasNoncharacters extends WP_UnitTestCase {
	/**
	 * Ensures that a noncharacter inside a string will be properly detected.
	 *
	 * @ticket 63863
	 *
	 * @dataProvider data_noncharacters
	 *
	 * @param string $noncharacter Noncharacter as a UTF-8 string.
	 */
	public function test_detects_non_characters( string $noncharacter ) {
		$this->assert_detected_in_every_position( 'wp_has_noncharacters', $noncharacter );
	}

	/**
	 * Ensures that a noncharacter inside a string will be properly detected
	 * using the fallback function when Unicode PCRE support is missing.
	 *
	 * @ticket 63863
	 *
	 * @dataProvider data_noncharacters
	 *
	 * @param string $noncharacter Noncharacter as a UTF-8 string.
	 */
	public function test_fallback_detects_non_characters( string $noncharacter ) {
		$this->assert_detected_in_every_position( '_wp_has_noncharacters_fallback', $noncharacter );
	}

	/**
	 * Ensures that Unicode characters are not falsely detected as noncharacters.
	 *
	 * @ticket 63863
	 */
	public function test_avoids_false_positives() {
		$this->assert_no_false_positives( 'wp_has_noncharacters' );
	}

	/**
	 * Ensures that Unicode characters are not falsely detected as noncharacters
	 * using the fallback function when Unicode PCRE support is missing.
	 *
	 * @ticket 63863
	 */
	public function test_fallback_avoids_false_positives() {
		$this->assert_no_false_positives( '_wp_has_noncharacters_fallback' );
	}

	/**
	 * Data provider.
	 *
	 * Yields one dataset per Unicode noncharacter, keyed by its `U+XXXX` name:
	 * U+FDD0–U+FDEF, U+FFFE, U+FFFF, and the last two code points of each of
	 * the planes 1 through 16.
	 *
	 * @return Generator Datasets of the form `array( string $noncharacter )`.
	 */
	public static function data_noncharacters() {
		for ( $code_point = 0xFDD0; $code_point <= 0xFDEF; $code_point++ ) {
			// The encoding is explicit so the data never depends on the mbstring internal encoding.
			yield sprintf( 'U+%04X', $code_point ) => array( mb_chr( $code_point, 'UTF-8' ) );
		}

		yield 'U+FFFE' => array( "\u{FFFE}" );
		yield 'U+FFFF' => array( "\u{FFFF}" );

		for ( $plane = 0x10000; $plane <= 0x10FFFF; $plane += 0x10000 ) {
			foreach ( array( 0xFFFE, 0xFFFF ) as $plane_offset ) {
				$code_point = $plane + $plane_offset;

				yield sprintf( 'U+%04X', $code_point ) => array( mb_chr( $code_point, 'UTF-8' ) );
			}
		}
	}

	/**
	 * Asserts that a detector finds a noncharacter wherever it sits in a string:
	 * alone, as a prefix, in the middle, and as a suffix.
	 *
	 * @param callable $detector     Function under test; receives a string, returns a bool.
	 * @param string   $noncharacter Noncharacter as a UTF-8 string.
	 */
	private function assert_detected_in_every_position( callable $detector, string $noncharacter ) {
		$this->assertTrue(
			$detector( $noncharacter ),
			'Failed to detect entire string as noncharacter.'
		);

		$this->assertTrue(
			$detector( "{$noncharacter} and more." ),
			'Failed to detect noncharacter prefix.'
		);

		$this->assertTrue(
			$detector( "Some text and then a {$noncharacter} and more." ),
			'Failed to detect medial noncharacter.'
		);

		$this->assertTrue(
			$detector( "Some text and a {$noncharacter}." ),
			'Failed to detect noncharacter suffix.'
		);
	}

	/**
	 * Asserts that a detector reports exactly the noncharacters, and nothing else,
	 * across every Unicode scalar value from U+0000 through U+10FFFF.
	 *
	 * @param callable $detector Function under test; receives a string, returns a bool.
	 */
	private function assert_no_false_positives( callable $detector ) {
		// Lookup set of every noncharacter, keyed by its UTF-8 encoding.
		$noncharacters = array_fill_keys(
			array_column( iterator_to_array( self::data_noncharacters(), false ), 0 ),
			true
		);

		$this->assertFalse(
			$detector( "\x00" ),
			'Falsely detected noncharacter in U+0000'
		);

		for ( $code_point = 1; $code_point <= 0x10FFFF; $code_point++ ) {
			// Surrogate halves are invalid UTF-8.
			if ( $code_point >= 0xD800 && $code_point <= 0xDFFF ) {
				continue;
			}

			$char     = mb_chr( $code_point, 'UTF-8' );
			$hex_char = sprintf( '%04X', $code_point );

			if ( isset( $noncharacters[ $char ] ) ) {
				$this->assertTrue(
					$detector( $char ),
					"Failed to detect noncharacter as test verification for U+{$hex_char}"
				);
			} else {
				$this->assertFalse(
					$detector( $char ),
					"Falsely detected noncharacter in U+{$hex_char}."
				);
			}
		}
	}
}

0 commit comments

Comments
 (0)