Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 55 additions & 11 deletions src/wp-includes/compat-utf8.php
Original file line number Diff line number Diff line change
Expand Up @@ -35,19 +35,21 @@
* @since 6.9.0
* @access private
*
* @param string $bytes UTF-8 encoded string which might include invalid spans of bytes.
* @param int $at Where to start scanning.
* @param int $invalid_length Will be set to how many bytes are to be ignored after `$at`.
* @param int|null $max_bytes Stop scanning after this many bytes have been seen.
* @param int|null $max_code_points Stop scanning after this many code points have been seen.
* @param string $bytes UTF-8 encoded string which might include invalid spans of bytes.
* @param int $at Where to start scanning.
* @param int $invalid_length Will be set to how many bytes are to be ignored after `$at`.
* @param int|null $max_bytes Stop scanning after this many bytes have been seen.
* @param int|null $max_code_points Stop scanning after this many code points have been seen.
* @param bool $has_noncharacters Set to indicate if scanned string contained noncharacters.
* @return int How many code points were successfully scanned.
*/
function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null ): int {
$byte_length = strlen( $bytes );
$end = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
$invalid_length = 0;
$count = 0;
$max_count = $max_code_points ?? PHP_INT_MAX;
function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
$byte_length = strlen( $bytes );
$end = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
$invalid_length = 0;
$count = 0;
$max_count = $max_code_points ?? PHP_INT_MAX;
$has_noncharacters = false;

for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
/*
Expand Down Expand Up @@ -145,6 +147,15 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max
) {
++$count;
$i += 2;

// Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF.
if ( 0xEF === $b1 ) {
$has_noncharacters |= (
( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) ||
( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) )
);
}

continue;
}

Expand All @@ -162,6 +173,14 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max
) {
++$count;
$i += 3;

// Covers U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF.
$has_noncharacters |= (
( 0x0F === ( $b2 & 0x0F ) ) &&
0xBF === $b3 &&
( 0xBE === $b4 || 0xBF === $b4 )
);

continue;
}

Expand Down Expand Up @@ -380,6 +399,31 @@ function _wp_utf8_codepoint_span( string $text, int $byte_offset, int $max_code_
return $byte_offset - $was_at;
}

/**
 * Pure-PHP check for Unicode noncharacters inside a string.
 *
 * Repeatedly hands the text to the UTF-8 scanner, resuming after every
 * span of invalid bytes the scanner reports, until either a noncharacter
 * has been flagged or the end of the text has been reached.
 *
 * @since 6.9.0
 * @access private
 *
 * @see \wp_has_noncharacters()
 *
 * @param string $text Are there noncharacters in this string?
 * @return bool Whether noncharacters were found in the string.
 */
function _wp_has_noncharacters_fallback( string $text ): bool {
	$byte_count = strlen( $text );
	$offset     = 0;
	$skip       = 0;
	$found      = false;

	while ( ! $found && $offset < $byte_count ) {
		// The scanner advances `$offset` and reports how many invalid bytes follow it.
		_wp_scan_utf8( $text, $offset, $skip, null, null, $found );

		// Step over the invalid span so the next pass resumes after it.
		$offset += $skip;
	}

	return $found;
}

/**
* Converts a string from ISO-8859-1 to UTF-8, maintaining backwards compatibility
* with the deprecated function from the PHP standard library.
Expand Down
42 changes: 42 additions & 0 deletions src/wp-includes/utf8.php
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,45 @@ function wp_scrub_utf8( $text ) {
return _wp_scrub_utf8_fallback( $text );
}
endif;

if ( _wp_can_use_pcre_u() ) :
/**
 * Returns whether the given string contains Unicode noncharacters.
 *
 * XML recommends against using noncharacters and HTML forbids their
 * use in attribute names. Unicode recommends that they not be used
 * in open exchange of data.
 *
 * Noncharacters are code points within the following ranges:
 *  - U+FDD0–U+FDEF
 *  - U+FFFE–U+FFFF
 *  - U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF
 *
 * If the PCRE scan itself fails (`preg_match()` returns `false`, which is
 * what happens under the `u` modifier when the subject is not valid UTF-8),
 * the answer is taken from the byte-level fallback scanner instead of being
 * reported as "no noncharacters". This keeps both implementations of this
 * function in agreement for text containing invalid spans of bytes.
 *
 * @see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612
 * @see https://www.w3.org/TR/xml/#charsets
 * @see https://html.spec.whatwg.org/#attributes-2
 * @see \_wp_has_noncharacters_fallback()
 *
 * @since 6.9.0
 *
 * @param string $text Are there noncharacters in this string?
 * @return bool Whether noncharacters were found in the string.
 */
function wp_has_noncharacters( string $text ): bool {
	$match = preg_match(
		'/[\x{FDD0}-\x{FDEF}\x{FFFE}\x{FFFF}\x{1FFFE}\x{1FFFF}\x{2FFFE}\x{2FFFF}\x{3FFFE}\x{3FFFF}\x{4FFFE}\x{4FFFF}\x{5FFFE}\x{5FFFF}\x{6FFFE}\x{6FFFF}\x{7FFFE}\x{7FFFF}\x{8FFFE}\x{8FFFF}\x{9FFFE}\x{9FFFF}\x{AFFFE}\x{AFFFF}\x{BFFFE}\x{BFFFF}\x{CFFFE}\x{CFFFF}\x{DFFFE}\x{DFFFF}\x{EFFFE}\x{EFFFF}\x{FFFFE}\x{FFFFF}\x{10FFFE}\x{10FFFF}]/u',
		$text
	);

	// A failed match is not a negative result: let the fallback scanner decide.
	if ( false === $match ) {
		return _wp_has_noncharacters_fallback( $text );
	}

	return 1 === $match;
}
else :
/**
 * Fallback function for detecting noncharacters in a text.
 *
 * Declared when the PCRE Unicode check is not available; the work is
 * delegated entirely to the pure-PHP scanner.
 *
 * @ignore
 * @private
 *
 * @since 6.9.0
 *
 * @see \_wp_has_noncharacters_fallback()
 *
 * @param string $text Are there noncharacters in this string?
 * @return bool Whether noncharacters were found in the string.
 */
function wp_has_noncharacters( string $text ): bool {
	$found = _wp_has_noncharacters_fallback( $text );

	return $found;
}
endif;
188 changes: 188 additions & 0 deletions tests/phpunit/tests/unicode/wpHasNoncharacters.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
<?php
/**
* Unit tests covering WordPress’ UTF-8 handling: noncharacter detection.
*
* @package WordPress
* @group unicode
*/

class Tests_WpHasNoncharacters extends WP_UnitTestCase {
	/**
	 * Ensures that a noncharacter inside a string will be properly detected.
	 *
	 * @ticket 63863
	 *
	 * @dataProvider data_noncharacters
	 *
	 * @param string $noncharacter Noncharacter as a UTF-8 string.
	 */
	public function test_detects_non_characters( string $noncharacter ) {
		$this->assert_detected_in_every_position( 'wp_has_noncharacters', $noncharacter );
	}

	/**
	 * Ensures that a noncharacter inside a string will be properly detected
	 * using the fallback function when Unicode PCRE support is missing.
	 *
	 * @ticket 63863
	 *
	 * @dataProvider data_noncharacters
	 *
	 * @param string $noncharacter Noncharacter as a UTF-8 string.
	 */
	public function test_fallback_detects_non_characters( string $noncharacter ) {
		$this->assert_detected_in_every_position( '_wp_has_noncharacters_fallback', $noncharacter );
	}

	/**
	 * Ensures that Unicode characters are not falsely detected as noncharacters.
	 *
	 * @ticket 63863
	 */
	public function test_avoids_false_positives() {
		$this->assert_only_noncharacters_detected( 'wp_has_noncharacters' );
	}

	/**
	 * Ensures that Unicode characters are not falsely detected as noncharacters
	 * using the fallback function when Unicode PCRE support is missing.
	 *
	 * @ticket 63863
	 */
	public function test_fallback_avoids_false_positives() {
		$this->assert_only_noncharacters_detected( '_wp_has_noncharacters_fallback' );
	}

	/**
	 * Data provider.
	 *
	 * Yields every noncharacter: U+FDD0–U+FDEF, U+FFFE, U+FFFF, and the
	 * last two code points of each of the planes 1 through 16.
	 *
	 * @return Generator Datasets keyed by "U+XXXX", each holding the noncharacter as a UTF-8 string.
	 */
	public static function data_noncharacters() {
		for ( $code_point = 0xFDD0; $code_point <= 0xFDEF; $code_point++ ) {
			yield sprintf( 'U+%04X', $code_point ) => array( mb_chr( $code_point, 'UTF-8' ) );
		}

		yield 'U+FFFE' => array( "\u{FFFE}" );
		yield 'U+FFFF' => array( "\u{FFFF}" );

		for ( $plane = 0x10000; $plane <= 0x10FFFF; $plane += 0x10000 ) {
			foreach ( array( 0xFFFE, 0xFFFF ) as $plane_offset ) {
				$code_point = $plane + $plane_offset;
				yield sprintf( 'U+%04X', $code_point ) => array( mb_chr( $code_point, 'UTF-8' ) );
			}
		}
	}

	/**
	 * Asserts that the detector finds a noncharacter when it forms the whole
	 * string, and when it appears as a prefix, in the middle, and as a suffix.
	 *
	 * @param callable $detector     Function under test; receives a string and returns a bool.
	 * @param string   $noncharacter Noncharacter as a UTF-8 string.
	 */
	private function assert_detected_in_every_position( callable $detector, string $noncharacter ) {
		$this->assertTrue(
			$detector( $noncharacter ),
			'Failed to detect entire string as noncharacter.'
		);

		$this->assertTrue(
			$detector( "{$noncharacter} and more." ),
			'Failed to detect noncharacter prefix.'
		);

		$this->assertTrue(
			$detector( "Some text and then a {$noncharacter} and more." ),
			'Failed to detect medial noncharacter.'
		);

		$this->assertTrue(
			$detector( "Some text and a {$noncharacter}." ),
			'Failed to detect noncharacter suffix.'
		);
	}

	/**
	 * Walks every Unicode scalar value and asserts that the detector flags
	 * exactly the code points produced by the data provider and nothing else.
	 *
	 * @param callable $detector Function under test; receives a string and returns a bool.
	 */
	private function assert_only_noncharacters_detected( callable $detector ) {
		// Lookup set keyed by the UTF-8 encoding of each noncharacter.
		$noncharacters = array();
		foreach ( self::data_noncharacters() as $dataset ) {
			$noncharacters[ $dataset[0] ] = true;
		}

		for ( $code_point = 0; $code_point <= 0x10FFFF; $code_point++ ) {
			// Surrogate halves are invalid UTF-8.
			if ( $code_point >= 0xD800 && $code_point <= 0xDFFF ) {
				continue;
			}

			// Encoding is passed explicitly so the result does not depend on the mbstring internal encoding.
			$char     = mb_chr( $code_point, 'UTF-8' );
			$hex_char = sprintf( '%04X', $code_point );

			if ( isset( $noncharacters[ $char ] ) ) {
				$this->assertTrue(
					$detector( $char ),
					"Failed to detect noncharacter as test verification for U+{$hex_char}"
				);
			} else {
				$this->assertFalse(
					$detector( $char ),
					"Falsely detected noncharacter in U+{$hex_char}."
				);
			}
		}
	}
}
Loading