Charset: Track detection of non-characters when scanning UTF-8.

dmsnell · dmsnell · commit 4975f202232e · 2025-10-18T14:38:40.000-07:00
Noncharacters are code points that are permanently reserved in the Unicode Standard for internal use. They are not recommended for use in open interchange of Unicode text data. However, they are valid code points and will not cause a string to return as invalid. Still, HTML and XML both impose semantic rules on their use and it may be important for code to know whether they are present in a string. This patch introduces a new function, `wp_has_noncharacters()`, which answers this question. This is accomplished through an inline check with the fallback UTF-8 scanner. There are 66 noncharacters, making it difficult to find them properly with common string search functionality. While the inline check adds overhead to the scanning process, the rare occurrence of noncharacters should lead to minimal actual overhead due to strong branch prediction. See https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612
diff --git a/src/wp-includes/compat-utf8.php b/src/wp-includes/compat-utf8.php
@@ -35,19 +35,21 @@
  * @since 6.9.0
  * @access private
  *
- * @param string   $bytes           UTF-8 encoded string which might include invalid spans of bytes.
- * @param int      $at              Where to start scanning.
- * @param int      $invalid_length  Will be set to how many bytes are to be ignored after `$at`.
- * @param int|null $max_bytes       Stop scanning after this many bytes have been seen.
- * @param int|null $max_code_points Stop scanning after this many code points have been seen.
+ * @param string   $bytes             UTF-8 encoded string which might include invalid spans of bytes.
+ * @param int      $at                Where to start scanning.
+ * @param int      $invalid_length    Will be set to how many bytes are to be ignored after `$at`.
+ * @param int|null $max_bytes         Stop scanning after this many bytes have been seen.
+ * @param int|null $max_code_points   Stop scanning after this many code points have been seen.
+ * @param bool     $has_noncharacters Set to indicate if scanned string contained noncharacters.
  * @return int How many code points were successfully scanned.
  */
-function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null ): int {
-	$byte_length    = strlen( $bytes );
-	$end            = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
-	$invalid_length = 0;
-	$count          = 0;
-	$max_count      = $max_code_points ?? PHP_INT_MAX;
+function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max_bytes = null, ?int $max_code_points = null, ?bool &$has_noncharacters = null ): int {
+	$byte_length       = strlen( $bytes );
+	$end               = min( $byte_length, $at + ( $max_bytes ?? PHP_INT_MAX ) );
+	$invalid_length    = 0;
+	$count             = 0;
+	$max_count         = $max_code_points ?? PHP_INT_MAX;
+	$has_noncharacters = false;
 
 	for ( $i = $at; $i < $end && $count <= $max_count; $i++ ) {
 		/*
@@ -145,6 +147,15 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max
 		) {
 			++$count;
 			$i += 2;
+
+			// Covers the range U+FDD0–U+FDEF, U+FFFE, U+FFFF.
+			if ( 0xEF === $b1 ) {
+				$has_noncharacters |= (
+					( 0xB7 === $b2 && $b3 >= 0x90 && $b3 <= 0xAF ) ||
+					( 0xBF === $b2 && ( 0xBE === $b3 || 0xBF === $b3 ) )
+				);
+			}
+
 			continue;
 		}
 
@@ -162,6 +173,14 @@ function _wp_scan_utf8( string $bytes, int &$at, int &$invalid_length, ?int $max
 		) {
 			++$count;
 			$i += 3;
+
+			// Covers U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF.
+			$has_noncharacters |= (
+				( 0x0F === ( $b2 & 0x0F ) ) &&
+				0xBF === $b3 &&
+				( 0xBE === $b4 || 0xBF === $b4 )
+			);
+
 			continue;
 		}
 
diff --git a/src/wp-includes/utf8.php b/src/wp-includes/utf8.php
@@ -133,3 +133,53 @@ function wp_scrub_utf8( $text ) {
 		return _wp_scrub_utf8_fallback( $text );
 	}
 endif;
+
+/**
+ * Returns whether the given string contains Unicode noncharacters.
+ *
+ * XML recommends against using noncharacters and HTML forbids their
+ * use in attribute names. Unicode recommends that they not be used
+ * in open exchange of data.
+ *
+ * Noncharacters are code points within the following ranges:
+ *  - U+FDD0–U+FDEF
+ *  - U+FFFE–U+FFFF
+ *  - U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, …, U+10FFFE, U+10FFFF
+ *
+ * Spans of bytes which the scanner reports as invalid are stepped over
+ * and the search continues after them.
+ *
+ * @since 6.9.0
+ *
+ * @see https://www.unicode.org/versions/Unicode17.0.0/core-spec/chapter-23/#G12612
+ * @see https://www.w3.org/TR/xml/#charsets
+ * @see https://html.spec.whatwg.org/#attributes-2
+ *
+ * @param string $text Are there noncharacters in this string?
+ * @return bool Whether noncharacters were found in the string.
+ */
+function wp_has_noncharacters( string $text ): bool {
+	$at                = 0;
+	$invalid_length    = 0;
+	$has_noncharacters = false;
+	$end               = strlen( $text );
+
+	while ( $at < $end ) {
+		// Updates `$at`, and sets `$invalid_length` to how many bytes are to be ignored after it.
+		_wp_scan_utf8( $text, $at, $invalid_length, null, null, $has_noncharacters );
+
+		/*
+		 * The scanner accumulates its flag with a bitwise OR, which leaves
+		 * an integer behind; return a literal so the declared `bool` return
+		 * type never depends on implicit coercion.
+		 */
+		if ( $has_noncharacters ) {
+			return true;
+		}
+
+		// Step over the ignored bytes and resume scanning after them.
+		$at += $invalid_length;
+	}
+
+	return false;
+}
diff --git a/tests/phpunit/tests/unicode/wpHasNoncharacters.php b/tests/phpunit/tests/unicode/wpHasNoncharacters.php
@@ -0,0 +1,113 @@
+<?php
+/**
+ * Unit tests covering WordPress’ UTF-8 handling: noncharacter detection.
+ *
+ * @package WordPress
+ * @group unicode
+ *
+ * @covers ::wp_has_noncharacters
+ */
+
+class Tests_WpHasNoncharacters extends WP_UnitTestCase {
+	/**
+	 * Ensures that a noncharacter inside a string will be properly detected.
+	 *
+	 * @dataProvider data_noncharacters
+	 *
+	 * @param string $noncharacter A single noncharacter, as yielded by the data provider.
+	 */
+	public function test_detects_non_characters( string $noncharacter ) {
+		$this->assertTrue(
+			wp_has_noncharacters( $noncharacter ),
+			'Failed to detect entire string as noncharacter.'
+		);
+
+		$this->assertTrue(
+			wp_has_noncharacters( "{$noncharacter} and more." ),
+			'Failed to detect noncharacter prefix.'
+		);
+
+		$this->assertTrue(
+			wp_has_noncharacters( "Some text and then a {$noncharacter} and more." ),
+			'Failed to detect medial noncharacter.'
+		);
+
+		$this->assertTrue(
+			wp_has_noncharacters( "Some text and a {$noncharacter}." ),
+			'Failed to detect noncharacter suffix.'
+		);
+	}
+
+	/**
+	 * Ensures that only the noncharacters are reported as noncharacters by
+	 * checking every code point from U+0000 through U+10FFFF, skipping the
+	 * surrogate halves.
+	 */
+	public function test_avoids_false_positives() {
+		// Get all the noncharacters in one long string, each surrounded on both sides by null bytes.
+		$noncharacters = array_column( array_values( iterator_to_array( self::data_noncharacters() ) ), 0 );
+
+		// A classic closure keeps this file parseable before PHP 7.4, where arrow functions do not exist.
+		$noncharacters = implode(
+			"\x00",
+			array_map(
+				static function ( $c ) {
+					return "\x00{$c}";
+				},
+				$noncharacters
+			)
+		) . "\x00";
+
+		$this->assertFalse(
+			wp_has_noncharacters( "\x00" ),
+			'Falsely detected noncharacter in U+0000'
+		);
+
+		for ( $code_point = 1; $code_point <= 0x10FFFF; $code_point++ ) {
+			// Surrogate halves are invalid UTF-8.
+			if ( $code_point >= 0xD800 && $code_point <= 0xDFFF ) {
+				continue;
+			}
+
+			$char     = mb_chr( $code_point );
+			$hex_char = strtoupper( str_pad( dechex( $code_point ), 4, '0', STR_PAD_LEFT ) );
+
+			if ( str_contains( $noncharacters, $char ) ) {
+				$this->assertTrue(
+					wp_has_noncharacters( $char ),
+					"Failed to detect noncharacter as test verification for U+{$hex_char}"
+				);
+			} else {
+				$this->assertFalse(
+					wp_has_noncharacters( $char ),
+					"Falsely detected noncharacter in U+{$hex_char}."
+				);
+			}
+		}
+	}
+
+	/**
+	 * Data provider.
+	 *
+	 * @return Generator Yields one single-element array per noncharacter, keyed by its "U+XXXX" label.
+	 */
+	public static function data_noncharacters() {
+		for ( $code_point = 0xFDD0; $code_point <= 0xFDEF; $code_point++ ) {
+			$hex_char = strtoupper( str_pad( dechex( $code_point ), 4, '0', STR_PAD_LEFT ) );
+			yield "U+{$hex_char}" => array( mb_chr( $code_point ) );
+		}
+
+		yield 'U+FFFE' => array( "\u{FFFE}" );
+		yield 'U+FFFF' => array( "\u{FFFF}" );
+
+		for ( $plane = 0x10000; $plane <= 0x10FFFF; $plane += 0x10000 ) {
+			$code_point = $plane + 0xFFFE;
+			$hex_char   = strtoupper( str_pad( dechex( $code_point ), 4, '0', STR_PAD_LEFT ) );
+			yield "U+{$hex_char}" => array( mb_chr( $code_point ) );
+
+			$code_point = $plane + 0xFFFF;
+			$hex_char   = strtoupper( str_pad( dechex( $code_point ), 4, '0', STR_PAD_LEFT ) );
+			yield "U+{$hex_char}" => array( mb_chr( $code_point ) );
+		}
+	}
+}