WordPress
diff --git a/‎src/wp-admin/includes/export.php‎
Lines changed: 1 addition & 1 deletion b/‎src/wp-admin/includes/export.php‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/wp-admin/includes/image.php‎
Lines changed: 2 additions & 2 deletions b/‎src/wp-admin/includes/image.php‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/wp-includes/formatting.php‎
Lines changed: 177 additions & 3 deletions b/‎src/wp-includes/formatting.php‎
Lines changed: 177 additions & 3 deletions
diff --git a/‎tests/phpunit/data/unicode/utf8tests/LICENSE‎
Lines changed: 21 additions & 0 deletions b/‎tests/phpunit/data/unicode/utf8tests/LICENSE‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎tests/phpunit/data/unicode/utf8tests/README.md‎
Lines changed: 23 additions & 0 deletions b/‎tests/phpunit/data/unicode/utf8tests/README.md‎
Lines changed: 23 additions & 0 deletions
@@ -243,7 +243,7 @@ function export_wp( $args = array() ) {
 	 * @return string
 	 */
 	function wxr_cdata( $str ) {
-		if ( ! seems_utf8( $str ) ) {
+		if ( ! wp_is_valid_utf8( $str ) ) {
 			$str = utf8_encode( $str );
 		}
 		// $str = ent2ncr(esc_html($str));
 
@@ -1039,13 +1039,13 @@ function wp_read_image_metadata( $file ) {
 	}
 
 	foreach ( array( 'title', 'caption', 'credit', 'copyright', 'camera', 'iso' ) as $key ) {
-		if ( $meta[ $key ] && ! seems_utf8( $meta[ $key ] ) ) {
+		if ( $meta[ $key ] && ! wp_is_valid_utf8( $meta[ $key ] ) ) {
 			$meta[ $key ] = utf8_encode( $meta[ $key ] );
 		}
 	}
 
 	foreach ( $meta['keywords'] as $key => $keyword ) {
-		if ( ! seems_utf8( $keyword ) ) {
+		if ( ! wp_is_valid_utf8( $keyword ) ) {
 			$meta['keywords'][ $key ] = utf8_encode( $keyword );
 		}
 	}
 
@@ -876,11 +876,14 @@ function shortcode_unautop( $text ) {
  *
  * @author bmorel at ssi dot fr (modified)
  * @since 1.2.1
+ * @deprecated 6.9.0 Use {@see wp_is_valid_utf8()} instead.
  *
  * @param string $str The string to be checked.
  * @return bool True if $str fits a UTF-8 model, false otherwise.
  */
 function seems_utf8( $str ) {
+	_deprecated_function( __FUNCTION__, '6.9.0', 'wp_is_valid_utf8()' );
+
 	mbstring_binary_safe_encoding();
 	$length = strlen( $str );
 	reset_mbstring_encoding();
@@ -914,6 +917,177 @@ function seems_utf8( $str ) {
 	return true;
 }
 
+/**
+ * Determines if a given byte string represents a valid UTF-8 encoding.
+ *
+ * Note that it’s unlikely for non-UTF-8 data to validate as UTF-8, but
+ * it is still possible. Many texts are simultaneously valid UTF-8,
+ * valid US-ASCII, and valid ISO-8859-1 (`latin1`).
+ *
+ * Example:
+ *
+ *     true === wp_is_valid_utf8( '' );
+ *     true === wp_is_valid_utf8( 'just a test' );
+ *     true === wp_is_valid_utf8( "\xE2\x9C\x8F" );    // Pencil, U+270F.
+ *     true === wp_is_valid_utf8( "\u{270F}" );        // Pencil, U+270F.
+ *     true === wp_is_valid_utf8( '✏' );              // Pencil, U+270F.
+ *
+ *     false === wp_is_valid_utf8( "just \xC0 test" ); // Invalid bytes.
+ *     false === wp_is_valid_utf8( "\xE2\x9C" );       // Invalid/incomplete sequences.
+ *     false === wp_is_valid_utf8( "\xC1\xBF" );       // Overlong sequences.
+ *     false === wp_is_valid_utf8( "\xED\xB0\x80" );   // Surrogate halves.
+ *     false === wp_is_valid_utf8( "B\xFCch" );        // ISO-8859-1 high-bytes.
+ *                                                     // E.g. The “ü” in ISO-8859-1 is a single byte 0xFC,
+ *                                                     // but in UTF-8 is the two-byte sequence 0xC3 0xBC.
+ *
+ * @see _wp_is_valid_utf8_fallback
+ *
+ * @since 6.9.0
+ *
+ * @param string $bytes String which might contain text encoded as UTF-8.
+ * @return bool Whether the provided bytes can decode as valid UTF-8.
+ */
+function wp_is_valid_utf8( string $bytes ): bool {
+	/*
+	 * Since PHP 8.3.0 the UTF-8 validity is cached internally
+	 * on string objects, making this a direct property lookup.
+	 *
+	 * This is to be preferred exclusively once PHP 8.3.0 is
+	 * the minimum supported version, because even when the
+	 * status isn’t cached, it uses highly-optimized code to
+	 * validate the byte stream.
+	 */
+	return function_exists( 'mb_check_encoding' )
+		? mb_check_encoding( $bytes, 'UTF-8' )
+		: _wp_is_valid_utf8_fallback( $bytes );
+}
+
+/**
+ * Fallback mechanism for safely validating UTF-8 bytes.
+ *
+ * By implementing a raw method here the code will behave in the same way on
+ * all installed systems, regardless of what extensions are installed.
+ *
+ * @see wp_is_valid_utf8
+ *
+ * @since 6.9.0
+ * @access private
+ *
+ * @param string $bytes String which might contain text encoded as UTF-8.
+ * @return bool Whether the provided bytes can decode as valid UTF-8.
+ */
+function _wp_is_valid_utf8_fallback( string $bytes ): bool {
+	$end = strlen( $bytes );
+
+	for ( $i = 0; $i < $end; $i++ ) {
+		/*
+		 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
+		 *
+		 * This optimization step improves the speed by 10x to 100x
+		 * depending on whether the JIT has optimized the function.
+		 */
+		$i += strspn(
+			$bytes,
+			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
+			"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
+			" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f",
+			$i
+		);
+		if ( $i >= $end ) {
+			break;
+		}
+
+		/**
+		 * The above fast-track handled all single-byte UTF-8 characters. What
+		 * follows MUST be a multibyte sequence otherwise there’s invalid UTF-8.
+		 *
+		 * Therefore everything past here is checking those multibyte sequences.
+		 * Because it’s possible that there are truncated characters, the use of
+		 * the null-coalescing operator with "\xC0" is a convenience for skipping
+		 * length checks on every continuation byte. This works because 0xC0 is
+		 * always invalid in a UTF-8 string, meaning that if the string has been
+		 * truncated, it will find 0xC0 and reject as invalid UTF-8.
+		 *
+		 * > [The following table] lists all of the byte sequences that are well-formed
+		 * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
+		 * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
+		 * > outside of the ranges listed is ill-formed.
+		 *
+		 * > Table 3-7. Well-Formed UTF-8 Byte Sequences
+		 *  ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮
+		 *  │ Code Points         │ First Byte │ Second Byte  │ Third Byte  │ Fourth Byte  │
+		 *  ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤
+		 *  │ U+0000..U+007F      │ 00..7F     │              │             │              │
+		 *  │ U+0080..U+07FF      │ C2..DF     │ 80..BF       │             │              │
+		 *  │ U+0800..U+0FFF      │ E0         │ A0..BF       │ 80..BF      │              │
+		 *  │ U+1000..U+CFFF      │ E1..EC     │ 80..BF       │ 80..BF      │              │
+		 *  │ U+D000..U+D7FF      │ ED         │ 80..9F       │ 80..BF      │              │
+		 *  │ U+E000..U+FFFF      │ EE..EF     │ 80..BF       │ 80..BF      │              │
+		 *  │ U+10000..U+3FFFF    │ F0         │ 90..BF       │ 80..BF      │ 80..BF       │
+		 *  │ U+40000..U+FFFFF    │ F1..F3     │ 80..BF       │ 80..BF      │ 80..BF       │
+		 *  │ U+100000..U+10FFFF  │ F4         │ 80..8F       │ 80..BF      │ 80..BF       │
+		 *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
+		 *
+		 * Notice that all valid third and fourth bytes are in the range 80..BF. This
+		 * validator takes advantage of that to only check the range of those bytes once.
+		 *
+		 * @see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
+		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
+		 */
+
+		$b1 = ord( $bytes[ $i ] );
+		$b2 = ord( $bytes[ $i + 1 ] ?? "\xC0" );
+
+		// Valid two-byte code points.
+
+		if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
+			$i++;
+			continue;
+		}
+
+		$b3 = ord( $bytes[ $i + 2 ] ?? "\xC0" );
+
+		// Valid three-byte code points.
+
+		if ( $b3 < 0x80 || $b3 > 0xBF ) {
+			return false;
+		}
+
+		if (
+			( 0xE0 === $b1 && $b2 >= 0xA0 && $b2 <= 0xBF ) ||
+			( $b1 >= 0xE1 && $b1 <= 0xEC && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+			( 0xED === $b1 && $b2 >= 0x80 && $b2 <= 0x9F ) ||
+			( $b1 >= 0xEE && $b1 <= 0xEF && $b2 >= 0x80 && $b2 <= 0xBF )
+		) {
+			$i += 2;
+			continue;
+		}
+
+		$b4 = ord( $bytes[ $i + 3 ] ?? "\xC0" );
+
+		// Valid four-byte code points.
+
+		if ( $b4 < 0x80 || $b4 > 0xBF ) {
+			return false;
+		}
+
+		if (
+			( 0xF0 === $b1 && $b2 >= 0x90 && $b2 <= 0xBF ) ||
+			( $b1 >= 0xF1 && $b1 <= 0xF3 && $b2 >= 0x80 && $b2 <= 0xBF ) ||
+			( 0xF4 === $b1 && $b2 >= 0x80 && $b2 <= 0x8F )
+		) {
+			$i += 3;
+			continue;
+		}
+
+		// Any other sequence is invalid.
+		return false;
+	}
+
+	// Reaching the end implies validating every byte.
+	return true;
+}
+
 /**
  * Converts a number of special characters into their HTML entities.
  *
@@ -1597,7 +1771,7 @@ function remove_accents( $text, $locale = '' ) {
 		return $text;
 	}
 
-	if ( seems_utf8( $text ) ) {
+	if ( wp_is_valid_utf8( $text ) ) {
 
 		/*
 		 * Unicode sequence normalization from NFD (Normalization Form Decomposed)
@@ -2028,7 +2202,7 @@ function sanitize_file_name( $filename ) {
 		$utf8_pcre = @preg_match( '/^./u', 'a' );
 	}
 
-	if ( ! seems_utf8( $filename ) ) {
+	if ( ! wp_is_valid_utf8( $filename ) ) {
 		$_ext     = pathinfo( $filename, PATHINFO_EXTENSION );
 		$_name    = pathinfo( $filename, PATHINFO_FILENAME );
 		$filename = sanitize_title_with_dashes( $_name ) . '.' . $_ext;
@@ -2277,7 +2451,7 @@ function sanitize_title_with_dashes( $title, $raw_title = '', $context = 'displa
 	// Restore octets.
 	$title = preg_replace( '|---([a-fA-F0-9][a-fA-F0-9])---|', '%$1', $title );
 
-	if ( seems_utf8( $title ) ) {
+	if ( wp_is_valid_utf8( $title ) ) {
 		if ( function_exists( 'mb_strtolower' ) ) {
 			$title = mb_strtolower( $title, 'UTF-8' );
 		}
 
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2021 flenniken
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
@@ -0,0 +1,23 @@
+# utf8tests
+
+This directory contains a third-party test suite used for testing UTF-8 functionality.
+It primarily provides a set of tests containing various valid and invalid UTF-8 byte sequences.
+
+`utf8tests` can be found on GitHub at [flenniken/utf8tests](https://github.com/flenniken/utf8tests/).
+
+The necessary files have been copied to this directory:
+
+- `LICENSE`
+- `utf8tests.txt`
+
+The version of these files was taken from the git commit with
+SHA [`52cbdf830f3603047036070b086a1e5196df94d1`](https://github.com/flenniken/utf8tests/blob/52cbdf830f3603047036070b086a1e5196df94d1).
+
+## Updating
+
+If there have been changes to the `utf8tests` repository, this test suite can be updated. In
+order to update:
+
+1. Check out the latest version of the git repository mentioned above.
+1. Copy the files listed above into this directory.
+1. Update the SHA mentioned in this README file with the new `utf8tests` SHA.
Original file line number	Diff line number	Diff line change
`@@ -243,7 +243,7 @@ function export_wp( $args = array() ) {`
`243`	`243`	`* @return string`
`244`	`244`	`*/`
`245`	`245`	`function wxr_cdata( $str ) {`
`246`		`- if ( ! seems_utf8( $str ) ) {`
	`246`	`+ if ( ! wp_is_valid_utf8( $str ) ) {`
`247`	`247`	`$str = utf8_encode( $str );`
`248`	`248`	`}`
`249`	`249`	`// $str = ent2ncr(esc_html($str));`
Original file line number	Diff line number	Diff line change
`@@ -1039,13 +1039,13 @@ function wp_read_image_metadata( $file ) {`
`1039`	`1039`	`}`
`1040`	`1040`
`1041`	`1041`	`foreach ( array( 'title', 'caption', 'credit', 'copyright', 'camera', 'iso' ) as $key ) {`
`1042`		`- if ( $meta[ $key ] && ! seems_utf8( $meta[ $key ] ) ) {`
	`1042`	`+ if ( $meta[ $key ] && ! wp_is_valid_utf8( $meta[ $key ] ) ) {`
`1043`	`1043`	`$meta[ $key ] = utf8_encode( $meta[ $key ] );`
`1044`	`1044`	`}`
`1045`	`1045`	`}`
`1046`	`1046`
`1047`	`1047`	`foreach ( $meta['keywords'] as $key => $keyword ) {`
`1048`		`- if ( ! seems_utf8( $keyword ) ) {`
	`1048`	`+ if ( ! wp_is_valid_utf8( $keyword ) ) {`
`1049`	`1049`	`$meta['keywords'][ $key ] = utf8_encode( $keyword );`
`1050`	`1050`	`}`
`1051`	`1051`	`}`