@@ -876,11 +876,14 @@ function shortcode_unautop( $text ) {
876876 *
877877 * @author bmorel at ssi dot fr (modified)
878878 * @since 1.2.1
879+ * @deprecated 6.9.0 Use {@see wp_is_valid_utf8()} instead.
879880 *
880881 * @param string $str The string to be checked.
881882 * @return bool True if $str fits a UTF-8 model, false otherwise.
882883 */
883884function seems_utf8 ( $ str ) {
885+ _deprecated_function ( __FUNCTION__ , '6.9.0 ' , 'wp_is_valid_utf8() ' );
886+
884887 mbstring_binary_safe_encoding ();
885888 $ length = strlen ( $ str );
886889 reset_mbstring_encoding ();
@@ -914,6 +917,177 @@ function seems_utf8( $str ) {
914917 return true ;
915918}
916919
/**
 * Determines if a given byte string represents a valid UTF-8 encoding.
 *
 * Note that it’s unlikely for non-UTF-8 data to validate as UTF-8, but
 * it is still possible. Many texts are simultaneously valid UTF-8,
 * valid US-ASCII, and valid ISO-8859-1 (`latin1`).
 *
 * Example:
 *
 *     true === wp_is_valid_utf8( '' );
 *     true === wp_is_valid_utf8( 'just a test' );
 *     true === wp_is_valid_utf8( "\xE2\x9C\x8F" );    // Pencil, U+270F.
 *     true === wp_is_valid_utf8( "\u{270F}" );        // Pencil, U+270F.
 *     true === wp_is_valid_utf8( '✏' );               // Pencil, U+270F.
 *
 *     false === wp_is_valid_utf8( "just \xC0 test" ); // Invalid bytes.
 *     false === wp_is_valid_utf8( "\xE2\x9C" );       // Invalid/incomplete sequences.
 *     false === wp_is_valid_utf8( "\xC1\xBF" );       // Overlong sequences.
 *     false === wp_is_valid_utf8( "\xED\xB0\x80" );   // Surrogate halves.
 *     false === wp_is_valid_utf8( "B\xFCch" );        // ISO-8859-1 high-bytes.
 *                                                     // E.g. The “ü” in ISO-8859-1 is a single byte 0xFC,
 *                                                     // but in UTF-8 is the two-byte sequence 0xC3 0xBC.
 *
 * @see _wp_is_valid_utf8_fallback
 *
 * @since 6.9.0
 *
 * @param string $bytes String which might contain text encoded as UTF-8.
 * @return bool Whether the provided bytes can decode as valid UTF-8.
 */
function wp_is_valid_utf8( string $bytes ): bool {
	/*
	 * Since PHP 8.3.0 the UTF-8 validity is cached internally
	 * on string objects, making this a direct property lookup.
	 *
	 * This is to be preferred exclusively once PHP 8.3.0 is
	 * the minimum supported version, because even when the
	 * status isn’t cached, it uses highly-optimized code to
	 * validate the byte stream.
	 *
	 * The function name and the encoding label must be given exactly,
	 * without surrounding whitespace: a padded name never matches in
	 * `function_exists()` and a padded label is not a known encoding.
	 */
	if ( function_exists( 'mb_check_encoding' ) ) {
		return mb_check_encoding( $bytes, 'UTF-8' );
	}

	// Without the mbstring extension, validate the bytes in pure PHP.
	return _wp_is_valid_utf8_fallback( $bytes );
}
964+
/**
 * Validates UTF-8 bytes in pure PHP, for use when no extension can do it.
 *
 * Because the scan is implemented directly here, the result does not
 * depend on which extensions are installed on the host system.
 *
 * @see wp_is_valid_utf8
 *
 * @since 6.9.0
 * @access private
 *
 * @param string $bytes String which might contain text encoded as UTF-8.
 * @return bool Whether the provided bytes can decode as valid UTF-8.
 */
function _wp_is_valid_utf8_fallback( string $bytes ): bool {
	// Every US-ASCII byte (0x00..0x7F); each one is a complete UTF-8 character.
	$ascii_bytes =
		"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" .
		"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" .
		" !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f";

	$length = strlen( $bytes );
	$at     = 0;

	while ( $at < $length ) {
		/*
		 * Quickly skip past US-ASCII bytes, all of which are valid UTF-8.
		 *
		 * This optimization step improves the speed from 10x to 100x
		 * depending on whether the JIT has optimized the function.
		 */
		$at += strspn( $bytes, $ascii_bytes, $at );
		if ( $at >= $length ) {
			break;
		}

		/*
		 * The fast-track above consumed every single-byte character, so the
		 * byte at this position MUST start a multibyte sequence; anything
		 * else is invalid UTF-8.
		 *
		 * Bytes past the end of the string are read as "\xC0" through the
		 * null-coalescing operator. This avoids a length check on every
		 * continuation byte: 0xC0 is never valid anywhere in UTF-8, so a
		 * truncated sequence is rejected by the ordinary range checks.
		 *
		 * > [The following table] lists all of the byte sequences that are well-formed
		 * > in UTF-8. A range of byte values such as A0..BF indicates that any byte
		 * > from A0 to BF (inclusive) is well-formed in that position. Any byte value
		 * > outside of the ranges listed is ill-formed.
		 *
		 * > Table 3-7. Well-Formed UTF-8 Byte Sequences
		 *  ╭─────────────────────┬────────────┬──────────────┬─────────────┬──────────────╮
		 *  │ Code Points         │ First Byte │ Second Byte  │ Third Byte  │ Fourth Byte  │
		 *  ├─────────────────────┼────────────┼──────────────┼─────────────┼──────────────┤
		 *  │ U+0000..U+007F      │ 00..7F     │              │             │              │
		 *  │ U+0080..U+07FF      │ C2..DF     │ 80..BF       │             │              │
		 *  │ U+0800..U+0FFF      │ E0         │ A0..BF       │ 80..BF      │              │
		 *  │ U+1000..U+CFFF      │ E1..EC     │ 80..BF       │ 80..BF      │              │
		 *  │ U+D000..U+D7FF      │ ED         │ 80..9F       │ 80..BF      │              │
		 *  │ U+E000..U+FFFF      │ EE..EF     │ 80..BF       │ 80..BF      │              │
		 *  │ U+10000..U+3FFFF    │ F0         │ 90..BF       │ 80..BF      │ 80..BF       │
		 *  │ U+40000..U+FFFFF    │ F1..F3     │ 80..BF       │ 80..BF      │ 80..BF       │
		 *  │ U+100000..U+10FFFF  │ F4         │ 80..8F       │ 80..BF      │ 80..BF       │
		 *  ╰─────────────────────┴────────────┴──────────────┴─────────────┴──────────────╯
		 *
		 * Notice that all valid third and fourth bytes are in the range 80..BF,
		 * so only the second byte needs bounds that depend on the first byte.
		 *
		 * @see https://lemire.me/blog/2018/05/09/how-quickly-can-you-check-that-a-string-is-valid-unicode-utf-8/
		 * @see https://www.unicode.org/versions/Unicode16.0.0/core-spec/chapter-3/#G27506
		 */
		$lead = ord( $bytes[ $at ] );

		// Map the first byte onto its row of the table: total length and second-byte bounds.
		if ( $lead >= 0xC2 && $lead <= 0xDF ) {
			$size = 2;
			$low  = 0x80;
			$high = 0xBF;
		} elseif ( 0xE0 === $lead ) {
			$size = 3;
			$low  = 0xA0;
			$high = 0xBF;
		} elseif ( $lead >= 0xE1 && $lead <= 0xEC ) {
			$size = 3;
			$low  = 0x80;
			$high = 0xBF;
		} elseif ( 0xED === $lead ) {
			$size = 3;
			$low  = 0x80;
			$high = 0x9F;
		} elseif ( 0xEE === $lead || 0xEF === $lead ) {
			$size = 3;
			$low  = 0x80;
			$high = 0xBF;
		} elseif ( 0xF0 === $lead ) {
			$size = 4;
			$low  = 0x90;
			$high = 0xBF;
		} elseif ( $lead >= 0xF1 && $lead <= 0xF3 ) {
			$size = 4;
			$low  = 0x80;
			$high = 0xBF;
		} elseif ( 0xF4 === $lead ) {
			$size = 4;
			$low  = 0x80;
			$high = 0x8F;
		} else {
			// Stray continuation bytes, 0xC0, 0xC1, and 0xF5..0xFF never start a sequence.
			return false;
		}

		// The second byte carries the overlong, surrogate, and out-of-range restrictions.
		$second = ord( $bytes[ $at + 1 ] ?? "\xC0" );
		if ( $second < $low || $second > $high ) {
			return false;
		}

		// Any third and fourth bytes must be plain continuation bytes.
		for ( $offset = 2; $offset < $size; $offset++ ) {
			$trailing = ord( $bytes[ $at + $offset ] ?? "\xC0" );
			if ( $trailing < 0x80 || $trailing > 0xBF ) {
				return false;
			}
		}

		$at += $size;
	}

	// Reaching the end implies validating every byte.
	return true;
}
1090+
9171091/**
9181092 * Converts a number of special characters into their HTML entities.
9191093 *
@@ -1597,7 +1771,7 @@ function remove_accents( $text, $locale = '' ) {
15971771 return $ text ;
15981772 }
15991773
1600- if ( seems_utf8 ( $ text ) ) {
1774+ if ( wp_is_valid_utf8 ( $ text ) ) {
16011775
16021776 /*
16031777 * Unicode sequence normalization from NFD (Normalization Form Decomposed)
@@ -2028,7 +2202,7 @@ function sanitize_file_name( $filename ) {
20282202 $ utf8_pcre = @preg_match ( '/^./u ' , 'a ' );
20292203 }
20302204
2031- if ( ! seems_utf8 ( $ filename ) ) {
2205+ if ( ! wp_is_valid_utf8 ( $ filename ) ) {
20322206 $ _ext = pathinfo ( $ filename , PATHINFO_EXTENSION );
20332207 $ _name = pathinfo ( $ filename , PATHINFO_FILENAME );
20342208 $ filename = sanitize_title_with_dashes ( $ _name ) . '. ' . $ _ext ;
@@ -2277,7 +2451,7 @@ function sanitize_title_with_dashes( $title, $raw_title = '', $context = 'displa
22772451 // Restore octets.
22782452 $ title = preg_replace ( '|---([a-fA-F0-9][a-fA-F0-9])---| ' , '%$1 ' , $ title );
22792453
2280- if ( seems_utf8 ( $ title ) ) {
2454+ if ( wp_is_valid_utf8 ( $ title ) ) {
22812455 if ( function_exists ( 'mb_strtolower ' ) ) {
22822456 $ title = mb_strtolower ( $ title , 'UTF-8 ' );
22832457 }
0 commit comments