@@ -992,6 +992,148 @@ function _wp_specialchars( $text, $quote_style = ENT_NOQUOTES, $charset = false,
992992 return $ text ;
993993}
994994
/**
 * Normalize the escaping for content within an HTML string.
 *
 * Walks every ampersand in the input and rewrites what follows it into a single
 * predictable form:
 *
 *  - An ampersand which does not start a character reference becomes `&amp;`.
 *  - References to plain, visible ASCII characters (letters, digits, and a small
 *    set of punctuation) are replaced by the character itself.
 *  - Numeric references to HTML syntax characters (`&`, `"`, `'`, `<`, `>`)
 *    are replaced by their named form.
 *  - Named references missing their trailing semicolon gain one.
 *  - Remaining numeric references lose leading zeros, gain a trailing semicolon,
 *    and, when hexadecimal, use a lowercase `x` and lowercase digits.
 *
 * Finally, any literal `<`, `>`, `"`, and `'` characters are escaped.
 *
 * @since {WP_VERSION}
 *
 * @param string $context "attribute" for strings comprising a full HTML attribute value,
 *                        or "data" for text nodes. Passed through unchanged to
 *                        WP_HTML_Decoder::read_character_reference().
 * @param string $text    String containing HTML-escaped or escapable content, in UTF-8.
 * @return string Version of input where all appropriate characters and escapes
 *                are standard and predictable.
 */
function wp_normalize_escaped_html_text( string $context, string $text ): string {
	// Characters which are best left visible to the human mind instead of being escaped.
	$visible_characters = ',%()0123456789:[]ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz{}';

	// Named forms for syntax characters which arrived as numeric character references.
	$named_syntax = array(
		'&' => '&amp;',
		'"' => '&quot;',
		"'" => '&apos;',
		'<' => '&lt;',
		'>' => '&gt;',
	);

	$normalized   = array();
	$end          = strlen( $text );
	$at           = 0; // Where to resume searching for the next ampersand.
	$was_at       = 0; // Start of the input which hasn't yet been copied into $normalized.
	$token_length = 0; // Byte length of the matched reference; set by the decoder (by reference).

	while ( $at < $end ) {
		$reference_at = strpos( $text, '&', $at );
		if ( false === $reference_at ) {
			break;
		}

		$character_reference = WP_HTML_Decoder::read_character_reference( $context, $text, $reference_at, $token_length );

		/*
		 * Decide what should replace the span starting at the ampersand.
		 * A `null` replacement means the span is already normalized and
		 * should be left in place to be copied along with the text around it.
		 */
		if ( ! isset( $character_reference ) ) {
			// This is an un-escaped ampersand character, so encode it.
			$replacement = '&amp;';
			$after       = $reference_at + 1;
		} else {
			$after           = $reference_at + $token_length;
			$original        = substr( $text, $reference_at, $token_length );
			$is_numeric      = '#' === $text[ $reference_at + 1 ];
			$needs_semicolon = ';' !== $text[ $after - 1 ];

			if ( 1 === strspn( $character_reference, $visible_characters ) ) {
				// Some characters are best left visible to the human mind.
				$replacement = $character_reference;
			} elseif ( $is_numeric && 1 === strspn( $character_reference, '&"\'<>' ) ) {
				// Syntax characters written numerically are given their named form.
				$replacement = strtr( $character_reference, $named_syntax );
			} elseif ( ! $is_numeric ) {
				/*
				 * Named character references have only a single form and are
				 * case sensitive, so the only fix available is to add the
				 * missing semicolon.
				 */
				$replacement = $needs_semicolon ? "{$original};" : null;
			} else {
				/*
				 * Numeric character references may contain upper or lowercase
				 * hex values and may contain unlimited preceding zeros.
				 */
				$is_hex        = 'x' === $text[ $reference_at + 2 ] || 'X' === $text[ $reference_at + 2 ];
				$digits_at     = $reference_at + ( $is_hex ? 3 : 2 );
				$leading_zeros = strspn( $text, '0', $digits_at );
				$digits_end    = $needs_semicolon ? $after : $after - 1;
				$digit_count   = $digits_end - $digits_at - $leading_zeros;

				/*
				 * A reference made only of zeros must keep one digit, otherwise
				 * the rebuilt text (`&#;` or `&#x;`) would no longer be a
				 * character reference at all.
				 *
				 * NOTE(review): this assumes the decoder accepts all-zero
				 * references such as `&#0;` - confirm against WP_HTML_Decoder.
				 */
				$digits = $digit_count > 0
					? substr( $text, $digits_at + $leading_zeros, $digit_count )
					: '0';

				$canonical = $is_hex
					? '&#x' . strtolower( $digits ) . ';'
					: "&#{$digits};";

				// Nothing to do for already-normalized numeric character references.
				$replacement = $canonical === $original ? null : $canonical;
			}
		}

		$at = $after;
		if ( null === $replacement ) {
			continue;
		}

		// Copy the untouched text leading up to this ampersand, then the replacement.
		$normalized[] = substr( $text, $was_at, $reference_at - $was_at ) . $replacement;
		$was_at       = $after;
	}

	if ( 0 === $was_at ) {
		/*
		 * Nothing was rewritten, which means every ampersand in the input
		 * already started a normalized character reference.
		 */
		$normalized_text = $text;
	} else {
		$normalized[]    = substr( $text, $was_at, $end - $was_at );
		$normalized_text = implode( '', $normalized );
	}

	/*
	 * Stray ampersand "&" characters have already been replaced above,
	 * so it's inappropriate to replace them again here, as all remaining
	 * instances should be part of a normalized character reference.
	 */
	return strtr(
		$normalized_text,
		array(
			'<' => '&lt;',
			'>' => '&gt;',
			'"' => '&quot;',
			"'" => '&apos;',
		)
	);
}

1136+
9951137/**
9961138 * Converts a number of HTML entities into their special characters.
9971139 *
0 commit comments