Skip to content

Commit b990a2b

Browse files
committed
Formatting: Introduce normalizing function for escaped HTML.
Like `wp_kses_normalize_entities()` but built for UTF-8 and HTML5 and relying on the HTML API for reliability.
1 parent 6045c24 commit b990a2b

File tree

2 files changed

+200
-0
lines changed

2 files changed

+200
-0
lines changed

src/wp-includes/formatting.php

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -992,6 +992,148 @@ function _wp_specialchars( $text, $quote_style = ENT_NOQUOTES, $charset = false,
992992
return $text;
993993
}
994994

995+
/**
 * Normalize the escaping for content within an HTML string.
 *
 * Scans the text for ampersands and, for each one, asks the HTML API decoder
 * whether it starts a character reference in the given context:
 *
 *  - Ampersands which do not start a character reference become `&amp;`.
 *  - References to a single ASCII letter, digit, or one of `,%():[]{}` are
 *    replaced by the plain character itself.
 *  - Numeric references to `&`, `"`, `'`, `<`, and `>` are replaced by their
 *    named forms (`&amp;`, `&quot;`, `&apos;`, `&lt;`, `&gt;`).
 *  - Named references missing their trailing semicolon gain one.
 *  - Other numeric references are rewritten without redundant leading zeros,
 *    with a lowercase `x` prefix and lowercase digits when hexadecimal, and
 *    with a trailing semicolon.
 *
 * Finally, any raw `<`, `>`, `"`, and `'` remaining in the text are escaped.
 *
 * @since {WP_VERSION}
 *
 * @param string $context "attribute" for strings comprising a full HTML attribute value,
 *                        or "data" for text nodes.
 * @param string $text    String containing HTML-escaped or escapable content, in UTF-8.
 * @return string Version of input where all appropriate characters and escapes
 *                are standard and predictable.
 */
function wp_normalize_escaped_html_text( string $context, string $text ): string {
	$normalized   = array();
	$end          = strlen( $text );
	$at           = 0; // Where scanning resumes.
	$was_at       = 0; // Start of the input span not yet copied into `$normalized`.
	$token_length = 0; // Byte length of the matched reference; set by reference by the decoder.

	while ( $at < $end ) {
		$next_character_reference_at = strpos( $text, '&', $at );
		if ( false === $next_character_reference_at ) {
			break;
		}

		$character_reference = WP_HTML_Decoder::read_character_reference( $context, $text, $next_character_reference_at, $token_length );

		// This is an un-escaped ampersand character, so encode it.
		if ( ! isset( $character_reference ) ) {
			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . '&amp;';
			$at           = $next_character_reference_at + 1;
			$was_at       = $at;
			continue;
		}

		// Some characters are best left visible to the human mind.
		$should_unhide = 1 === strspn( $character_reference, ',%()0123456789:[]ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz{}' );
		if ( $should_unhide ) {
			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . $character_reference;
			$at           = $next_character_reference_at + $token_length;
			$was_at       = $at;
			continue;
		}

		// Numeric references to syntax characters are replaced by their named forms.
		$is_syntax = 1 === strspn( $character_reference, '&"\'<>' );
		if ( $is_syntax && '#' === $text[ $next_character_reference_at + 1 ] ) {
			$named_form   = strtr(
				$character_reference,
				array(
					'&' => '&amp;',
					'"' => '&quot;',
					"'" => '&apos;',
					'<' => '&lt;',
					'>' => '&gt;',
				)
			);
			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . $named_form;
			$at           = $next_character_reference_at + $token_length;
			$was_at       = $at;
			continue;
		}

		// This is a valid character reference, but it might not be normative.
		$needs_semicolon = ';' !== $text[ $next_character_reference_at + $token_length - 1 ];

		// This is a named character reference.
		if ( '#' !== $text[ $next_character_reference_at + 1 ] ) {
			// Nothing to do for already-normalized named character references.
			if ( ! $needs_semicolon ) {
				$at = $next_character_reference_at + $token_length;
				continue;
			}

			// Add the missing semicolon.
			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at + $token_length ) . ';';
			$at           = $next_character_reference_at + $token_length;
			$was_at       = $at;
			continue;
		}

		/*
		 * While named character references have only a single form and are case sensitive,
		 * numeric character references may contain upper or lowercase hex values and may
		 * contain unlimited preceding zeros.
		 */
		$prefix_char = $text[ $next_character_reference_at + 2 ];
		$is_hex      = 'x' === $prefix_char || 'X' === $prefix_char;
		$digits_at   = $next_character_reference_at + ( $is_hex ? 3 : 2 );
		$digits_end  = $next_character_reference_at + $token_length - ( $needs_semicolon ? 0 : 1 );
		$digit_count = $digits_end - $digits_at;

		/*
		 * Always keep at least one digit: for a reference made only of zeros,
		 * such as `&#0;` or `&#x000;`, stripping every zero would leave `&#;`
		 * or `&#x;`, which is no longer a character reference at all.
		 */
		$leading_zeros = max( 0, min( strspn( $text, '0', $digits_at, $digit_count ), $digit_count - 1 ) );

		if ( ! $needs_semicolon && ! $is_hex && 0 === $leading_zeros ) {
			// Nothing to do for already-normalized decimal numeric character references.
			$at = $next_character_reference_at + $token_length;
			continue;
		}

		$digits = substr( $text, $digits_at + $leading_zeros, $digit_count - $leading_zeros );
		if ( $is_hex ) {
			$lower_digits = strtolower( $digits );

			/*
			 * Nothing to do for already-normalized hexadecimal numeric character
			 * references. The prefix must be checked too, since `&#Xfe;` has
			 * lowercase digits but still needs its `X` lowercased.
			 */
			if ( 'x' === $prefix_char && $lower_digits === $digits && ! $needs_semicolon && 0 === $leading_zeros ) {
				$at = $next_character_reference_at + $token_length;
				continue;
			}

			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . "&#x{$lower_digits};";
		} else {
			$normalized[] = substr( $text, $was_at, $next_character_reference_at - $was_at ) . "&#{$digits};";
		}

		$at     = $next_character_reference_at + $token_length;
		$was_at = $at;
	}

	if ( 0 === $was_at ) {
		/*
		 * Nothing was rewritten. Every stray ampersand advances `$was_at`, so
		 * any ampersand still present starts an already-normalized character
		 * reference and must not be encoded again.
		 */
		$normalized_text = $text;
	} else {
		$normalized[]    = substr( $text, $was_at, $end - $was_at );
		$normalized_text = implode( '', $normalized );
	}

	return strtr(
		$normalized_text,
		array(
			'<' => '&lt;',
			'>' => '&gt;',
			'"' => '&quot;',
			"'" => '&apos;',
			/*
			 * Stray ampersand "&" characters have already been replaced above,
			 * so it’s inappropriate to replace again here, as all remaining
			 * instances should be part of a normalized character reference.
			 */
		)
	);
}
1136+
9951137
/**
9961138
* Converts a number of HTML entities into their special characters.
9971139
*
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
<?php
2+
3+
/**
 * Tests for normalizing escaped HTML text.
 *
 * @group formatting
 *
 * @covers ::wp_normalize_escaped_html_text
 */
class Tests_Formatting_NormalizeEscapedHtmlText extends WP_UnitTestCase {
	/**
	 * Ensures that HTML text is properly normalized.
	 *
	 * @dataProvider data_example_datasets
	 *
	 * @param string $context  Context passed to the function: "attribute" or "data".
	 * @param string $text     Input text to normalize.
	 * @param string $expected Expected normalized output.
	 */
	public function test_example_datasets( $context, $text, $expected ) {
		// Strict comparison: the function must return exactly this string.
		$this->assertSame(
			$expected,
			wp_normalize_escaped_html_text( $context, $text )
		);
	}

	/**
	 * Data provider.
	 *
	 * @return array[] Each dataset is a list of context, input text, and expected output.
	 */
	public static function data_example_datasets() {
		return array(
			array( 'attribute', 'test', 'test' ),
			array( 'attribute', 'test & done', 'test &amp; done' ),
			array( 'attribute', '&#XFe; is not iron', '&#xfe; is not iron' ),
			array( 'attribute', 'spec > guess', 'spec &gt; guess' ),
			array( 'attribute', 'art & copy', 'art &amp; copy' ),
			array( 'attribute', '&#x1F170', '&#x1f170;' ),
			array( 'attribute', '&#x1F170 ', '&#x1f170; ' ),

			array( 'data', 'test', 'test' ),
			array( 'data', 'test & done', 'test &amp; done' ),
			array( 'data', '&#XFe; is not iron', '&#xfe; is not iron' ),
			array( 'data', 'spec > guess', 'spec &gt; guess' ),
			array( 'data', 'art & copy', 'art &amp; copy' ),
			array( 'data', '&#x1F170', '&#x1f170;' ),
			array( 'data', '&#x1F170 ', '&#x1f170; ' ),

			// The “ambiguous ampersand” has different rules in the attribute value and data states.
			array( 'attribute', '&notmyproblem', '&amp;notmyproblem' ),
			array( 'data', '&notmyproblem', '&not;myproblem' ),

			// Certain characters should remain plaintext.
			array( 'attribute', 'eat &#x000033; apples', 'eat 3 apples' ),
			array( 'data', 'eat &#x000033; apples', 'eat 3 apples' ),
			array( 'data', '<&#x00073;cr&#0105pt&gt;', '&lt;script&gt;' ),
			array( 'attribute', '&#x6a;avascript&#58alert&#40;&#x0000007b"test&quot;&#125;&#41;', 'javascript:alert({&quot;test&quot;})' ),

			// Syntax characters should be represented uniformly.
			array( 'attribute', '&#X3CIMG&#00062', '&lt;IMG&gt;' ),
			array( 'data', '&#X3CIMG&#00062', '&lt;IMG&gt;' ),
		);
	}
}

0 commit comments

Comments
 (0)