Skip to content

Commit 5565848

Browse files
committed
HTML API: Refactor wp_kses_hair() (#9248)
Trac ticket: Core-63694 `wp_kses_hair()` is built around an impressive state machine for parsing the `$attr` of an HTML tag, that is, the span of text after the tag name and before the closing `>`. Unfortunately, that parsing code doesn’t fully implement the HTML specification and may be prone to mis-parsing. This patch replaces the existing state machine with a straightforward use of the HTML API to parse the attributes for us, constructing a shell tag for the `$attr` string and reading the attributes structurally. This shell is necessary because a previous stage of the pipeline has already separated what it thinks is the so-called “attribute list” from a tag. Props: dmsnell
1 parent 2077aa2 commit 5565848

File tree

2 files changed

+51
-139
lines changed

2 files changed

+51
-139
lines changed

src/wp-includes/kses.php

Lines changed: 50 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -1405,160 +1405,72 @@ function wp_kses_attr_check( &$name, &$value, &$whole, $vless, $element, $allowe
14051405
}
14061406

14071407
/**
1408-
* Builds an attribute list from string containing attributes.
1409-
*
1410-
* This function does a lot of work. It parses an attribute list into an array
1411-
* with attribute data, and tries to do the right thing even if it gets weird
1412-
* input. It will add quotes around attribute values that don't have any quotes
1413-
* or apostrophes around them, to make it easier to produce HTML code that will
1414-
* conform to W3C's HTML specification. It will also remove bad URL protocols
1415-
* from attribute values. It also reduces duplicate attributes by using the
1416-
* attribute defined first (`foo='bar' foo='baz'` will result in `foo='bar'`).
1408+
* Given a string of HTML attributes and values, parse into a structured attribute list.
1409+
*
1410+
* This function performs a number of transformations while parsing attribute strings:
1411+
* - It normalizes attribute values and surrounds them with double quotes.
1412+
* - It normalizes HTML character references inside attribute values.
1413+
* - It removes “bad” URL protocols from attribute values.
1414+
*
1415+
* Otherwise this reads the attributes as if they were part of an HTML tag. It performs
1416+
* these transformations to lower the risk of mis-parsing down the line and to perform
1417+
* URL sanitization in line with the rest of the `kses` subsystem. Importantly, it does
1418+
* not decode the attribute values, meaning that special HTML syntax characters will
1419+
* be left with character references in the `value` property.
1420+
*
1421+
* Example:
1422+
*
1423+
* $attrs = wp_kses_hair( 'class="is-wide" inert data-lazy=\'<img&#00062\' =/🐮=/' );
1424+
* $attrs === array(
1425+
* 'class' => array( 'name' => 'class', 'value' => 'is-wide', 'whole' => 'class="is-wide"', 'vless' => 'n' ),
1426+
* 'inert' => array( 'name' => 'inert', 'value' => '', 'whole' => 'inert', 'vless' => 'y' ),
1427+
* 'data-lazy' => array( 'name' => 'data-lazy', 'value' => '&lt;img&gt;', 'whole' => 'data-lazy="&lt;img&gt;"', 'vless' => 'n' ),
1428+
* '=' => array( 'name' => '=', 'value' => '', 'whole' => '=', 'vless' => 'y' ),
1429+
* '🐮' => array( 'name' => '🐮', 'value' => '/', 'whole' => '🐮="/"', 'vless' => 'n' ),
1430+
* );
14171431
*
14181432
* @since 1.0.0
1433+
* @since 6.9.0 Rebuilt on HTML API
14191434
*
14201435
* @param string $attr Attribute list from HTML element to closing HTML element tag.
14211436
* @param string[] $allowed_protocols Array of allowed URL protocols.
14221437
* @return array[] Array of attribute information after parsing.
14231438
*/
14241439
function wp_kses_hair( $attr, $allowed_protocols ) {
1425-
$attrarr = array();
1426-
$mode = 0;
1427-
$attrname = '';
1428-
$uris = wp_kses_uri_attributes();
1429-
1430-
// Loop through the whole attribute list.
1431-
1432-
while ( strlen( $attr ) !== 0 ) {
1433-
$working = 0; // Was the last operation successful?
1434-
1435-
switch ( $mode ) {
1436-
case 0:
1437-
if ( preg_match( '/^([_a-zA-Z][-_a-zA-Z0-9:.]*)/', $attr, $match ) ) {
1438-
$attrname = $match[1];
1439-
$working = 1;
1440-
$mode = 1;
1441-
$attr = preg_replace( '/^[_a-zA-Z][-_a-zA-Z0-9:.]*/', '', $attr );
1442-
}
1443-
1444-
break;
1445-
1446-
case 1:
1447-
if ( preg_match( '/^\s*=\s*/', $attr ) ) { // Equals sign.
1448-
$working = 1;
1449-
$mode = 2;
1450-
$attr = preg_replace( '/^\s*=\s*/', '', $attr );
1451-
break;
1452-
}
1453-
1454-
if ( preg_match( '/^\s+/', $attr ) ) { // Valueless.
1455-
$working = 1;
1456-
$mode = 0;
1457-
1458-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1459-
$attrarr[ $attrname ] = array(
1460-
'name' => $attrname,
1461-
'value' => '',
1462-
'whole' => $attrname,
1463-
'vless' => 'y',
1464-
);
1465-
}
1466-
1467-
$attr = preg_replace( '/^\s+/', '', $attr );
1468-
}
1469-
1470-
break;
1471-
1472-
case 2:
1473-
if ( preg_match( '%^"([^"]*)"(\s+|/?$)%', $attr, $match ) ) {
1474-
// "value"
1475-
$thisval = $match[1];
1476-
if ( in_array( strtolower( $attrname ), $uris, true ) ) {
1477-
$thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
1478-
}
1479-
1480-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1481-
$attrarr[ $attrname ] = array(
1482-
'name' => $attrname,
1483-
'value' => $thisval,
1484-
'whole' => "$attrname=\"$thisval\"",
1485-
'vless' => 'n',
1486-
);
1487-
}
1488-
1489-
$working = 1;
1490-
$mode = 0;
1491-
$attr = preg_replace( '/^"[^"]*"(\s+|$)/', '', $attr );
1492-
break;
1493-
}
1440+
$attributes = array();
1441+
$uris = wp_kses_uri_attributes();
14941442

1495-
if ( preg_match( "%^'([^']*)'(\s+|/?$)%", $attr, $match ) ) {
1496-
// 'value'
1497-
$thisval = $match[1];
1498-
if ( in_array( strtolower( $attrname ), $uris, true ) ) {
1499-
$thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
1500-
}
1501-
1502-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1503-
$attrarr[ $attrname ] = array(
1504-
'name' => $attrname,
1505-
'value' => $thisval,
1506-
'whole' => "$attrname='$thisval'",
1507-
'vless' => 'n',
1508-
);
1509-
}
1510-
1511-
$working = 1;
1512-
$mode = 0;
1513-
$attr = preg_replace( "/^'[^']*'(\s+|$)/", '', $attr );
1514-
break;
1515-
}
1443+
$processor = new WP_HTML_Tag_Processor( "<wp {$attr}>" );
1444+
$processor->next_token();
15161445

1517-
if ( preg_match( "%^([^\s\"']+)(\s+|/?$)%", $attr, $match ) ) {
1518-
// value
1519-
$thisval = $match[1];
1520-
if ( in_array( strtolower( $attrname ), $uris, true ) ) {
1521-
$thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
1522-
}
1523-
1524-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1525-
$attrarr[ $attrname ] = array(
1526-
'name' => $attrname,
1527-
'value' => $thisval,
1528-
'whole' => "$attrname=\"$thisval\"",
1529-
'vless' => 'n',
1530-
);
1531-
}
1532-
1533-
// We add quotes to conform to W3C's HTML spec.
1534-
$working = 1;
1535-
$mode = 0;
1536-
$attr = preg_replace( "%^[^\s\"']+(\s+|$)%", '', $attr );
1537-
}
1446+
foreach ( $processor->get_attribute_names_with_prefix( '' ) as $name ) {
1447+
$value = $processor->get_attribute( $name );
1448+
$is_bool = true === $value;
1449+
if ( is_string( $value ) && in_array( $name, $uris, true ) ) {
1450+
$value = wp_kses_bad_protocol( $value, $allowed_protocols );
1451+
}
15381452

1539-
break;
1540-
} // End switch.
1453+
// Reconstruct and normalize the attribute value.
1454+
$syntax_characters = array(
1455+
'&' => '&amp;',
1456+
'<' => '&lt;',
1457+
'>' => '&gt;',
1458+
"'" => '&apos;',
1459+
'"' => '&quot;',
1460+
);
15411461

1542-
if ( 0 === $working ) { // Not well-formed, remove and try again.
1543-
$attr = wp_kses_html_error( $attr );
1544-
$mode = 0;
1545-
}
1546-
} // End while.
1462+
$recoded = $is_bool ? '' : strtr( $value, $syntax_characters );
1463+
$whole = $is_bool ? $name : "{$name}=\"{$recoded}\"";
15471464

1548-
if ( 1 === $mode && false === array_key_exists( $attrname, $attrarr ) ) {
1549-
/*
1550-
* Special case, for when the attribute list ends with a valueless
1551-
* attribute like "selected".
1552-
*/
1553-
$attrarr[ $attrname ] = array(
1554-
'name' => $attrname,
1555-
'value' => '',
1556-
'whole' => $attrname,
1557-
'vless' => 'y',
1465+
$attributes[ $name ] = array(
1466+
'name' => $name,
1467+
'value' => $recoded,
1468+
'whole' => $whole,
1469+
'vless' => $is_bool ? 'y' : 'n',
15581470
);
15591471
}
15601472

1561-
return $attrarr;
1473+
return $attributes;
15621474
}
15631475

15641476
/**

tests/phpunit/tests/media.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ public function test_new_img_caption_shortcode_with_html_caption() {
227227
$this->assertStringNotContainsString(
228228
self::HTML_CONTENT,
229229
$mark,
230-
'Test caption content should not contain the mark surround it: check test setup.'
230+
'Test caption content should not contain the mark surrounding it: check test setup.'
231231
);
232232

233233
$result = img_caption_shortcode(

0 commit comments

Comments
 (0)