Skip to content

Commit 63aecea

Browse files
committed
HTML API: Refactor wp_kses_hair() (#9248)
Trac ticket: Core-63694 `wp_kses_hair()` is built around an impressive state machine for parsing the `$attr` of an HTML tag, that is, the span of text after the tag name and before the closing `>`. Unfortunately, that parsing code doesn’t fully implement the HTML specification and may be prone to mis-parsing. This patch replaces the existing state machine with a straightforward use of the HTML API to parse the attributes for us, constructing a shell tag for the `$attr` string and reading the attributes structurally. This shell is necessary because a previous stage of the pipeline has already separated what it thinks is the so-called “attribute list” from a tag. Props: dmsnell
1 parent 4de4b72 commit 63aecea

File tree

2 files changed

+51
-139
lines changed

2 files changed

+51
-139
lines changed

src/wp-includes/kses.php

Lines changed: 50 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -1585,160 +1585,72 @@ function wp_kses_attr_check( &$name, &$value, &$whole, $vless, $element, $allowe
15851585
}
15861586

15871587
/**
1588-
* Builds an attribute list from string containing attributes.
1589-
*
1590-
* This function does a lot of work. It parses an attribute list into an array
1591-
* with attribute data, and tries to do the right thing even if it gets weird
1592-
* input. It will add quotes around attribute values that don't have any quotes
1593-
* or apostrophes around them, to make it easier to produce HTML code that will
1594-
* conform to W3C's HTML specification. It will also remove bad URL protocols
1595-
* from attribute values. It also reduces duplicate attributes by using the
1596-
* attribute defined first (`foo='bar' foo='baz'` will result in `foo='bar'`).
1588+
* Given a string of HTML attributes and values, parse into a structured attribute list.
1589+
*
1590+
* This function performs a number of transformations while parsing attribute strings:
1591+
* - It normalizes attribute values and surrounds them with double quotes.
1592+
* - It normalizes HTML character references inside attribute values.
1593+
* - It removes “bad” URL protocols from attribute values.
1594+
*
1595+
* Otherwise this reads the attributes as if they were part of an HTML tag. It performs
1596+
* these transformations to lower the risk of mis-parsing down the line and to perform
1597+
* URL sanitization in line with the rest of the `kses` subsystem. Importantly, it does
1598+
* not decode the attribute values, meaning that special HTML syntax characters will
1599+
* be left with character references in the `value` property.
1600+
*
1601+
* Example:
1602+
*
1603+
* $attrs = wp_kses_hair( 'class="is-wide" inert data-lazy=\'<img&#00062\' =/🐮=/' );
1604+
* $attrs === array(
1605+
* 'class' => array( 'name' => 'class', 'value' => 'is-wide', 'whole' => 'class="is-wide"', 'vless' => 'n' ),
1606+
* 'inert' => array( 'name' => 'inert', 'value' => '', 'whole' => 'inert', 'vless' => 'y' ),
1607+
* 'data-lazy' => array( 'name' => 'data-lazy', 'value' => '&lt;img&gt;', 'whole' => 'data-lazy="&lt;img&gt;"', 'vless' => 'n' ),
1608+
* '=' => array( 'name' => '=', 'value' => '', 'whole' => '=', 'vless' => 'y' ),
1609+
* '🐮' => array( 'name' => '🐮', 'value' => '/', 'whole' => '🐮="/"', 'vless' => 'n' ),
1610+
* );
15971611
*
15981612
* @since 1.0.0
1613+
* @since 6.9.0 Rebuilt on HTML API.
15991614
*
16001615
* @param string $attr Attribute list from HTML element to closing HTML element tag.
16011616
* @param string[] $allowed_protocols Array of allowed URL protocols.
16021617
* @return array[] Array of attribute information after parsing.
16031618
*/
16041619
function wp_kses_hair( $attr, $allowed_protocols ) {
1605-
$attrarr = array();
1606-
$mode = 0;
1607-
$attrname = '';
1608-
$uris = wp_kses_uri_attributes();
1609-
1610-
// Loop through the whole attribute list.
1611-
1612-
while ( strlen( $attr ) !== 0 ) {
1613-
$working = 0; // Was the last operation successful?
1614-
1615-
switch ( $mode ) {
1616-
case 0:
1617-
if ( preg_match( '/^([_a-zA-Z][-_a-zA-Z0-9:.]*)/', $attr, $match ) ) {
1618-
$attrname = $match[1];
1619-
$working = 1;
1620-
$mode = 1;
1621-
$attr = preg_replace( '/^[_a-zA-Z][-_a-zA-Z0-9:.]*/', '', $attr );
1622-
}
1623-
1624-
break;
1625-
1626-
case 1:
1627-
if ( preg_match( '/^\s*=\s*/', $attr ) ) { // Equals sign.
1628-
$working = 1;
1629-
$mode = 2;
1630-
$attr = preg_replace( '/^\s*=\s*/', '', $attr );
1631-
break;
1632-
}
1633-
1634-
if ( preg_match( '/^\s+/', $attr ) ) { // Valueless.
1635-
$working = 1;
1636-
$mode = 0;
1637-
1638-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1639-
$attrarr[ $attrname ] = array(
1640-
'name' => $attrname,
1641-
'value' => '',
1642-
'whole' => $attrname,
1643-
'vless' => 'y',
1644-
);
1645-
}
1646-
1647-
$attr = preg_replace( '/^\s+/', '', $attr );
1648-
}
1649-
1650-
break;
1651-
1652-
case 2:
1653-
if ( preg_match( '%^"([^"]*)"(\s+|/?$)%', $attr, $match ) ) {
1654-
// "value"
1655-
$thisval = $match[1];
1656-
if ( in_array( strtolower( $attrname ), $uris, true ) ) {
1657-
$thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
1658-
}
1659-
1660-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1661-
$attrarr[ $attrname ] = array(
1662-
'name' => $attrname,
1663-
'value' => $thisval,
1664-
'whole' => "$attrname=\"$thisval\"",
1665-
'vless' => 'n',
1666-
);
1667-
}
1668-
1669-
$working = 1;
1670-
$mode = 0;
1671-
$attr = preg_replace( '/^"[^"]*"(\s+|$)/', '', $attr );
1672-
break;
1673-
}
1620+
$attributes = array();
1621+
$uris = wp_kses_uri_attributes();
16741622

1675-
if ( preg_match( "%^'([^']*)'(\s+|/?$)%", $attr, $match ) ) {
1676-
// 'value'
1677-
$thisval = $match[1];
1678-
if ( in_array( strtolower( $attrname ), $uris, true ) ) {
1679-
$thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
1680-
}
1681-
1682-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1683-
$attrarr[ $attrname ] = array(
1684-
'name' => $attrname,
1685-
'value' => $thisval,
1686-
'whole' => "$attrname='$thisval'",
1687-
'vless' => 'n',
1688-
);
1689-
}
1690-
1691-
$working = 1;
1692-
$mode = 0;
1693-
$attr = preg_replace( "/^'[^']*'(\s+|$)/", '', $attr );
1694-
break;
1695-
}
1623+
$processor = new WP_HTML_Tag_Processor( "<wp {$attr}>" );
1624+
$processor->next_token();
16961625

1697-
if ( preg_match( "%^([^\s\"']+)(\s+|/?$)%", $attr, $match ) ) {
1698-
// value
1699-
$thisval = $match[1];
1700-
if ( in_array( strtolower( $attrname ), $uris, true ) ) {
1701-
$thisval = wp_kses_bad_protocol( $thisval, $allowed_protocols );
1702-
}
1703-
1704-
if ( false === array_key_exists( $attrname, $attrarr ) ) {
1705-
$attrarr[ $attrname ] = array(
1706-
'name' => $attrname,
1707-
'value' => $thisval,
1708-
'whole' => "$attrname=\"$thisval\"",
1709-
'vless' => 'n',
1710-
);
1711-
}
1712-
1713-
// We add quotes to conform to W3C's HTML spec.
1714-
$working = 1;
1715-
$mode = 0;
1716-
$attr = preg_replace( "%^[^\s\"']+(\s+|$)%", '', $attr );
1717-
}
1626+
foreach ( $processor->get_attribute_names_with_prefix( '' ) as $name ) {
1627+
$value = $processor->get_attribute( $name );
1628+
$is_bool = true === $value;
1629+
if ( is_string( $value ) && in_array( $name, $uris, true ) ) {
1630+
$value = wp_kses_bad_protocol( $value, $allowed_protocols );
1631+
}
17181632

1719-
break;
1720-
} // End switch.
1633+
// Reconstruct and normalize the attribute value.
1634+
$syntax_characters = array(
1635+
'&' => '&amp;',
1636+
'<' => '&lt;',
1637+
'>' => '&gt;',
1638+
"'" => '&apos;',
1639+
'"' => '&quot;',
1640+
);
17211641

1722-
if ( 0 === $working ) { // Not well-formed, remove and try again.
1723-
$attr = wp_kses_html_error( $attr );
1724-
$mode = 0;
1725-
}
1726-
} // End while.
1642+
$recoded = $is_bool ? '' : strtr( $value, $syntax_characters );
1643+
$whole = $is_bool ? $name : "{$name}=\"{$recoded}\"";
17271644

1728-
if ( 1 === $mode && false === array_key_exists( $attrname, $attrarr ) ) {
1729-
/*
1730-
* Special case, for when the attribute list ends with a valueless
1731-
* attribute like "selected".
1732-
*/
1733-
$attrarr[ $attrname ] = array(
1734-
'name' => $attrname,
1735-
'value' => '',
1736-
'whole' => $attrname,
1737-
'vless' => 'y',
1645+
$attributes[ $name ] = array(
1646+
'name' => $name,
1647+
'value' => $recoded,
1648+
'whole' => $whole,
1649+
'vless' => $is_bool ? 'y' : 'n',
17381650
);
17391651
}
17401652

1741-
return $attrarr;
1653+
return $attributes;
17421654
}
17431655

17441656
/**

tests/phpunit/tests/media.php

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ public function test_new_img_caption_shortcode_with_html_caption() {
227227
$this->assertStringNotContainsString(
228228
self::HTML_CONTENT,
229229
$mark,
230-
'Test caption content should not contain the mark surround it: check test setup.'
230+
'Test caption content should not contain the mark surrounding it: check test setup.'
231231
);
232232

233233
$result = img_caption_shortcode(

0 commit comments

Comments
 (0)