@@ -1585,160 +1585,72 @@ function wp_kses_attr_check( &$name, &$value, &$whole, $vless, $element, $allowe
15851585}
15861586
/**
 * Given a string of HTML attributes and values, parse into a structured attribute list.
 *
 * This function performs a number of transformations while parsing attribute strings:
 *  - It normalizes attribute values and surrounds them with double quotes.
 *  - It normalizes HTML character references inside attribute values.
 *  - It removes “bad” URL protocols from attribute values.
 *
 * Otherwise this reads the attributes as if they were part of an HTML tag. It performs
 * these transformations to lower the risk of mis-parsing down the line and to perform
 * URL sanitization in line with the rest of the `kses` subsystem. Importantly, it does
 * not decode the attribute values, meaning that special HTML syntax characters will
 * be left with character references in the `value` property.
 *
 * Example:
 *
 *     $attrs = wp_kses_hair( 'class="is-wide" inert data-lazy=\'<img>\' =/🐮=/' );
 *     $attrs === array(
 *         'class'     => array( 'name' => 'class', 'value' => 'is-wide', 'whole' => 'class="is-wide"', 'vless' => 'n' ),
 *         'inert'     => array( 'name' => 'inert', 'value' => '', 'whole' => 'inert', 'vless' => 'y' ),
 *         'data-lazy' => array( 'name' => 'data-lazy', 'value' => '&lt;img&gt;', 'whole' => 'data-lazy="&lt;img&gt;"', 'vless' => 'n' ),
 *         '='         => array( 'name' => '=', 'value' => '', 'whole' => '=', 'vless' => 'y' ),
 *         '🐮'        => array( 'name' => '🐮', 'value' => '/', 'whole' => '🐮="/"', 'vless' => 'n' ),
 *     );
 *
 * @since 1.0.0
 * @since 6.9.0 Rebuilt on HTML API
 *
 * @param string   $attr              Attribute list from HTML element to closing HTML element tag.
 * @param string[] $allowed_protocols Array of allowed URL protocols.
 * @return array[] Array of attribute information after parsing. Each entry is keyed by
 *                 attribute name and holds the `name`, `value`, `whole`, and `vless` fields.
 */
function wp_kses_hair( $attr, $allowed_protocols ) {
	$attributes = array();
	$uris       = wp_kses_uri_attributes();

	// Wrap the attribute string in a synthetic tag so that it can be read as tag attributes.
	$processor = new WP_HTML_Tag_Processor( "<wp {$attr}>" );
	$processor->next_token();

	/*
	 * The name list may be `null` when no tag was matched, for example when
	 * the attribute string leaves the synthetic tag unterminated. Bail early
	 * instead of iterating over a non-array.
	 */
	$names = $processor->get_attribute_names_with_prefix( '' );
	if ( ! is_array( $names ) ) {
		return $attributes;
	}

	// Syntax characters are re-encoded so that `value` and `whole` never carry them raw.
	$syntax_characters = array(
		'&' => '&amp;',
		'<' => '&lt;',
		'>' => '&gt;',
		"'" => '&apos;',
		'"' => '&quot;',
	);

	foreach ( $names as $name ) {
		$value = $processor->get_attribute( $name );

		// A value of `true` marks an attribute that was written without any value.
		$is_bool = true === $value;

		if ( is_string( $value ) && in_array( $name, $uris, true ) ) {
			$value = wp_kses_bad_protocol( $value, $allowed_protocols );
		}

		// Reconstruct and normalize the attribute value.
		$recoded = $is_bool ? '' : strtr( $value, $syntax_characters );
		$whole   = $is_bool ? $name : "{$name}=\"{$recoded}\"";

		$attributes[ $name ] = array(
			'name'  => $name,
			'value' => $recoded,
			'whole' => $whole,
			'vless' => $is_bool ? 'y' : 'n',
		);
	}

	return $attributes;
}
17431655
17441656/**
0 commit comments