@@ -1405,160 +1405,72 @@ function wp_kses_attr_check( &$name, &$value, &$whole, $vless, $element, $allowe
14051405}
14061406
/**
 * Given a string of HTML attributes and values, parse into a structured attribute list.
 *
 * This function performs a number of transformations while parsing attribute strings:
 *  - It normalizes attribute values and surrounds them with double quotes.
 *  - It normalizes HTML character references inside attribute values.
 *  - It removes “bad” URL protocols from attribute values.
 *
 * Otherwise this reads the attributes as if they were part of an HTML tag. It performs
 * these transformations to lower the risk of mis-parsing down the line and to perform
 * URL sanitization in line with the rest of the `kses` subsystem. Importantly, it does
 * not decode the attribute values, meaning that special HTML syntax characters will
 * be left with character references in the `value` property.
 *
 * When the attribute string cannot be read as the attributes of a tag, an empty
 * array is returned.
 *
 * Example:
 *
 *     $attrs = wp_kses_hair( 'class="is-wide" inert data-lazy=\'<img>\' =/🐮=/' );
 *     $attrs === array(
 *         'class'     => array( 'name' => 'class', 'value' => 'is-wide', 'whole' => 'class="is-wide"', 'vless' => 'n' ),
 *         'inert'     => array( 'name' => 'inert', 'value' => '', 'whole' => 'inert', 'vless' => 'y' ),
 *         'data-lazy' => array( 'name' => 'data-lazy', 'value' => '&lt;img&gt;', 'whole' => 'data-lazy="&lt;img&gt;"', 'vless' => 'n' ),
 *         '='         => array( 'name' => '=', 'value' => '', 'whole' => '=', 'vless' => 'y' ),
 *         '🐮'        => array( 'name' => '🐮', 'value' => '/', 'whole' => '🐮="/"', 'vless' => 'n' ),
 *     );
 *
 * @since 1.0.0
 * @since 6.9.0 Rebuilt on HTML API.
 *
 * @param string   $attr              Attribute list from HTML element to closing HTML element tag.
 * @param string[] $allowed_protocols Array of allowed URL protocols.
 * @return array[] Array of attribute information after parsing, keyed by attribute name.
 *                 Each entry holds `name`, `value` (re-encoded, empty for boolean
 *                 attributes), `whole` (the normalized `name="value"` source, or just
 *                 the name for boolean attributes), and `vless` (`'y'` for boolean
 *                 attributes, `'n'` otherwise).
 */
function wp_kses_hair( $attr, $allowed_protocols ) {
	$attributes = array();
	$uris       = wp_kses_uri_attributes();

	/*
	 * HTML syntax characters and the character references which replace them
	 * when re-encoding a value. The map never changes, so it is built once
	 * here instead of on every pass through the loop below.
	 */
	$syntax_characters = array(
		'&' => '&amp;',
		'<' => '&lt;',
		'>' => '&gt;',
		"'" => '&apos;',
		'"' => '&quot;',
	);

	// Wrap the attribute string in a placeholder tag so the HTML API can read it.
	$processor = new WP_HTML_Tag_Processor( "<wp {$attr}>" );
	$processor->next_token();

	/*
	 * The attribute-name lookup is nullable: it reports `null` rather than a list
	 * when the processor is not stopped on an opening tag, e.g. when an unterminated
	 * quoted value prevents the placeholder tag from being matched. Bail out with
	 * the empty list instead of iterating over `null`.
	 */
	$names = $processor->get_attribute_names_with_prefix( '' );
	if ( ! is_array( $names ) ) {
		return $attributes;
	}

	foreach ( $names as $name ) {
		$value   = $processor->get_attribute( $name );
		$is_bool = true === $value;

		// Only string values of URI-bearing attributes undergo protocol filtering.
		if ( is_string( $value ) && in_array( $name, $uris, true ) ) {
			$value = wp_kses_bad_protocol( $value, $allowed_protocols );
		}

		// Reconstruct and normalize the attribute value.
		$recoded = $is_bool ? '' : strtr( $value, $syntax_characters );
		$whole   = $is_bool ? $name : "{$name}=\"{$recoded}\"";

		$attributes[ $name ] = array(
			'name'  => $name,
			'value' => $recoded,
			'whole' => $whole,
			'vless' => $is_bool ? 'y' : 'n',
		);
	}

	return $attributes;
}
15631475
15641476/**
0 commit comments