Skip to content

Commit 5655fb0

Browse files
committed
HTML API: Add missing subclass methods to HTML Processor and add token provenance.
This patch introduces two related changes:

- It adds missing subclass methods on the HTML Processor which needed to be implemented since it started visiting virtual nodes. These methods need to account for the fact that not all tokens truly exist.
- It adds a new concept and internal method, `is_virtual()`, indicating if the currently-matched token comes from the raw text in the input HTML document or if it was the byproduct of semantic parsing rules. This internal method and the new vocabulary around token provenance considerably simplify the logic spread throughout the rest of the class and its subclass methods.

Developed in #6860
Discussed in https://core.trac.wordpress.org/ticket/61348

Follow-up to [58304].

Props dmsnell, jonsurrell, gziolo.
See #61348.

git-svn-id: https://develop.svn.wordpress.org/trunk@58558 602fd350-edb4-49c9-b593-d223f7449a82
1 parent c2e7ab3 commit 5655fb0

File tree

2 files changed

+186
-55
lines changed

2 files changed

+186
-55
lines changed

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 168 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -349,13 +349,19 @@ public function __construct( $html, $use_the_static_create_methods_instead = nul
349349

350350
$this->state->stack_of_open_elements->set_push_handler(
351351
function ( WP_HTML_Token $token ) {
352-
$this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::PUSH );
352+
$is_virtual = ! isset( $this->state->current_token ) || $this->is_tag_closer();
353+
$same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name;
354+
$provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real';
355+
$this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::PUSH, $provenance );
353356
}
354357
);
355358

356359
$this->state->stack_of_open_elements->set_pop_handler(
357360
function ( WP_HTML_Token $token ) {
358-
$this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::POP );
361+
$is_virtual = ! isset( $this->state->current_token ) || ! $this->is_tag_closer();
362+
$same_node = isset( $this->state->current_token ) && $token->node_name === $this->state->current_token->node_name;
363+
$provenance = ( ! $same_node || $is_virtual ) ? 'virtual' : 'real';
364+
$this->element_queue[] = new WP_HTML_Stack_Event( $token, WP_HTML_Stack_Event::POP, $provenance );
359365
}
360366
);
361367

@@ -569,11 +575,26 @@ public function next_token() {
569575
* @return bool Whether the current tag is a tag closer.
570576
*/
571577
public function is_tag_closer() {
572-
return isset( $this->current_element )
573-
? ( WP_HTML_Stack_Event::POP === $this->current_element->operation )
578+
return $this->is_virtual()
579+
? ( WP_HTML_Stack_Event::POP === $this->current_element->operation && '#tag' === $this->get_token_type() )
574580
: parent::is_tag_closer();
575581
}
576582

583+
/**
584+
* Indicates if the currently-matched token is virtual, created by a stack operation
585+
* while processing HTML, rather than a token found in the HTML text itself.
586+
*
587+
* @since 6.6.0
588+
*
589+
* @return bool Whether the current token is virtual.
590+
*/
591+
private function is_virtual() {
592+
return (
593+
isset( $this->current_element->provenance ) &&
594+
'virtual' === $this->current_element->provenance
595+
);
596+
}
597+
577598
/**
578599
* Indicates if the currently-matched tag matches the given breadcrumbs.
579600
*
@@ -1440,7 +1461,7 @@ public function get_tag() {
14401461
return null;
14411462
}
14421463

1443-
if ( isset( $this->current_element ) ) {
1464+
if ( $this->is_virtual() ) {
14441465
return $this->current_element->token->node_name;
14451466
}
14461467

@@ -1459,6 +1480,27 @@ public function get_tag() {
14591480
}
14601481
}
14611482

1483+
/**
1484+
* Indicates if the currently matched tag contains the self-closing flag.
1485+
*
1486+
* No HTML elements ought to have the self-closing flag and for those, the self-closing
1487+
* flag will be ignored. For void elements this is benign because they "self close"
1488+
* automatically. For non-void HTML elements though problems will appear if someone
1489+
* intends to use a self-closing element in place of that element with an empty body.
1490+
* For HTML foreign elements and custom elements the self-closing flag determines if
1491+
* they self-close or not.
1492+
*
1493+
* This function does not determine if a tag is self-closing,
1494+
* but only if the self-closing flag is present in the syntax.
1495+
*
1496+
* @since 6.6.0 Subclassed for the HTML Processor.
1497+
*
1498+
* @return bool Whether the currently matched tag contains the self-closing flag.
1499+
*/
1500+
public function has_self_closing_flag() {
1501+
return $this->is_virtual() ? false : parent::has_self_closing_flag();
1502+
}
1503+
14621504
/**
14631505
* Returns the node name represented by the token.
14641506
*
@@ -1480,11 +1522,9 @@ public function get_tag() {
14801522
* @return string|null Name of the matched token.
14811523
*/
14821524
public function get_token_name() {
1483-
if ( isset( $this->current_element ) ) {
1484-
return $this->current_element->token->node_name;
1485-
}
1486-
1487-
return parent::get_token_name();
1525+
return $this->is_virtual()
1526+
? $this->current_element->token->node_name
1527+
: parent::get_token_name();
14881528
}
14891529

14901530
/**
@@ -1510,9 +1550,16 @@ public function get_token_name() {
15101550
* @return string|null What kind of token is matched, or null.
15111551
*/
15121552
public function get_token_type() {
1513-
if ( isset( $this->current_element ) ) {
1514-
$node_name = $this->current_element->token->node_name;
1515-
if ( ctype_upper( $node_name[0] ) ) {
1553+
if ( $this->is_virtual() ) {
1554+
/*
1555+
* This logic comes from the Tag Processor.
1556+
*
1557+
* @todo It would be ideal not to repeat this here, but it's not clearly
1558+
* better to allow passing a token name to `get_token_type()`.
1559+
*/
1560+
$node_name = $this->current_element->token->node_name;
1561+
$starting_char = $node_name[0];
1562+
if ( 'A' <= $starting_char && 'Z' >= $starting_char ) {
15161563
return '#tag';
15171564
}
15181565

@@ -1546,25 +1593,38 @@ public function get_token_type() {
15461593
* @return string|true|null Value of attribute or `null` if not available. Boolean attributes return `true`.
15471594
*/
15481595
public function get_attribute( $name ) {
1549-
if ( isset( $this->current_element ) ) {
1550-
// Closing tokens cannot contain attributes.
1551-
if ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
1552-
return null;
1553-
}
1554-
1555-
$node_name = $this->current_element->token->node_name;
1556-
1557-
// Only tags can contain attributes.
1558-
if ( 'A' > $node_name[0] || 'Z' < $node_name[0] ) {
1559-
return null;
1560-
}
1596+
return $this->is_virtual() ? null : parent::get_attribute( $name );
1597+
}
15611598

1562-
if ( $this->current_element->token->bookmark_name === (string) $this->bookmark_counter ) {
1563-
return parent::get_attribute( $name );
1564-
}
1565-
}
1599+
/**
1600+
* Updates or creates a new attribute on the currently matched tag with the passed value.
1601+
*
1602+
* For boolean attributes special handling is provided:
1603+
* - When `true` is passed as the value, then only the attribute name is added to the tag.
1604+
* - When `false` is passed, the attribute gets removed if it existed before.
1605+
*
1606+
* For string attributes, the value is escaped using the `esc_attr` function.
1607+
*
1608+
* @since 6.6.0 Subclassed for the HTML Processor.
1609+
*
1610+
* @param string $name The attribute name to target.
1611+
* @param string|bool $value The new attribute value.
1612+
* @return bool Whether an attribute value was set.
1613+
*/
1614+
public function set_attribute( $name, $value ) {
1615+
return $this->is_virtual() ? false : parent::set_attribute( $name, $value );
1616+
}
15661617

1567-
return null;
1618+
/**
1619+
* Removes an attribute from the currently-matched tag.
1620+
*
1621+
* @since 6.6.0 Subclassed for the HTML Processor.
1622+
*
1623+
* @param string $name The attribute name to remove.
1624+
* @return bool Whether an attribute was removed.
1625+
*/
1626+
public function remove_attribute( $name ) {
1627+
return $this->is_virtual() ? false : parent::remove_attribute( $name );
15681628
}
15691629

15701630
/**
@@ -1594,18 +1654,63 @@ public function get_attribute( $name ) {
15941654
* @return array|null List of attribute names, or `null` when no tag opener is matched.
15951655
*/
15961656
public function get_attribute_names_with_prefix( $prefix ) {
1597-
if ( isset( $this->current_element ) ) {
1598-
if ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
1599-
return null;
1600-
}
1657+
return $this->is_virtual() ? null : parent::get_attribute_names_with_prefix( $prefix );
1658+
}
16011659

1602-
$mark = $this->bookmarks[ $this->current_element->token->bookmark_name ];
1603-
if ( 0 === $mark->length ) {
1604-
return null;
1605-
}
1606-
}
1660+
/**
1661+
* Adds a new class name to the currently matched tag.
1662+
*
1663+
* @since 6.6.0 Subclassed for the HTML Processor.
1664+
*
1665+
* @param string $class_name The class name to add.
1666+
* @return bool Whether the class was set to be added.
1667+
*/
1668+
public function add_class( $class_name ) {
1669+
return $this->is_virtual() ? false : parent::add_class( $class_name );
1670+
}
1671+
1672+
/**
1673+
* Removes a class name from the currently matched tag.
1674+
*
1675+
* @since 6.6.0 Subclassed for the HTML Processor.
1676+
*
1677+
* @param string $class_name The class name to remove.
1678+
* @return bool Whether the class was set to be removed.
1679+
*/
1680+
public function remove_class( $class_name ) {
1681+
return $this->is_virtual() ? false : parent::remove_class( $class_name );
1682+
}
1683+
1684+
/**
1685+
* Returns if a matched tag contains the given ASCII case-insensitive class name.
1686+
*
1687+
* @since 6.6.0 Subclassed for the HTML Processor.
1688+
*
1689+
* @param string $wanted_class Look for this CSS class name, ASCII case-insensitive.
1690+
* @return bool|null Whether the matched tag contains the given class name, or null if not matched.
1691+
*/
1692+
public function has_class( $wanted_class ) {
1693+
return $this->is_virtual() ? null : parent::has_class( $wanted_class );
1694+
}
16071695

1608-
return parent::get_attribute_names_with_prefix( $prefix );
1696+
/**
1697+
* Generator for a foreach loop to step through each class name for the matched tag.
1698+
*
1699+
* This generator function is designed to be used inside a "foreach" loop.
1700+
*
1701+
* Example:
1702+
*
1703+
* $p = WP_HTML_Processor::create_fragment( "<div class='free &lt;egg&gt;\tlang-en'>" );
1704+
* $p->next_tag();
1705+
* foreach ( $p->class_list() as $class_name ) {
1706+
* echo "{$class_name} ";
1707+
* }
1708+
* // Outputs: "free <egg> lang-en "
1709+
*
1710+
* @since 6.6.0 Subclassed for the HTML Processor.
1711+
*/
1712+
public function class_list() {
1713+
return $this->is_virtual() ? null : parent::class_list();
16091714
}
16101715

16111716
/**
@@ -1629,17 +1734,30 @@ public function get_attribute_names_with_prefix( $prefix ) {
16291734
* @return string
16301735
*/
16311736
public function get_modifiable_text() {
1632-
if ( isset( $this->current_element ) ) {
1633-
if ( WP_HTML_Stack_Event::POP === $this->current_element->operation ) {
1634-
return '';
1635-
}
1737+
return $this->is_virtual() ? '' : parent::get_modifiable_text();
1738+
}
16361739

1637-
$mark = $this->bookmarks[ $this->current_element->token->bookmark_name ];
1638-
if ( 0 === $mark->length ) {
1639-
return '';
1640-
}
1641-
}
1642-
return parent::get_modifiable_text();
1740+
/**
1741+
* Indicates what kind of comment produced the comment node.
1742+
*
1743+
* Because there are different kinds of HTML syntax which produce
1744+
* comments, the Tag Processor tracks and exposes this as a type
1745+
* for the comment. Nominally only regular HTML comments exist as
1746+
* they are commonly known, but a number of unrelated syntax errors
1747+
* also produce comments.
1748+
*
1749+
* @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT
1750+
* @see self::COMMENT_AS_CDATA_LOOKALIKE
1751+
* @see self::COMMENT_AS_INVALID_HTML
1752+
* @see self::COMMENT_AS_HTML_COMMENT
1753+
* @see self::COMMENT_AS_PI_NODE_LOOKALIKE
1754+
*
1755+
* @since 6.6.0 Subclassed for the HTML Processor.
1756+
*
1757+
* @return string|null
1758+
*/
1759+
public function get_comment_type() {
1760+
return $this->is_virtual() ? null : parent::get_comment_type();
16431761
}
16441762

16451763
/**

src/wp-includes/html-api/class-wp-html-stack-event.php

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,27 @@ class WP_HTML_Stack_Event {
5656
*/
5757
public $operation;
5858

59+
/**
60+
* Indicates if the stack element is a real or virtual node.
61+
*
62+
* @since 6.6.0
63+
*
64+
* @var string
65+
*/
66+
public $provenance;
67+
5968
/**
6069
* Constructor function.
6170
*
62-
* @param WP_HTML_Token $token Token associated with stack event, always an opening token.
63-
* @param string $operation One of self::PUSH or self::POP.
71+
* @since 6.6.0
72+
*
73+
* @param WP_HTML_Token $token Token associated with stack event, always an opening token.
74+
* @param string $operation One of self::PUSH or self::POP.
75+
* @param string $provenance "virtual" or "real".
6476
*/
65-
public function __construct( $token, $operation ) {
66-
$this->token = $token;
67-
$this->operation = $operation;
77+
public function __construct( $token, $operation, $provenance ) {
78+
$this->token = $token;
79+
$this->operation = $operation;
80+
$this->provenance = $provenance;
6881
}
6982
}

0 commit comments

Comments
 (0)