Skip to content

Commit 98602ab

Browse files
committed
HTML Split: Match legacy behavior from preg_split
1 parent 9e74bce commit 98602ab

File tree

1 file changed

+62
-5
lines changed

1 file changed

+62
-5
lines changed

src/wp-includes/formatting.php

Lines changed: 62 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -611,11 +611,44 @@ function wpautop( $text, $br = true ) {
611611
* No effort is made to clean up, sanitize, or normalize the segments
612612
* of HTML. {@see WP_HTML_Processor::normalize()} for normalization.
613613
*
614+
* Consider using the HTML API directly instead of relying on this
615+
* legacy function: it bloats memory by default and provides a text
616+
* interface for working with HTML whereas the HTML API provides a
617+
* low-overhead and convenient structural interface.
618+
*
619+
* ## Output format:
620+
*
621+
* To maintain legacy behaviors with this function from when it
622+
* operated via {@see preg_split()}, the output array injects text
623+
* nodes which do not appear in the source HTML. That is, the original
624+
* array, relying on {@see PREG_SPLIT_DELIM_CAPTURE}, included a text
625+
* span on each side of every tag-like or comment-like “delimiter” in
626+
* the matched string.
627+
*
628+
* Therefore, the output array will always start and end with text nodes
629+
* and will separate every non-text node with a text node. If there is no
630+
* actual content in the interstitial space between tokens in the source
631+
* document, an empty text node will be created.
632+
*
633+
* Example:
634+
*
635+
* array( '', '<img>', '' ) === wp_html_split( '<img>' );
636+
* array( 'test' ) === wp_html_split( 'test' );
637+
* array( '', '<p>', 'test' ) === wp_html_split( '<p>test' );
638+
* array( 'test', '</p>', '' ) === wp_html_split( 'test</p>' );
639+
*
640+
* array( '', '<br>', '', '<!-- comment -->', '' ) === wp_html_split( '<br><!-- comment -->' );
641+
*
642+
* // To avoid ambiguity, leading less-than signs (<) in text nodes are encoded.
643+
* array( '&#60;3' ) === wp_html_split( '<3' );
644+
*
614645
* @since 4.2.4
615-
* @since {WP_VERSION} Reliably parses HTML via the HTML API.
646+
* @since 6.9.0 Reliably parses HTML via the HTML API.
616647
*
617-
* @param string $input The text which has to be formatted.
618-
* @return string[] Array of the formatted text.
648+
* @param string $input HTML document to split, one item for every token.
649+
* These can be text nodes, tags, comments, or doctype declarations.
650+
* @return string[] Tokens from input; starting and ending in a text node, and with text
651+
* nodes between every non-text node (see docblock note).
619652
*/
620653
function wp_html_split( $input ) {
621654
$token_reporter = new class( $input ) extends WP_HTML_Tag_Processor {
@@ -627,9 +660,33 @@ public function extract_raw_token() {
627660
}
628661
};
629662

630-
$tokens = array();
663+
$tokens = array();
664+
$was_text = false;
631665
while ( $token_reporter->next_token() ) {
632-
$tokens[] = $token_reporter->extract_raw_token();
666+
$raw_token = $token_reporter->extract_raw_token();
667+
$is_text = '#text' === $token_reporter->get_token_name();
668+
669+
if ( ! $is_text && ! $was_text ) {
670+
$tokens[] = '';
671+
}
672+
673+
/*
674+
* Some legacy code assumes that text nodes will never start with a
675+
* less-than sign (<) but this isn’t the case, as some text nodes do
676+
* if the less-than sign doesn’t introduce a syntax token. To avoid
677+
* further corruption a leading less-than sign is replaced by its
678+
* encoded equivalent numeric character reference.
679+
*/
680+
if ( $is_text && '<' === ( $raw_token[0] ?? '' ) ) {
681+
$raw_token = '&#60;' . substr( $raw_token, 1 );
682+
}
683+
684+
$tokens[] = $raw_token;
685+
$was_text = $is_text;
686+
}
687+
688+
if ( ! $was_text ) {
689+
$tokens[] = '';
633690
}
634691

635692
return $tokens;

0 commit comments

Comments
 (0)