Skip to content

Commit ac417f0

Browse files
committed
HTML API: Reliably parse HTML in wp_html_split().
Trac ticket: Core-63694. This probably improves performance, in terms of both CPU time and memory, compared to the old PCRE-based approach.
1 parent 1885bee commit ac417f0

File tree

1 file changed

+22
-2
lines changed

1 file changed

+22
-2
lines changed

src/wp-includes/formatting.php

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -605,21 +605,41 @@ function wpautop( $text, $br = true ) {
605605
}
606606

607607
/**
608-
* Separates HTML elements and comments from the text.
608+
* Splits an HTML input into an array of raw strings, where each token
609+
* represents a tag, a comment, a text node, etc…
610+
*
611+
* No effort is made to clean up, sanitize, or normalize the segments
612+
* of HTML. {@see WP_HTML_Processor::normalize()} for normalization.
609613
*
610614
* @since 4.2.4
615+
* @since {WP_VERSION} Reliably parses HTML via the HTML API.
611616
*
612617
* @param string $input The HTML to split into raw tokens.
613618
* @return string[] Array of raw HTML segments, in the order they appear in the input.
614619
*/
615620
function wp_html_split( $input ) {
	/*
	 * The Tag Processor keeps its token offsets private, but bookmarks are
	 * visible to subclasses. A small anonymous subclass therefore bookmarks
	 * the current token and reads the span back to slice the raw bytes.
	 */
	$token_reporter = new class( $input ) extends WP_HTML_Tag_Processor {
		/**
		 * Byte offset just past the most recently extracted token.
		 *
		 * @var int
		 */
		private $extracted_up_to = 0;

		/**
		 * Returns the raw, unmodified HTML of the currently-matched token.
		 *
		 * @return string Raw bytes of the current token.
		 */
		public function extract_raw_token() {
			$this->set_bookmark( 'here' );
			$here = $this->bookmarks['here'];

			$this->extracted_up_to = $here->start + $here->length;

			return substr( $this->html, $here->start, $here->length );
		}

		/**
		 * Returns whatever input follows the last extracted token.
		 *
		 * This is non-empty when the processor stopped before the end of
		 * the document, e.g. on a truncated tag or an unclosed comment.
		 *
		 * @return string Unparsed trailing bytes, or an empty string if none remain.
		 */
		public function extract_unparsed_remainder() {
			// Compare lengths first: substr() at the end offset returns false before PHP 8.
			if ( $this->extracted_up_to >= strlen( $this->html ) ) {
				return '';
			}

			return substr( $this->html, $this->extracted_up_to );
		}
	};

	$tokens = array();
	while ( $token_reporter->next_token() ) {
		$tokens[] = $token_reporter->extract_raw_token();
	}

	/*
	 * next_token() also returns false when it pauses on incomplete syntax
	 * at the end of the input. Keep those bytes as a final segment so that
	 * joining the returned segments always reproduces the original input.
	 */
	$remainder = $token_reporter->extract_unparsed_remainder();
	if ( '' !== $remainder ) {
		$tokens[] = $remainder;
	}

	return $tokens;
}
618637

619638
/**
620639
* Retrieves the regular expression for an HTML element.
621640
*
622641
* @since 4.4.0
642+
* @deprecated {WP_VERSION} Use the HTML API instead.
623643
*
624644
* @return string The regular expression.
625645
*/

0 commit comments

Comments
 (0)