Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions src/wp-includes/l10n.php
Original file line number Diff line number Diff line change
Expand Up @@ -2009,3 +2009,105 @@ function wp_get_word_count_type() {
function has_translation( string $singular, string $textdomain = 'default', ?string $locale = null ): bool {
	// Hand the lookup to the shared translation controller instance.
	$controller = WP_Translation_Controller::get_instance();

	return $controller->has_translation( $singular, $textdomain, $locale );
}


/**
 * Counts words or characters in a provided text string.
 *
 * HTML tags, HTML comments and (when configured) shortcodes are stripped from the
 * text before the selected counting regular expression is applied.
 *
 * @since 6.3.0
 *
 * @param string $text     Text to count elements in.
 * @param string $type     The type of count. Accepts 'words', 'characters_excluding_spaces',
 *                         or 'characters_including_spaces'. Any other value is treated as 'words'.
 * @param array  $settings {
 *     Optional. Array of arguments used to override the default settings.
 *
 *     @type string $html_regexp                        Optional. Regular expression to find HTML elements.
 *     @type string $html_comment_regexp                Optional. Regular expression to find HTML comments.
 *     @type string $space_regexp                       Optional. Regular expression to find irregular space
 *                                                      characters.
 *     @type string $html_entity_regexp                 Optional. Regular expression to find HTML entities.
 *     @type string $connector_regexp                   Optional. Regular expression to find connectors that
 *                                                      split words.
 *     @type string $remove_regexp                      Optional. Regular expression to find unwanted characters
 *                                                      to remove, reducing false positives.
 *     @type string $astral_regexp                      Optional. Regular expression to find characters above
 *                                                      U+FFFF; each match is counted as a single character.
 *     @type string $words_regexp                       Optional. Regular expression to find words by the
 *                                                      whitespace that follows them.
 *     @type string $characters_excluding_spaces_regexp Optional. Regular expression to find characters which
 *                                                      are non-spaces.
 *     @type string $characters_including_spaces_regexp Optional. Regular expression to find characters
 *                                                      including spaces.
 *     @type array  $shortcodes                         Optional. Array of shortcode names that should be removed
 *                                                      from the text. Names are matched literally.
 * }
 * @return int The word or character count. 0 for empty text or when a regular expression fails.
 */
function wp_word_count( $text, $type, $settings = array() ) {
	$defaults = array(
		'html_regexp'                        => '/<\/?[a-z][^>]*?>/i',
		'html_comment_regexp'                => '/<!--[\s\S]*?-->/',
		'space_regexp'                       => '/&nbsp;|&#160;/i',
		'html_entity_regexp'                 => '/&\S+?;/',
		'connector_regexp'                   => "/--|\x{2014}/u",
		'remove_regexp'                      => "/[\x{0021}-\x{0040}\x{005B}-\x{0060}\x{007B}-\x{007E}\x{0080}-\x{00BF}\x{00D7}\x{00F7}\x{2000}-\x{2BFF}\x{2E00}-\x{2E7F}]/u",
		'astral_regexp'                      => "/[\x{010000}-\x{10FFFF}]/u",
		'words_regexp'                       => '/\S\s+/u',
		'characters_excluding_spaces_regexp' => '/\S/u',
		'characters_including_spaces_regexp' => "/[^\f\n\r\t\v\x{00AD}\x{2028}\x{2029}]/u",
		'shortcodes'                         => array(),
	);

	if ( '' === trim( $text ) ) {
		return 0;
	}

	$settings = wp_parse_args( $settings, $defaults );

	// Sanitize type to one of three possibilities: 'words', 'characters_excluding_spaces' or 'characters_including_spaces'.
	if ( 'characters_excluding_spaces' !== $type && 'characters_including_spaces' !== $type ) {
		$type = 'words';
	}

	// If there are any shortcodes, build a regular expression that matches their opening and closing tags.
	if ( is_array( $settings['shortcodes'] ) && ! empty( $settings['shortcodes'] ) ) {
		// Quote each name so regex metacharacters in it are matched literally and cannot break the pattern.
		$quoted_shortcodes = array_map(
			static function ( $shortcode ) {
				return preg_quote( (string) $shortcode, '/' );
			},
			$settings['shortcodes']
		);

		$settings['shortcodes_regexp'] = '/\\[\\/?(?:' . implode( '|', $quoted_shortcodes ) . ')[^\\]]*?\\]/';
	}

	// Ordered array( pattern, replacement ) steps; the order matters and mirrors the counting rules.
	$steps = array(
		// Replace all HTML with a new-line.
		array( $settings['html_regexp'], "\n" ),
		// Remove all HTML comments.
		array( $settings['html_comment_regexp'], '' ),
	);

	// If a shortcode regular expression is available, use it to replace shortcodes with a new-line.
	if ( ! empty( $settings['shortcodes_regexp'] ) ) {
		$steps[] = array( $settings['shortcodes_regexp'], "\n" );
	}

	// Normalize non-breaking space to a normal space.
	$steps[] = array( $settings['space_regexp'], ' ' );

	if ( 'words' === $type ) {
		// Remove HTML entities.
		$steps[] = array( $settings['html_entity_regexp'], '' );

		// Convert connectors to spaces to count attached text as words.
		$steps[] = array( $settings['connector_regexp'], ' ' );

		// Remove unwanted characters.
		$steps[] = array( $settings['remove_regexp'], '' );
	} else {
		// Convert HTML entities to "a" so each one counts as a single character.
		$steps[] = array( $settings['html_entity_regexp'], 'a' );

		// Convert characters above U+FFFF to "a" so each one counts as a single character.
		$steps[] = array( $settings['astral_regexp'], 'a' );
	}

	// The words expression needs trailing whitespace after the final word to count it.
	$text .= "\n";

	foreach ( $steps as $step ) {
		$text = preg_replace( $step[0], $step[1], $text );

		// preg_replace() returns null on failure (e.g. invalid UTF-8 with the "u" modifier); nothing can be counted.
		if ( null === $text ) {
			return 0;
		}
	}

	// Match with the selected type regular expression to count the items.
	return (int) preg_match_all( $settings[ $type . '_regexp' ], $text );
}
207 changes: 207 additions & 0 deletions tests/phpunit/tests/l10n/wpWordCount.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
<?php

/**
 * Tests for wp_word_count().
 *
 * Each data set supplies a text and the expected result for every supported
 * count type, so the same provider drives all of the type-specific tests.
 *
 * @group l10n
 * @group i18n
 *
 * @covers ::wp_word_count
 */
class Tests_L10n_wpWordcount extends WP_UnitTestCase {

	/**
	 * Settings passed to wp_word_count() by the data-driven tests.
	 *
	 * @var array
	 */
	protected static $settings;

	/**
	 * Registers 'shortcode' as the shortcode to strip in the data-driven tests.
	 */
	public static function wpSetUpBeforeClass() {
		self::$settings = array(
			'shortcodes' => array( 'shortcode' ),
		);
	}

	/**
	 * @ticket 57987
	 *
	 * @dataProvider data_get_string_variations
	 *
	 * @param string $text     Text to count elements in.
	 * @param array  $expected Expected counts, keyed by count type.
	 */
	public function test_wp_word_count_should_return_the_number_of_words( $text, $expected ) {
		$this->assertSame( $expected['words'], wp_word_count( $text, 'words', self::$settings ) );
	}

	/**
	 * @ticket 57987
	 *
	 * @dataProvider data_get_string_variations
	 *
	 * @param string $text     Text to count elements in.
	 * @param array  $expected Expected counts, keyed by count type.
	 */
	public function test_wp_word_count_should_return_the_number_of_characters_excluding_spaces( $text, $expected ) {
		$this->assertSame( $expected['characters_excluding_spaces'], wp_word_count( $text, 'characters_excluding_spaces', self::$settings ) );
	}

	/**
	 * @ticket 57987
	 *
	 * @dataProvider data_get_string_variations
	 *
	 * @param string $text     Text to count elements in.
	 * @param array  $expected Expected counts, keyed by count type.
	 */
	public function test_wp_word_count_should_return_the_number_of_characters_including_spaces( $text, $expected ) {
		$this->assertSame( $expected['characters_including_spaces'], wp_word_count( $text, 'characters_including_spaces', self::$settings ) );
	}

	/**
	 * An unrecognized count type should be counted as words.
	 *
	 * @ticket 57987
	 *
	 * @dataProvider data_get_string_variations
	 *
	 * @param string $text     Text to count elements in.
	 * @param array  $expected Expected counts, keyed by count type.
	 */
	public function test_wp_word_count_should_use_the_default_word_count_type( $text, $expected ) {
		$this->assertSame( $expected['words'], wp_word_count( $text, 'wrong_type', self::$settings ) );
	}

	/**
	 * A non-array 'shortcodes' setting should be ignored, leaving the shortcode text in the count.
	 *
	 * @ticket 57987
	 */
	public function test_wp_word_count_containing_non_array_shortcode_setting() {
		$text     = 'one [shortcode] two';
		$settings = array(
			'shortcodes' => 'shortcode',
		);

		$this->assertSame( 3, wp_word_count( $text, 'words', $settings ) );
		$this->assertSame( 17, wp_word_count( $text, 'characters_excluding_spaces', $settings ) );
		$this->assertSame( 19, wp_word_count( $text, 'characters_including_spaces', $settings ) );
	}

	/**
	 * An empty 'shortcodes' array should strip nothing, leaving the shortcode text in the count.
	 *
	 * @ticket 57987
	 */
	public function test_wp_word_count_containing_empty_array_shortcode_setting() {
		$text     = 'one [shortcode] two';
		$settings = array(
			'shortcodes' => array(),
		);

		$this->assertSame( 3, wp_word_count( $text, 'words', $settings ) );
		$this->assertSame( 17, wp_word_count( $text, 'characters_excluding_spaces', $settings ) );
		$this->assertSame( 19, wp_word_count( $text, 'characters_including_spaces', $settings ) );
	}

	/**
	 * Data provider.
	 *
	 * @return array[] Data sets of a text and its expected counts keyed by count type.
	 */
	public static function data_get_string_variations() {
		return array(
			'text containing spaces'          => array(
				'text'     => 'one two three',
				'expected' => array(
					'words'                       => 3,
					'characters_excluding_spaces' => 11,
					'characters_including_spaces' => 13,
				),
			),
			'text containing HTML tags'       => array(
				'text'     => 'one <em class="test">two</em><br />three',
				'expected' => array(
					'words'                       => 3,
					'characters_excluding_spaces' => 11,
					'characters_including_spaces' => 12,
				),
			),
			'text containing line breaks'     => array(
				'text'     => "one\ntwo\nthree",
				'expected' => array(
					'words'                       => 3,
					'characters_excluding_spaces' => 11,
					'characters_including_spaces' => 11,
				),
			),
			'text containing encoded spaces'  => array(
				'text'     => 'one&nbsp;two&#160;three',
				'expected' => array(
					'words'                       => 3,
					'characters_excluding_spaces' => 11,
					'characters_including_spaces' => 13,
				),
			),
			'text containing punctuation'     => array(
				'text'     => "It's two three " . json_decode( '"\u2026"' ) . ' 4?',
				'expected' => array(
					'words'                       => 3,
					'characters_excluding_spaces' => 15,
					'characters_including_spaces' => 19,
				),
			),
			'text containing an em dash'      => array(
				'text'     => 'one' . json_decode( '"\u2014"' ) . 'two--three',
				'expected' => array(
					'words'                       => 3,
					'characters_excluding_spaces' => 14,
					'characters_including_spaces' => 14,
				),
			),
			'text containing shortcodes'      => array(
				'text'     => 'one [shortcode attribute="value"]two[/shortcode]three',
				'expected' => array(
					'words'                       => 3,
					'characters_excluding_spaces' => 11,
					'characters_including_spaces' => 12,
				),
			),
			'text containing astrals'         => array(
				'text'     => json_decode( '"\uD83D\uDCA9"' ),
				'expected' => array(
					'words'                       => 1,
					'characters_excluding_spaces' => 1,
					'characters_including_spaces' => 1,
				),
			),
			'text containing an HTML comment' => array(
				'text'     => 'one<!-- comment -->two three',
				'expected' => array(
					'words'                       => 2,
					'characters_excluding_spaces' => 11,
					'characters_including_spaces' => 12,
				),
			),
			'text containing an HTML entity'  => array(
				'text'     => '&gt; test',
				'expected' => array(
					'words'                       => 1,
					'characters_excluding_spaces' => 5,
					'characters_including_spaces' => 6,
				),
			),
			'empty text'                      => array(
				'text'     => '',
				'expected' => array(
					'words'                       => 0,
					'characters_excluding_spaces' => 0,
					'characters_including_spaces' => 0,
				),
			),
			'text containing only whitespace' => array(
				'text'     => "\t\r\n ",
				'expected' => array(
					'words'                       => 0,
					'characters_excluding_spaces' => 0,
					'characters_including_spaces' => 0,
				),
			),
			'text containing a shortcode'     => array(
				'text'     => 'one [shortcode] two',
				'expected' => array(
					'words'                       => 2,
					'characters_excluding_spaces' => 6,
					'characters_including_spaces' => 8,
				),
			),
		);
	}
}
Loading