Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import { graphemeCount } from './grapheme-count.mjs'

describe('graphemeCount', () => {
  /**
   * Register a single test that checks the grapheme count of every sample
   *
   * @param {string} title - Name of the test
   * @param {[string, number][]} samples - Pairs of input text and expected count
   */
  function itCounts(title, samples) {
    it(title, () => {
      for (const [input, expected] of samples) {
        expect(graphemeCount(input)).toBe(expected)
      }
    })
  }

  itCounts('counts simple strings', [
    ['Hello', 5],
    ['World', 5],
    ['NHS', 3]
  ])

  itCounts('handles empty strings', [['', 0]])

  itCounts('counts whitespace', [
    [' ', 1],
    ['Hello World', 11],
    ['\n\t', 2]
  ])

  itCounts('counts accented characters', [
    ['café', 4],
    ['naïve', 5],
    ['résumé', 6]
  ])

  itCounts('counts non-Latin scripts', [
    ['你好', 2],
    ['こんにちは', 5],
    ['안녕하세요', 5],
    ['Привет', 6]
  ])

  // Includes a flag, which is a pair of regional indicator code points
  itCounts('counts emoji', [
    ['👋', 1],
    ['😀', 1],
    ['🇬🇧', 1]
  ])

  itCounts('counts emoji with skin tones', [
    ['👋🏼', 1],
    ['👍🏿', 1]
  ])

  // Emoji joined with zero width joiners render as one glyph
  itCounts('counts emoji sequences', [
    ['👨‍👩‍👧‍👦', 1],
    ['👨‍💻', 1]
  ])

  itCounts('counts multiple emoji', [
    ['👋 Hello 👋🏼', 9],
    ['😀😃😄', 3]
  ])

  itCounts('handles mixed content', [
    ['Hello 👋 World', 13],
    ['café 👋🏼', 6]
  ])

  // U+1F3B5 lies outside the BMP, so it is stored as two UTF-16 code units
  itCounts('handles surrogate pairs', [['\u{1F3B5}', 1]])

  it('throws for invalid input', () => {
    /**
     * Wrap a call with the given value so `expect()` can observe the throw
     *
     * @param {unknown} value
     */
    const callWith = (value) => () =>
      // @ts-expect-error invalid type
      graphemeCount(value)

    for (const invalid of [null, undefined, 123, {}, []]) {
      expect(callWith(invalid)).toThrow(TypeError)
    }
  })

  itCounts('works with real examples', [
    ['NHS 👨‍⚕️', 5],
    ['Call 111 🏥', 10],
    ['Please describe your symptoms', 29],
    ['Feeling unwell 😷', 16],
    ['José García', 11]
  ])
})
45 changes: 45 additions & 0 deletions packages/nhsuk-frontend/src/nhsuk/common/grapheme-count.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/**
 * Count the Unicode code points in a string
 *
 * Unlike `String.prototype.length`, which counts UTF-16 code units, a
 * character stored as a surrogate pair (such as most emoji) is counted once.
 * An unpaired surrogate is counted as one code point.
 *
 * @param {string} text - The text to count the code points of
 * @returns {number} The number of code points in the text
 * @throws {TypeError} When `text` is not a string
 */
export function codePointCount(text) {
  if (typeof text !== 'string') {
    throw new TypeError('codePointCount expects a string argument')
  }

  // The string iterator walks the text one code point at a time, so no
  // manual surrogate pair handling is needed
  return [...text].length
}

/**
 * Grapheme segmenter shared between calls. It is created on first use so
 * that counting on every input event does not construct a new
 * `Intl.Segmenter` each time.
 *
 * @type {Intl.Segmenter | undefined}
 */
let graphemeSegmenter

/**
 * Count the user-perceived characters (grapheme clusters) in a string
 *
 * Uses `Intl.Segmenter` where the environment provides it, so that emoji
 * sequences and base characters followed by combining marks count once.
 * Falls back to counting code points when `Intl.Segmenter` is missing or
 * cannot be used.
 *
 * @param {string} text - The text to count the grapheme clusters of
 * @returns {number} The number of grapheme clusters (or code points, when
 *   falling back) in the text
 * @throws {TypeError} When `text` is not a string
 */
export function graphemeCount(text) {
  if (typeof text !== 'string') {
    throw new TypeError('graphemeCount expects a string argument')
  }

  if ('Segmenter' in Intl) {
    try {
      if (!graphemeSegmenter) {
        graphemeSegmenter = new Intl.Segmenter('en', {
          granularity: 'grapheme'
        })
      }

      return [...graphemeSegmenter.segment(text)].length
    } catch {
      // Deliberately best-effort: if segmentation fails for any reason,
      // fall through to code point counting rather than break the caller
    }
  }

  return codePointCount(text)
}
1 change: 1 addition & 0 deletions packages/nhsuk-frontend/src/nhsuk/common/index.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ export function formatErrorMessage(Component, message) {
}

export * from './closest-attribute-value.mjs'
export * from './grapheme-count.mjs'
export * from './nhsuk-frontend-version.mjs'

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,84 @@ See the [main README quick start guide](https://github.com/nhsuk/nhsuk-frontend#
## Guidance and examples

To learn more about the character count component and when to use it, visit the [design system in the NHS digital service manual](https://service-manual.nhs.uk/design-system/components/character-count) for guidance, examples and options.

## How characters are counted

By default, the character count component uses **code point counting**, which matches Python's `len()` function for Unicode strings. This ensures consistency between client-side (JavaScript) and server-side (Python) validation in `nhsuk-frontend-jinja`, preventing mismatched error messages.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not everyone is using python and jinja, so I don't know if this paragraph should be geared at a broader audience than teams using python.

nhsuk-frontend-jinja is a port of the nunjucks components, so it's not actually the thing providing server-side validation, but wherever the template relies on the len filter, it will be using the code point counting, so it's good that its consistent.

This is a great contribution btw 👍🏻

Copy link
Copy Markdown
Contributor

@colinrotherham colinrotherham Feb 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you both think about a customisable count function?

It might need to support promises (should a server-side count be needed)

Similar to a config option, you could pass in pre-exported count functions that we publish. Or alternatively use your own if necessary

Copy link
Copy Markdown
Contributor

@MatMoore MatMoore Feb 5, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the idea in theory. I think calling the server for the count is a bit excessive though. Just being able to specify a count function on the client side to match what the server does seems like it would be flexible enough.

Counting bytes seems like the main use case other than the options we've got now (graphemes, codepoints, words). The byte count would depend on the encoding the server/database uses, so if we export a function for this it probably shouldn't assume utf-8.

I'm sure there are plenty of cases where we're constrained by some legacy backend thing, but I'm not sure the ideal UX in that case. With graphemes - and to a lesser extent, the existing codepoint counting - the display will track with what the user is entering, but it will lie about whether they are over the limit if they enter any multi-byte characters. But if they customise the count function to use byte counting, the count will jump by more than 1 when they type multi-byte characters, which might confuse users as well. 🤷🏻

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if we can partially mitigate mis-matches by:

  • recommending that you set quite a high limit where possible
  • recommending that teams use a threshold, so that most users don’t see the character count at all
  • even if both client side and server-side are counting characters, you could allow an extra 5 or so characters server-side vs client-side, just in case there’s a discrepancy in the counting functions...


You can optionally enable **grapheme cluster counting** (user-perceived characters) by setting `useGraphemeCounting: true` in the component configuration. This provides more accurate counting for:

- **Emoji and emoji sequences**: Emoji like 👋, 👋🏼 (with skin tone), and 👨‍👩‍👧‍👦 (family emoji) are each counted as a single character
- **Characters with combining marks**: Accented characters like é, ñ, and ü are counted correctly regardless of whether they're stored as a single code point or as a base character plus combining mark
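- **Complex scripts**: Scripts that build one visible character from several code points (for example Devanagari, Thai, or Arabic written with vowel marks) are counted as the user perceives them. Characters that are already a single code point, such as Chinese, Japanese and precomposed Korean characters, are counted the same way by both methods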

**Important**: Only enable grapheme counting if your server-side validation also uses grapheme counting. Otherwise, you may see different counts between client and server validation messages.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be helpful to signpost the way to do this in javascript and the various other languages we use on the backend? (In python I think you probably need a 3rd party library like https://grapheme.readthedocs.io/en/latest/grapheme.html)


### Examples

**Default behavior (code point counting - matches Python `len()`):**

```javascript
// Simple ASCII
"Hello" // 5 characters

// Emoji (counted as code points)
"👋" // 1 character
"👋🏼" // 2 characters (base emoji + skin tone modifier)
"👨‍👩‍👧‍👦" // 7 characters (multiple code points)

// Accented characters
"café" // 4 characters
"naïve" // 5 characters

// Mixed content
"Hello 👋" // 7 characters (5 letters + 1 space + 1 emoji)
```

**With grapheme counting enabled (`useGraphemeCounting: true`):**

```javascript
// Emoji (counted as grapheme clusters)
"👋" // 1 character
"👋🏼" // 1 character (emoji with skin tone modifier)
"👨‍👩‍👧‍👦" // 1 character (family emoji sequence)
```

### Configuration

#### Default behavior (code point counting)

By default, the component uses code point counting to match Python's `len()` behavior:

```javascript
new CharacterCount($root, {
maxlength: 200
})
```

This ensures server-side consistency with `nhsuk-frontend-jinja` validation.

#### Enabling grapheme cluster counting

To use grapheme cluster counting (only if your server also uses it):

```javascript
new CharacterCount($root, {
maxlength: 200,
useGraphemeCounting: true
})
```

Or via data attribute:

```html
<div data-module="nhsuk-character-count"
data-maxlength="200"
data-use-grapheme-counting="true">
```

### Browser support

The default code point counting works across all supported browsers and matches Python's `len()` behavior, ensuring consistency with server-side validation.

When `useGraphemeCounting` is enabled, the component uses `Intl.Segmenter` when available (Baseline 2024: Chrome 87+, Edge 87+, Safari 14.1+, Firefox 125+) for accurate grapheme cluster counting. For browsers that support ES6 modules but not `Intl.Segmenter` (such as Safari 11–14.0, Firefox 60–124, and older Chrome/Edge versions), it automatically falls back to code point counting.
Original file line number Diff line number Diff line change
Expand Up @@ -279,4 +279,96 @@ describe('Character count: Format count message', () => {
'You have 10,000 words too many'
)
})

describe('Unicode and grapheme cluster counting', () => {
  let component

  /**
   * Type the given text into the component's textarea
   *
   * @param {string} value - Text to place in the textarea
   * @returns {HTMLTextAreaElement} The textarea that now holds the text
   */
  function enterText(value) {
    const $textarea = /** @type {HTMLTextAreaElement} */ (
      document.querySelector('.nhsuk-js-character-count')
    )

    $textarea.value = value
    return $textarea
  }

  /**
   * Ask the component to refresh its message, then look up the visible status
   */
  function refreshStatus() {
    component.updateCountMessage()
    return document.querySelector('.nhsuk-character-count__status')
  }

  beforeEach(() => {
    const example = examples['to configure in JavaScript']

    document.body.outerHTML = outdent`
      <body class="nhsuk-frontend-supported">
        ${components.render('character-count', example)}
      </body>
    `

    const selector = `[data-module="${CharacterCount.moduleName}"]`

    // Limit of 10 with grapheme counting switched on for every test below
    component = new CharacterCount(document.querySelector(selector), {
      maxlength: 10,
      useGraphemeCounting: true
    })
  })

  /**
   * Test name, textarea content and the count the component should report
   *
   * @type {[string, string, number][]}
   */
  const countCases = [
    ['counts emoji correctly', '👋👋👋', 3],
    ['counts emoji with skin tones', '👋🏼👋🏿', 2],
    ['counts emoji sequences', '👨‍👩‍👧‍👦', 1],
    ['counts accented characters', 'café', 4],
    ['counts mixed text and emoji', 'Hi 👋', 4],
    ['counts non-Latin scripts', '你好', 2]
  ]

  for (const [title, content, expected] of countCases) {
    it(title, () => {
      const $textarea = enterText(content)
      expect(component.count($textarea.value)).toBe(expected)
    })
  }

  it('updates message with emoji', () => {
    // Exactly at the limit: ten emoji against a maxlength of 10
    enterText('👋👋👋👋👋👋👋👋👋👋')

    const $status = refreshStatus()
    expect($status).toHaveTextContent('You have 0 characters remaining')
  })

  it('shows error when over limit with emoji', () => {
    // One over the limit: eleven emoji against a maxlength of 10
    enterText('👋👋👋👋👋👋👋👋👋👋👋')

    const $status = refreshStatus()
    expect($status).toHaveTextContent('You have 1 character too many')
    expect($status).toHaveClass('nhsuk-error-message')
  })
})
})
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {
normaliseOptions,
validateConfig
} from '../../common/configuration/index.mjs'
import { codePointCount, graphemeCount } from '../../common/grapheme-count.mjs'
import { formatErrorMessage } from '../../common/index.mjs'
import { ConfigurableComponent } from '../../configurable-component.mjs'
import { ConfigError, ElementError } from '../../errors/index.mjs'
Expand Down Expand Up @@ -175,19 +176,20 @@ export class CharacterCount extends ConfigurableComponent {
}

/**
* Count the number of characters (or words, if `config.maxwords` is set)
* in the given text
*
* @param {string} text - The text to count the characters of
* @returns {number} the number of characters (or words) in the text
* @param {string} text
* @returns {number}
*/
count(text) {
if (this.config.maxwords) {
const tokens = text.match(/\S+/g) ?? [] // Matches consecutive non-whitespace chars
const tokens = text.match(/\S+/g) ?? []
return tokens.length
}

return text.length
if (this.config.useGraphemeCounting) {
return graphemeCount(text)
}

return codePointCount(text)
}

/**
Expand Down Expand Up @@ -390,6 +392,7 @@ export class CharacterCount extends ConfigurableComponent {
textareaDescriptionClass: 'nhsuk-character-count__message',
visibleCountMessageClass: 'nhsuk-character-count__status',
screenReaderCountMessageClass: 'nhsuk-character-count__sr-status',
useGraphemeCounting: false,
i18n: {
// Characters
charactersUnderLimit: {
Expand Down Expand Up @@ -431,6 +434,7 @@ export class CharacterCount extends ConfigurableComponent {
textareaDescriptionClass: { type: 'string' },
visibleCountMessageClass: { type: 'string' },
screenReaderCountMessageClass: { type: 'string' },
useGraphemeCounting: { type: 'boolean' },
i18n: { type: 'object' }
},
anyOf: [
Expand Down Expand Up @@ -479,6 +483,10 @@ export function initCharacterCounts(options) {
* @property {string} textareaDescriptionClass - Textarea description class
* @property {string} visibleCountMessageClass - Visible count message class
* @property {string} screenReaderCountMessageClass - Screen reader count message class
* @property {boolean} [useGraphemeCounting=false] - If true, uses grapheme cluster
* counting (user-perceived characters) instead of code point counting. Defaults
* to false to ensure consistency with Python's `len()` and server-side validation.
* Only enable if your server-side validation also uses grapheme counting.
* @property {CharacterCountTranslations} [i18n=CharacterCount.defaults.i18n] - Character count translations
*/

Expand Down