-
Notifications
You must be signed in to change notification settings - Fork 121
Fix #1619: Implement grapheme cluster counting for character count co… #1765
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
moshaid
wants to merge
7
commits into
nhsuk:main
Choose a base branch
from
moshaid:fix/1619-grapheme-counting
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 6 commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
27cf1c1
Fix #1619: Implement grapheme cluster counting for character count co…
45afec1
Fixed test errors
956899e
Merge branch 'nhsuk:main' into fix/1619-grapheme-counting
moshaid 8575123
-Added fallback to support gap and server consistency
2bcca4b
tests updated and passing
cd27a4d
prettier test checked --passed
c9871b6
Merge main into fix/1619-grapheme-counting
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
82 changes: 82 additions & 0 deletions
82
packages/nhsuk-frontend/src/nhsuk/common/grapheme-count.jsdom.test.mjs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,82 @@ | ||
| import { graphemeCount } from './grapheme-count.mjs' | ||
|
|
||
describe('graphemeCount', () => {
  // Invisible joiners, selectors and modifiers are written as escapes so the
  // fixtures cannot be silently altered by editors, copy/paste or formatters
  // that strip or normalise hidden Unicode characters.
  const ZWJ = '\u200D' // zero width joiner
  const VS16 = '\uFE0F' // emoji presentation selector
  const WAVE = '\u{1F44B}' // 👋
  const THUMBS_UP = '\u{1F44D}' // 👍
  const SKIN_MEDIUM_LIGHT = '\u{1F3FC}' // 🏼
  const SKIN_DARK = '\u{1F3FF}' // 🏿
  const MAN = '\u{1F468}' // 👨
  const WOMAN = '\u{1F469}' // 👩
  const GIRL = '\u{1F467}' // 👧
  const BOY = '\u{1F466}' // 👦
  const LAPTOP = '\u{1F4BB}' // 💻
  const MEDICAL_SYMBOL = '\u2695' // ⚕
  const FLAG_GB = '\u{1F1EC}\u{1F1E7}' // 🇬🇧 (two regional indicators)

  it('counts simple strings', () => {
    expect(graphemeCount('Hello')).toBe(5)
    expect(graphemeCount('World')).toBe(5)
    expect(graphemeCount('NHS')).toBe(3)
  })

  it('handles empty strings', () => {
    expect(graphemeCount('')).toBe(0)
  })

  it('counts whitespace', () => {
    expect(graphemeCount(' ')).toBe(1)
    expect(graphemeCount('Hello World')).toBe(11)
    expect(graphemeCount('\n\t')).toBe(2)
  })

  it('counts accented characters', () => {
    expect(graphemeCount('café')).toBe(4)
    expect(graphemeCount('naïve')).toBe(5)
    expect(graphemeCount('résumé')).toBe(6)

    // Decomposed form: base letter followed by a combining acute accent
    expect(graphemeCount('cafe\u0301')).toBe(4)
  })

  it('counts non-Latin scripts', () => {
    expect(graphemeCount('你好')).toBe(2)
    expect(graphemeCount('こんにちは')).toBe(5)
    expect(graphemeCount('안녕하세요')).toBe(5)
    expect(graphemeCount('Привет')).toBe(6)
  })

  it('counts emoji', () => {
    expect(graphemeCount(WAVE)).toBe(1)
    expect(graphemeCount('\u{1F600}')).toBe(1) // 😀
    expect(graphemeCount(FLAG_GB)).toBe(1)
  })

  it('counts emoji with skin tones', () => {
    expect(graphemeCount(`${WAVE}${SKIN_MEDIUM_LIGHT}`)).toBe(1)
    expect(graphemeCount(`${THUMBS_UP}${SKIN_DARK}`)).toBe(1)
  })

  it('counts emoji sequences', () => {
    // Family: man + ZWJ + woman + ZWJ + girl + ZWJ + boy
    expect(graphemeCount([MAN, WOMAN, GIRL, BOY].join(ZWJ))).toBe(1)

    // Technologist: man + ZWJ + laptop
    expect(graphemeCount(`${MAN}${ZWJ}${LAPTOP}`)).toBe(1)
  })

  it('counts multiple emoji', () => {
    expect(graphemeCount(`${WAVE} Hello ${WAVE}${SKIN_MEDIUM_LIGHT}`)).toBe(9)
    expect(graphemeCount('\u{1F600}\u{1F603}\u{1F604}')).toBe(3) // 😀😃😄
  })

  it('handles mixed content', () => {
    expect(graphemeCount(`Hello ${WAVE} World`)).toBe(13)
    expect(graphemeCount(`café ${WAVE}${SKIN_MEDIUM_LIGHT}`)).toBe(6)
  })

  it('handles surrogate pairs', () => {
    const note = '\u{1F3B5}'
    expect(graphemeCount(note)).toBe(1)
  })

  it('throws for invalid input', () => {
    expect(() => graphemeCount(null)).toThrow(TypeError)
    expect(() => graphemeCount(undefined)).toThrow(TypeError)
    // @ts-expect-error invalid type
    expect(() => graphemeCount(123)).toThrow(TypeError)
    // @ts-expect-error invalid type
    expect(() => graphemeCount({})).toThrow(TypeError)
    // @ts-expect-error invalid type
    expect(() => graphemeCount([])).toThrow(TypeError)
  })

  it('works with real examples', () => {
    // Health worker: man + ZWJ + medical symbol + emoji presentation selector
    expect(graphemeCount(`NHS ${MAN}${ZWJ}${MEDICAL_SYMBOL}${VS16}`)).toBe(5)
    expect(graphemeCount('Call 111 \u{1F3E5}')).toBe(10) // 🏥
    expect(graphemeCount('Please describe your symptoms')).toBe(29)
    expect(graphemeCount('Feeling unwell \u{1F637}')).toBe(16) // 😷
    expect(graphemeCount('José García')).toBe(11)
  })
})
45 changes: 45 additions & 0 deletions
45
packages/nhsuk-frontend/src/nhsuk/common/grapheme-count.mjs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,45 @@ | ||
/**
 * Counts the Unicode code points in a string.
 *
 * A surrogate pair (any character above U+FFFF, such as most emoji) counts as
 * one, whereas `text.length` would count its two UTF-16 code units. An
 * unpaired surrogate counts as one.
 *
 * @param {string} text - String to measure
 * @returns {number} Number of code points in `text`
 * @throws {TypeError} When `text` is not a string
 */
export function codePointCount(text) {
  if (typeof text !== 'string') {
    throw new TypeError('codePointCount expects a string argument')
  }

  // String iteration yields one item per code point, so spreading replaces
  // the manual `codePointAt` stepping (and its unreachable `undefined` branch)
  return [...text].length
}
|
|
||
/**
 * Counts the user-perceived characters (grapheme clusters) in a string.
 *
 * Uses `Intl.Segmenter` with grapheme granularity when the runtime provides
 * it. When it is missing, or segmenting throws, the result falls back to
 * `codePointCount(text)`, in which case sequences built from several code
 * points (for example emoji with modifiers) count as more than one.
 *
 * @param {string} text - String to measure
 * @returns {number} Number of grapheme clusters, or code points on fallback
 * @throws {TypeError} When `text` is not a string
 */
export function graphemeCount(text) {
  if (typeof text !== 'string') {
    throw new TypeError('graphemeCount expects a string argument')
  }

  // `typeof` keeps the feature check from throwing a ReferenceError in
  // runtimes that do not define the `Intl` global at all
  if (typeof Intl !== 'undefined' && 'Segmenter' in Intl) {
    try {
      const segmenter = new Intl.Segmenter('en', { granularity: 'grapheme' })
      const segments = segmenter.segment(text)[Symbol.iterator]()

      // Step through the iterator instead of spreading every segment object
      // into an array only to read its length
      let count = 0
      while (!segments.next().done) {
        count++
      }

      return count
    } catch (error) {
      // Deliberate best effort: a failing segmenter must not break the
      // counter, so the error is discarded and the fallback below is used
      void error
    }
  }

  return codePointCount(text)
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,3 +7,84 @@ See the [main README quick start guide](https://github.com/nhsuk/nhsuk-frontend# | |
| ## Guidance and examples | ||
|
|
||
| To learn more about the character count component and when to use it, visit the [design system in the NHS digital service manual](https://service-manual.nhs.uk/design-system/components/character-count) for guidance, examples and options. | ||
|
|
||
| ## How characters are counted | ||
|
|
||
| By default, the character count component uses **code point counting**, which matches Python's `len()` function for Unicode strings. This ensures consistency between client-side (JavaScript) and server-side (Python) validation in `nhsuk-frontend-jinja`, preventing mismatched error messages. | ||
|
|
||
| You can optionally enable **grapheme cluster counting** (user-perceived characters) by setting `useGraphemeCounting: true` in the component configuration. This provides more accurate counting for: | ||
|
|
||
| - **Emoji and emoji sequences**: Emoji like 👋, 👋🏼 (with skin tone), and 👨👩👧👦 (family emoji) are each counted as a single character | ||
| - **Characters with combining marks**: Accented characters like é, ñ, and ü are counted correctly regardless of whether they're stored as a single code point or as a base character plus combining mark | ||
| - **Complex scripts**: Non-Latin scripts (Chinese, Japanese, Korean, Arabic, etc.) are counted accurately | ||
|
|
||
| **Important**: Only enable grapheme counting if your server-side validation also uses grapheme counting. Otherwise, you may see different counts between client and server validation messages. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Would it be helpful to signpost the way to do this in javascript and the various other languages we use on the backend? (In python I think you probably need a 3rd party library like https://grapheme.readthedocs.io/en/latest/grapheme.html) |
||
|
|
||
| ### Examples | ||
|
|
||
| **Default behavior (code point counting - matches Python `len()`):** | ||
|
|
||
| ```javascript | ||
| // Simple ASCII | ||
| "Hello" // 5 characters | ||
|
|
||
| // Emoji (counted as code points) | ||
| "👋" // 1 character | ||
| "👋🏼" // 2 characters (base emoji + skin tone modifier) | ||
| "👨👩👧👦" // 7 characters (multiple code points) | ||
|
|
||
| // Accented characters | ||
| "café" // 4 characters | ||
| "naïve" // 5 characters | ||
|
|
||
| // Mixed content | ||
| "Hello 👋" // 7 characters (5 letters + 1 space + 1 emoji) | ||
| ``` | ||
|
|
||
| **With grapheme counting enabled (`useGraphemeCounting: true`):** | ||
|
|
||
| ```javascript | ||
| // Emoji (counted as grapheme clusters) | ||
| "👋" // 1 character | ||
| "👋🏼" // 1 character (emoji with skin tone modifier) | ||
| "👨👩👧👦" // 1 character (family emoji sequence) | ||
| ``` | ||
|
|
||
| ### Configuration | ||
|
|
||
| #### Default behavior (code point counting) | ||
|
|
||
| By default, the component uses code point counting to match Python's `len()` behavior: | ||
|
|
||
| ```javascript | ||
| new CharacterCount($root, { | ||
| maxlength: 200 | ||
| }) | ||
| ``` | ||
|
|
||
| This ensures server-side consistency with `nhsuk-frontend-jinja` validation. | ||
|
|
||
| #### Enabling grapheme cluster counting | ||
|
|
||
| To use grapheme cluster counting (only if your server also uses it): | ||
|
|
||
| ```javascript | ||
| new CharacterCount($root, { | ||
| maxlength: 200, | ||
| useGraphemeCounting: true | ||
| }) | ||
| ``` | ||
|
|
||
| Or via data attribute: | ||
|
|
||
| ```html | ||
| <div data-module="nhsuk-character-count" | ||
| data-maxlength="200" | ||
| data-use-grapheme-counting="true"> | ||
| ``` | ||
|
|
||
| ### Browser support | ||
|
|
||
| The default code point counting works across all supported browsers and matches Python's `len()` behavior, ensuring consistency with server-side validation. | ||
|
|
||
| When `useGraphemeCounting` is enabled, the component uses `Intl.Segmenter` when available (Baseline 2024 browsers: Chrome 87+, Edge 87+, Safari 14.1+, Firefox 125+) for accurate grapheme cluster counting. For browsers that support ES6 modules but not `Intl.Segmenter` (such as Safari 11–14.0, Firefox 60–124, and older Chrome/Edge versions), it automatically falls back to code point counting. | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not everyone is using python and jinja, so I don't know if this paragraph should be geared at a broader audience than teams using python.
nhsuk-frontend-jinja is a port of the nunjucks components, so it's not actually the thing providing server-side validation, but wherever the template relies on the
`len` filter, it will be using the code point counting, so it's good that it's consistent. This is a great contribution btw 👍🏻
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What do you both think about a customisable count function?
It might need to support promises (should a server-side count be needed)
Similar to a config option, you could pass in pre-exported count functions that we publish. Or alternatively use your own if necessary
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I like the idea in theory. I think calling the server for the count is a bit excessive though. Just being able to specify a count function on the client side to match what the server does seems like it would be flexible enough.
Counting bytes seems like the main use case other than the options we've got now (graphemes, codepoints, words). The byte count would depend on the encoding the server/database uses, so if we export a function for this it probably shouldn't assume utf-8.
I'm sure there are plenty of cases where we're constrained by some legacy backend thing, but I'm not sure the ideal UX in that case. With graphemes - and to a lesser extent, the existing codepoint counting - the display will track with what the user is entering, but it will lie about whether they are over the limit if they enter any multi-byte characters. But if they customise the count function to use byte counting, the count will jump by more than 1 when they type multi-byte characters, which might confuse users as well. 🤷🏻
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if we can partially mitigate mis-matches by:
threshold, so that most users don’t see the character count at all