Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/many-years-deny.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"unicode-segmenter": patch
---

Add `splitGraphemes()` utility
19 changes: 16 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,19 @@ import { graphemeSegments } from 'unicode-segmenter/grapheme';
// 3: { segment: '\r\n', index: 7, input: 'a̐éö̲\r\n' }
```

#### Example: Split graphemes

```js
import { splitGraphemes } from 'unicode-segmenter/grapheme';

[...splitGraphemes('#️⃣*️⃣0️⃣1️⃣2️⃣')];
// 0: #️⃣
// 1: *️⃣
// 2: 0️⃣
// 3: 1️⃣
// 4: 2️⃣
```

#### Example: Count graphemes

```js
Expand All @@ -77,7 +90,7 @@ countGrapheme('a̐éö̲');
> [!NOTE]
> `countGrapheme()` is a small wrapper around `graphemeSegments()`.
>
> If you call it more than once, use `graphemeSegments()` once instead, Or memoize it yourself.
> If you need it more than once at a time, consider memoizing the result, or call `graphemeSegments()` or `splitGraphemes()` once instead.

#### Example: Build an advanced grapheme matcher

Expand Down Expand Up @@ -238,7 +251,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb

| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) |
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 17,125 | 12,720 | 5,256 | 3,913 |
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 17,347 | 12,822 | 5,307 | 4,093 |
| `graphemer` | 15.0.0 | ✖️ ️| 410,435 | 95,104 | 15,752 | 10,660 |
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,252 | 23,680 | 7,852 | 4,841 |
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 491,043 | 318,721 | 54,248 | 34,380 |
Expand All @@ -254,7 +267,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb

| Name | Bytecode size | Bytecode size (gzip)* |
|------------------------------|--------------:|----------------------:|
| `unicode-segmenter/grapheme` | 23,992 | 12,533 |
| `unicode-segmenter/grapheme` | 24,521 | 12,773 |
| `graphemer` | 133,949 | 31,710 |
| `grapheme-splitter` | 63,810 | 19,125 |
| `@formatjs/intl-segmenter`* | 315,865 | 99,063 |
Expand Down
6 changes: 5 additions & 1 deletion benchmark/grapheme/bundle-entries/unicode-segmenter.js
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
export { graphemeSegments } from '../../../src/grapheme.js';
export {
graphemeSegments,
countGrapheme,
splitGraphemes,
} from '../../../src/grapheme.js';
8 changes: 8 additions & 0 deletions src/grapheme.js
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,14 @@ export function countGrapheme(str) {
return count;
}

/**
 * Iterate over the grapheme segments of a string as plain strings.
 *
 * Generator wrapper around `graphemeSegments()`: for every item that
 * iterator produces, only its `segment` value is yielded.
 *
 * @param {string} str
 * @return {IterableIterator<string>}
 */
export function* splitGraphemes(str) {
  for (const { segment } of graphemeSegments(str)) {
    yield segment;
  }
}

/**
* `Grapheme_Cluster_Break` property value of a given codepoint
*
Expand Down
66 changes: 61 additions & 5 deletions test/grapheme.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@ import { test } from 'node:test';
import * as assert from 'node:assert/strict';
import fc from 'fast-check';

import { graphemeSegments, countGrapheme, GraphemeCategory } from 'unicode-segmenter/grapheme';
import {
GraphemeCategory,
graphemeSegments,
countGrapheme,
splitGraphemes,
} from 'unicode-segmenter/grapheme';
import { assertObjectContaining } from './_helper.js';

test('graphemeSegments', async t => {
Expand Down Expand Up @@ -74,10 +79,6 @@ test('countGrapheme', async t => {
assert.equal(countGrapheme('abcd'), 4);
});

await t.test('latin', () => {
assert.equal(countGrapheme('abcd'), 4);
});

await t.test('flags', () => {
assert.equal(countGrapheme('🇷🇸🇮🇴'), 2);
});
Expand All @@ -104,6 +105,61 @@ test('countGrapheme', async t => {
});
});

// Suite for `splitGraphemes()`: each case spreads the returned iterator into
// an array and compares it with the expected list of segments.
// The suite title matches the exported function name so that test reports and
// name-pattern filters line up with the API.
test('splitGraphemes', async t => {
  await t.test('latin', () => {
    assert.deepEqual(
      [...splitGraphemes('abcd')],
      ['a', 'b', 'c', 'd'],
    );
  });

  // Each flag is expected to come out as a single segment.
  await t.test('flags', () => {
    assert.deepEqual(
      [...splitGraphemes('🇷🇸🇮🇴')],
      ['🇷🇸', '🇮🇴'],
    );
  });

  // Multi-person and flag emoji sequences are expected to stay whole.
  await t.test('emoji', () => {
    assert.deepEqual(
      [...splitGraphemes('👻👩‍👩‍👦‍👦')],
      ['👻', '👩‍👩‍👦‍👦'],
    );
    assert.deepEqual(
      [...splitGraphemes('🌷🎁💩😜👍🏳️‍🌈')],
      ['🌷', '🎁', '💩', '😜', '👍', '🏳️‍🌈'],
    );
  });

  // Combining marks are expected to stay attached to their base letter.
  await t.test('diacritics as combining marks', () => {
    assert.deepEqual(
      [...splitGraphemes('Ĺo͂řȩm̅')],
      ['Ĺ', 'o͂', 'ř', 'ȩ', 'm̅'],
    );
  });

  await t.test('Jamo', () => {
    assert.deepEqual(
      [...splitGraphemes('가갉')],
      ['가', '갉'],
    );
  });

  await t.test('Hindi', () => {
    assert.deepEqual(
      [...splitGraphemes('अनुच्छेद')],
      ['अ', 'नु', 'च्छे', 'द'],
    );
  });

  // Heavily stacked combining marks: one segment per base character.
  await t.test('demonic', () => {
    assert.deepEqual(
      [...splitGraphemes('Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍A̴̵̜̰͔ͫ͗͢L̠ͨͧͩ͘G̴̻͈͍͔̹̑͗̎̅͛́Ǫ̵̹̻̝̳͂̌̌͘!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞')],
      ['Z͑ͫ̓ͪ̂ͫ̽͏̴̙̤̞͉͚̯̞̠͍', 'A̴̵̜̰͔ͫ͗͢', 'L̠ͨͧͩ͘', 'G̴̻͈͍͔̹̑͗̎̅͛́', 'Ǫ̵̹̻̝̳͂̌̌͘', '!͖̬̰̙̗̿̋ͥͥ̂ͣ̐́́͜͞'],
    );
  });
});

test('spec compliant', async t => {
fc.configureGlobal({
// Fix seed here for stable coverage report
Expand Down