Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .changeset/gentle-suns-design.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
---
"unicode-segmenter": minor
---

Expose an internal state: `_hd`.

`_hd` is the first code point of a segment, which often needs to be checked against code point bounds.

For example,

```ts
for (const { segment } of graphemeSegments(text)) {
const cp = segment.codePointAt(0)!;
// TypeScript also requires the `!` non-null assertion here.
if (isBMP(cp)) {
// ...
}
}
```

This lookup can be replaced with the `_hd` state, at no additional overhead.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb

| Name | Unicode® | ESM? | Size | Size (min) | Size (min+gzip) | Size (min+br) |
|------------------------------|----------|------|----------:|-----------:|----------------:|--------------:|
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,929 | 12,110 | 5,050 | 3,738 |
| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,997 | 12,130 | 5,061 | 3,751 |
| `graphemer` | 15.0.0 | ✖️ ️| 410,435 | 95,104 | 15,752 | 10,660 |
| `grapheme-splitter` | 10.0.0 | ✖️ | 122,252 | 23,680 | 7,852 | 4,841 |
| `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 603,285 | 369,560 | 72,218 | 49,416 |
Expand All @@ -270,7 +270,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb

| Name | Bytecode size | Bytecode size (gzip)* |
|------------------------------|--------------:|----------------------:|
| `unicode-segmenter/grapheme` | 22,019 | 11,513 |
| `unicode-segmenter/grapheme` | 22,061 | 11,539 |
| `graphemer` | 133,974 | 31,715 |
| `grapheme-splitter` | 63,855 | 19,133 |

Expand Down
9 changes: 8 additions & 1 deletion src/grapheme.js
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import { consonant_ranges } from './_incb_data.js';
* @typedef {import('./_grapheme_data.js').GraphemeCategoryRange} GraphemeCategoryRange
*
* @typedef {object} GraphemeSegmentExtra
* @property {number} _hd The first code point of the segment
* @property {GraphemeCategoryNum} _catBegin Beginning Grapheme_Cluster_Break category of the segment
* @property {GraphemeCategoryNum} _catEnd Ending Grapheme_Cluster_Break category of the segment
*
Expand Down Expand Up @@ -81,7 +82,10 @@ export function* graphemeSegments(input) {
/** InCB=Consonant InCB=Linker x InCB=Consonant */
let incb = false;

let cp = /** @type number */ (input.codePointAt(cursor));
let cp = /** @type {number} */ (input.codePointAt(cursor));

/** Memoize the beginning code point of the segment. */
let _hd = cp;

let index = 0;
let segment = '';
Expand Down Expand Up @@ -117,6 +121,7 @@ export function* graphemeSegments(input) {
segment,
index,
input,
_hd,
_catBegin: /** @type {typeof catBefore} */ (catBegin),
_catEnd: catBefore,
};
Expand Down Expand Up @@ -146,6 +151,7 @@ export function* graphemeSegments(input) {
segment,
index,
input,
_hd,
_catBegin: /** @type {typeof catBefore} */ (catBegin),
_catEnd: catBefore,
};
Expand All @@ -156,6 +162,7 @@ export function* graphemeSegments(input) {
emoji = false;
incb = false;
catBegin = catAfter;
_hd = cp;
}
}
}
Expand Down
32 changes: 16 additions & 16 deletions test/grapheme.js
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ test('graphemeSegments', async t => {
assert.deepEqual(
[...graphemeSegments('abc123')],
[
{ segment: 'a', index: 0, input: 'abc123', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
{ segment: 'b', index: 1, input: 'abc123', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
{ segment: 'c', index: 2, input: 'abc123', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
{ segment: '1', index: 3, input: 'abc123', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
{ segment: '2', index: 4, input: 'abc123', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
{ segment: '3', index: 5, input: 'abc123', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
{ segment: 'a', index: 0, input: 'abc123', _hd: 'a'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
{ segment: 'b', index: 1, input: 'abc123', _hd: 'b'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
{ segment: 'c', index: 2, input: 'abc123', _hd: 'c'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
{ segment: '1', index: 3, input: 'abc123', _hd: '1'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
{ segment: '2', index: 4, input: 'abc123', _hd: '2'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
{ segment: '3', index: 5, input: 'abc123', _hd: '3'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Any },
],
);
});
Expand All @@ -35,10 +35,10 @@ test('graphemeSegments', async t => {
assert.deepEqual(
[...graphemeSegments('a̐éö̲\r\n')],
[
{ segment: 'a̐', index: 0, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
{ segment: 'é', index: 2, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
{ segment: '\r\n', index: 7, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.CR, _catEnd: GraphemeCategory.LF },
{ segment: 'a̐', index: 0, input: 'a̐éö̲\r\n', _hd: 'a̐'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
{ segment: 'é', index: 2, input: 'a̐éö̲\r\n', _hd: 'é'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _hd: 'ö̲'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
{ segment: '\r\n', index: 7, input: 'a̐éö̲\r\n', _hd: '\r\n'.codePointAt(0), _catBegin: GraphemeCategory.CR, _catEnd: GraphemeCategory.LF },
],
);
});
Expand All @@ -47,8 +47,8 @@ test('graphemeSegments', async t => {
assert.deepEqual(
[...graphemeSegments('🇷🇸🇮🇴')],
[
{ segment: '🇷🇸', index: 0, input: '🇷🇸🇮🇴', _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
{ segment: '🇮🇴', index: 4, input: '🇷🇸🇮🇴', _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
{ segment: '🇷🇸', index: 0, input: '🇷🇸🇮🇴', _hd: '🇷🇸'.codePointAt(0), _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
{ segment: '🇮🇴', index: 4, input: '🇷🇸🇮🇴', _hd: '🇮🇴'.codePointAt(0), _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
],
);
});
Expand All @@ -57,8 +57,8 @@ test('graphemeSegments', async t => {
assert.deepEqual(
[...graphemeSegments('🇷🇸🇮')],
[
{ segment: '🇷🇸', index: 0, input: '🇷🇸🇮', _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
{ segment: '🇮', index: 4, input: '🇷🇸🇮', _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
{ segment: '🇷🇸', index: 0, input: '🇷🇸🇮', _hd: '🇷🇸'.codePointAt(0), _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
{ segment: '🇮', index: 4, input: '🇷🇸🇮', _hd: '🇮'.codePointAt(0), _catBegin: GraphemeCategory.Regional_Indicator, _catEnd: GraphemeCategory.Regional_Indicator },
],
);
});
Expand All @@ -67,8 +67,8 @@ test('graphemeSegments', async t => {
assert.deepEqual(
[...graphemeSegments('👻👩‍👩‍👦‍👦')],
[
{ segment: '👻', index: 0, input: '👻👩‍👩‍👦‍👦', _catBegin: GraphemeCategory.Extended_Pictographic, _catEnd: GraphemeCategory.Extended_Pictographic },
{ segment: '👩‍👩‍👦‍👦', index: 2, input: '👻👩‍👩‍👦‍👦', _catBegin: GraphemeCategory.Extended_Pictographic, _catEnd: GraphemeCategory.Extended_Pictographic },
{ segment: '👻', index: 0, input: '👻👩‍👩‍👦‍👦', _hd: '👻'.codePointAt(0), _catBegin: GraphemeCategory.Extended_Pictographic, _catEnd: GraphemeCategory.Extended_Pictographic },
{ segment: '👩‍👩‍👦‍👦', index: 2, input: '👻👩‍👩‍👦‍👦', _hd: '👩‍👩‍👦‍👦'.codePointAt(0), _catBegin: GraphemeCategory.Extended_Pictographic, _catEnd: GraphemeCategory.Extended_Pictographic },
],
);
});
Expand Down
18 changes: 9 additions & 9 deletions test/intl-adapter.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,39 +37,39 @@ test('containing', async _ => {

assert.deepEqual(
segments.containing(0),
{ segment: 'a̐', index: 0, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
{ segment: 'a̐', index: 0, input: 'a̐éö̲\r\n', _hd: 'a̐'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
);
assert.deepEqual(
segments.containing(1),
{ segment: 'a̐', index: 0, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
{ segment: 'a̐', index: 0, input: 'a̐éö̲\r\n', _hd: 'a̐'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
);
assert.deepEqual(
segments.containing(2),
{ segment: 'é', index: 2, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
{ segment: 'é', index: 2, input: 'a̐éö̲\r\n', _hd: 'é'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
);
assert.deepEqual(
segments.containing(3),
{ segment: 'é', index: 2, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
{ segment: 'é', index: 2, input: 'a̐éö̲\r\n', _hd: 'é'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
);
assert.deepEqual(
segments.containing(4),
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _hd: 'ö̲'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
);
assert.deepEqual(
segments.containing(5),
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _hd: 'ö̲'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
);
assert.deepEqual(
segments.containing(6),
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
{ segment: 'ö̲', index: 4, input: 'a̐éö̲\r\n', _hd: 'ö̲'.codePointAt(0), _catBegin: GraphemeCategory.Any, _catEnd: GraphemeCategory.Extend },
);
assert.deepEqual(
segments.containing(7),
{ segment: '\r\n', index: 7, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.CR, _catEnd: GraphemeCategory.LF },
{ segment: '\r\n', index: 7, input: 'a̐éö̲\r\n', _hd: '\r\n'.codePointAt(0), _catBegin: GraphemeCategory.CR, _catEnd: GraphemeCategory.LF },
);
assert.deepEqual(
segments.containing(8),
{ segment: '\r\n', index: 7, input: 'a̐éö̲\r\n', _catBegin: GraphemeCategory.CR, _catEnd: GraphemeCategory.LF },
{ segment: '\r\n', index: 7, input: 'a̐éö̲\r\n', _hd: '\r\n'.codePointAt(0), _catBegin: GraphemeCategory.CR, _catEnd: GraphemeCategory.LF },
);
assert.equal(segments.containing(9), undefined);
});
Expand Down