Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions AGENT.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# unicode-segmenter

A TypeScript/JavaScript library for Unicode text segmentation (UAX #29).

## Commands

- Build: `yarn build` (also runs `yarn clean && tsc -p tsconfig.build.json`)
- Test: `yarn test` (Node.js test runner with `node --test`)
- Test with coverage: `yarn test:coverage`
- Run single test file: `node --test test/{filename}.js`
- Clean: `yarn clean` (removes built artifacts)
- Performance benchmarks: `yarn perf:grapheme`, `yarn perf:emoji`, `yarn perf:general`

## Architecture

Unicode text segmentation library with multiple entry points:
- `src/index.js`: Main export aggregating all functionality
- `src/grapheme.js`: Extended grapheme cluster segmentation (primary feature)
- `src/emoji.js`: Single codepoint emoji matching
- `src/general.js`: Alphanumeric character matching
- `src/intl-adapter.js`: Intl.Segmenter compatibility adapter
- `src/utils.js`: Shared utilities
- `src/_*_data.js`: Unicode data tables (generated from Unicode specs)
- `test/`: Node.js test files with comprehensive Unicode test cases

## Code Style

- ES modules with `.js` extensions (`type: "module"` in package.json)
- TypeScript via JSDoc with `// @ts-check` headers
- Semicolons required, single quotes preferred
- Node.js built-in test runner for testing (`import { test } from 'node:test'`)
- Property testing with fast-check for Unicode compliance
- No external dependencies in runtime code
95 changes: 56 additions & 39 deletions src/grapheme.js
Original file line number Diff line number Diff line change
Expand Up @@ -48,54 +48,77 @@
return;
}

/** @type {number} Current cursor position. */
let cursor = 0;

/** @type {number} Total length of the input string. */
let len = input.length;
const len = input.length;

// Fast path for pure ASCII text - most common case
let isAscii = true;
for (let i = 0; i < len; i++) {
if (input.charCodeAt(i) >= 127) {
isAscii = false;
break;
}
}

if (isAscii) {
// Optimized ASCII-only path
for (let i = 0; i < len; i++) {
const charCode = input.charCodeAt(i);
const isControl = charCode < 32;

if (isControl) {
if (charCode === 10 && i > 0 && input.charCodeAt(i - 1) === 13) {
// Skip LF after CR - already handled
continue;
}
if (charCode === 13 && i + 1 < len && input.charCodeAt(i + 1) === 10) {
// CR+LF sequence
yield {
segment: input.slice(i, i + 2),
index: i,
input,
_hd: charCode,
_catBegin: 1, // CR
_catEnd: 6, // LF
};
i++; // Skip the LF
continue;
}
}

// Regular ASCII character
yield {
segment: input[i],
index: i,
input,
_hd: charCode,
_catBegin: isControl ? (charCode === 10 ? 6 : charCode === 13 ? 1 : 2) : 0,
_catEnd: isControl ? (charCode === 10 ? 6 : charCode === 13 ? 1 : 2) : 0,
};
}
return;
}

/** @type {GraphemeCategoryNum | null} Category of codepoint immediately preceding cursor, if known. */
// Original Unicode path for non-ASCII text
let cursor = 0;
let catBefore = null;

/** @type {GraphemeCategoryNum | null} Category of codepoint immediately preceding cursor, if known. */
let catAfter = null;

/** @type {GraphemeCategoryNum | null} Beginning category of a segment */
let catBegin = null;

/** @type {import('./_grapheme_data.js').GraphemeCategoryRange} */
let cache = [0, 0, 2 /* GC_Control */];

/** @type {number} The number of RIS codepoints preceding `cursor`. */
const cache = [0, 0, 2 /* GC_Control */];
let risCount = 0;

/** Emoji state */
let emoji = false;

/** InCB=Consonant */
let consonant = false;

/** InCB=Linker */
let linker = false;

/** InCB=Consonant InCB=Linker x InCB=Consonant */
let incb = false;

let cp = /** @type {number} */ (input.codePointAt(cursor));

/** Memoize the beginnig code point a the segment. */
let _hd = cp;

let index = 0;

while (true) {
cursor += cp < 0xFFFF ? 1 : 2;

// Note: Of course the nullish coalescing is useful here,
// but avoid it for aggressive compatibility and perf claim
catBefore = catAfter;
if (catBefore === null) {
catBefore = cat(cp, cache);

Check failure on line 121 in src/grapheme.js

View workflow job for this annotation

GitHub Actions / Checking build

Argument of type 'number[]' is not assignable to parameter of type 'GraphemeCategoryRange'.
catBegin = catBefore;
}

Expand All @@ -111,34 +134,29 @@
return;
}

// Note: Lazily update `consonant` and `linker` state
// which is a extra overhead only for Hindi text.
// Lazily update consonant and linker state for Hindi text only
if (cp >= 2325) {
if (!consonant && catBefore === 0) {
consonant = isIndicConjunctConsonant(cp);
} else if (catBefore === 3 /* Extend */) {
// Note: \p{InCB=Linker} is a subset of \p{Extend}
linker = isIndicConjunctLinker(cp);
}
}

cp = /** @type {number} */ (input.codePointAt(cursor));
catAfter = cat(cp, cache);

Check failure on line 147 in src/grapheme.js

View workflow job for this annotation

GitHub Actions / Checking build

Argument of type 'number[]' is not assignable to parameter of type 'GraphemeCategoryRange'.

if (catBefore === 10 /* Regional_Indicator */) {
risCount += 1;
risCount++;
} else {
risCount = 0;
if (
catAfter === 14 /* ZWJ */
&& (catBefore === 3 /* Extend */ || catBefore === 4 /* Extended_Pictographic */)
) {
emoji = true;

} else if (catAfter === 0 /* Any */ && cp >= 2325) {
// Note: Put GB9c rule checking here to reduce.
incb = consonant && linker && (consonant = isIndicConjunctConsonant(cp));
// It cannot be both a linker and a consonant.
linker = linker && !consonant;
}
}
Expand All @@ -153,7 +171,6 @@
_catEnd: catBefore,
};

// flush
index = cursor;
emoji = false;
incb = false;
Expand Down Expand Up @@ -232,11 +249,11 @@
// If this char isn't within the cached range, update the cache to the
// range that includes it.
if (cp < cache[0] || cp > cache[1]) {
let index = findUnicodeRangeIndex(cp, grapheme_ranges);
const index = findUnicodeRangeIndex(cp, grapheme_ranges);
if (index < 0) {
return 0;
}
let range = grapheme_ranges[index];
const range = grapheme_ranges[index];
cache[0] = range[0];
cache[1] = range[1];
cache[2] = range[2];
Expand Down
Loading