diff --git a/.changeset/spotty-rockets-notice.md b/.changeset/spotty-rockets-notice.md new file mode 100644 index 0000000..e1a84bd --- /dev/null +++ b/.changeset/spotty-rockets-notice.md @@ -0,0 +1,12 @@ +--- +"unicode-segmenter": minor +--- + +Reduced bundle size, while keeping the best perf + +Some details: + +- Refactored to use the same code path internally as much as possible. +- Removed the pre-computed jump table; the optimization was compensated for by other perf improvements. +- Previous array layout to avoid accidental de-opt turned out to be overkill. The regular tuple array is well optimized, so I fall back to using good old plain binary search. +- Tried some experiments like a new encoding and the Eytzinger layout for more aggressive improvements, but without success. diff --git a/README.md b/README.md index 1693c2f..d4c134e 100644 --- a/README.md +++ b/README.md @@ -7,19 +7,19 @@ A lightweight implementation of the [Unicode Text Segmentation (UAX \#29)](https://www.unicode.org/reports/tr29) -- **Verified spec-compliance**: Up-to-date Unicode data, passes the official Unicode test suites, verifies full compliance with the `Intl.Segmenter` API via additional property-based testing, maintaining 100% coverage. +- **Spec compliant**: Up-to-date Unicode data, verified by the official Unicode test suites and fuzzed with the native `Intl.Segmenter`, and maintaining 100% test coverage. -- **Excellent compatibility**: It works well on older browsers, edge runtimes, and React Native (Hermes). +- **Excellent compatibility**: It works well on older browsers, edge runtimes, React Native (Hermes) and QuickJS. -- **Zero-dependencies**: It doesn't bloat `node_modules` or the networks tab. Just a small minimal snippet. +- **Zero-dependencies**: It doesn't bloat `node_modules` or the network bandwidth. Like a small minimal snippet. -- **Small bundle size**: It effectively compresses Unicode data and provides a tree-shakeable format, eliminating unused codes. 
+- **Small bundle size**: It effectively compresses the Unicode data and provides a bundler-friendly format. -- **Extremely efficient**: It's carefully optimized for performance, making it the fastest one in the ecosystem—outperforming even the built-in `Intl.Segmenter`. +- **Extremely efficient**: It's carefully optimized for runtime performance, making it the fastest one in the ecosystem—outperforming even the built-in `Intl.Segmenter`. - **TypeScript**: It's fully type-checked, and provides type definitions and JSDoc. -- **ESM-first**: It natively supports ES Modules, and still supports CommonJS. +- **ESM-first**: It primarily supports ES modules, and still supports CommonJS. > [!NOTE] > unicode-segmenter is now **[e18e] recommendation!** @@ -42,7 +42,7 @@ And extra utilities for combined use cases. - [`unicode-segmenter/emoji`](#export-unicode-segmenteremoji): Matches single codepoint emojis - [`unicode-segmenter/general`](#export-unicode-segmentergeneral): Matches single codepoint alphanumerics -- [`unicode-segmenter/utils`](#export-unicode-segmenterutils): Handles UTF-16 codepoints +- [`unicode-segmenter/utils`](#export-unicode-segmenterutils): Some utilities for handling codepoints ### Export `unicode-segmenter/grapheme` [![](https://edge.bundlejs.com/badge?q=unicode-segmenter/grapheme&treeshake=[*])](https://bundlejs.com/?q=unicode-segmenter%2Fgrapheme&treeshake=%5B*%5D) @@ -254,7 +254,7 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb | Name | Unicode® | ESM? 
| Size | Size (min) | Size (min+gzip) | Size (min+br) | |------------------------------|----------|------|----------:|-----------:|----------------:|--------------:| -| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 17,313 | 12,783 | 5,285 | 3,946 | +| `unicode-segmenter/grapheme` | 16.0.0 | ✔️ | 15,929 | 12,110 | 5,049 | 3,740 | | `graphemer` | 15.0.0 | ✖️ ️| 410,435 | 95,104 | 15,752 | 10,660 | | `grapheme-splitter` | 10.0.0 | ✖️ | 122,252 | 23,680 | 7,852 | 4,841 | | `@formatjs/intl-segmenter`* | 15.0.0 | ✖️ | 491,043 | 318,721 | 54,248 | 34,380 | @@ -270,9 +270,9 @@ Since [Hermes doesn't support the `Intl.Segmenter` API](https://github.com/faceb | Name | Bytecode size | Bytecode size (gzip)* | |------------------------------|--------------:|----------------------:| -| `unicode-segmenter/grapheme` | 24,386 | 12,690 | -| `graphemer` | 133,949 | 31,710 | -| `grapheme-splitter` | 63,810 | 19,125 | +| `unicode-segmenter/grapheme` | 23,037 | 12,058 | +| `graphemer` | 133,952 | 31,708 | +| `grapheme-splitter` | 63,813 | 19,123 | * It would be compressed when included as an app asset. 
diff --git a/benchmark/grapheme/_records/20250307-apple_m4_pro-macos_15.3-bun_1.2.4.txt b/benchmark/grapheme/_records/20250307-apple_m4_pro-macos_15.3-bun_1.2.4.txt new file mode 100644 index 0000000..367a72a --- /dev/null +++ b/benchmark/grapheme/_records/20250307-apple_m4_pro-macos_15.3-bun_1.2.4.txt @@ -0,0 +1,335 @@ +clk: ~4.18 GHz +cpu: Apple M4 Pro +runtime: bun 1.2.4 (arm64-darwin) + +benchmark avg (min … max) p75 / p99 (min … top 1%) +------------------------------------------------------------- ------------------------------- +• Lorem ipsum (ascii) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 3.15 µs/iter 3.25 µs ▄█ + (2.93 µs … 3.49 µs) 3.49 µs ▅███▅ ▅█ ▅ ▅ + ( 0.00 b … 984.00 b) 19.64 b █████▅█████▅██▅▅▅▅▅█▅ + 7.59 ipc ( 2.04% stalls) 0.00% L1 data cache + 12.86k cycles 97.59k instructions 0.00% retired LD/ST ( 0.00) + +graphemer 14.42 µs/iter 14.71 µs █ ▇▆ + (12.33 µs … 273.38 µs) 20.67 µs ▄█▆ ██ + ( 0.00 b … 256.00 kb) 782.09 b ▁██████▆▂▂▂▁▁▁▁▁▁▁▂▂▁ + 6.82 ipc ( 1.77% stalls) 0.00% L1 data cache + 64.42k cycles 439.30k instructions 0.00% retired LD/ST ( 0.00) + +grapheme-splitter 59.02 µs/iter 62.25 µs █ + (51.96 µs … 189.83 µs) 86.67 µs █ ▂▆ ▃ + ( 0.00 b … 848.00 kb) 78.35 kb ██▃██▇██▃▂▃▂▁▁▁▁▁▁▁▁▁ + 6.33 ipc ( 0.47% stalls) 0.00% L1 data cache + 260.82k cycles 1.65M instructions 0.00% retired LD/ST ( 0.00) + +@formatjs/intl-segmenter 57.70 µs/iter 48.87 µs ██ + (46.29 µs … 140.81 µs) 76.50 µs ██▅ + ( 0.00 b … 106.25 kb) 9.25 kb ███▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▇ + 6.60 ipc ( 0.84% stalls) 0.00% L1 data cache + 195.01k cycles 1.29M instructions 0.00% retired LD/ST ( 0.00) + +unicode-rs/unicode-segmentation (wasm-bindgen) 10.97 µs/iter 11.04 µs ▄█ + (8.92 µs … 2.19 ms) 14.63 µs ▂ ██▃ + ( 0.00 b … 96.00 kb) 9.50 b ▁▅█▆▆███▆▆▅▃▂▂▂▃▂▁▁▁▁ + 6.12 ipc ( 1.07% stalls) 0.00% L1 data cache + 47.87k cycles 292.78k instructions 0.00% retired LD/ST ( 0.00) + +Intl.Segmenter 6.67 µs/iter 6.67 µs ██ █ + 
(6.50 µs … 6.96 µs) 6.93 µs ██ ███ █ █ + ( 4.00 b … 9.27 kb) 930.12 b █▁███████▁▁█▁▁▁█▁▁▁▁█ + 5.79 ipc ( 2.92% stalls) 0.00% L1 data cache + 28.22k cycles 163.32k instructions 0.00% retired LD/ST ( 0.00) + + ┌ ┐ + unicode-segmenter/grapheme ┤ 3.15 µs + graphemer ┤■■■■■■■ 14.42 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 59.02 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 57.70 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■ 10.97 µs + Intl.Segmenter ┤■■ 6.67 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 2.12x faster than Intl.Segmenter + 3.49x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 4.58x faster than graphemer + 18.34x faster than @formatjs/intl-segmenter + 18.76x faster than grapheme-splitter + +• Emojis +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 1.92 µs/iter 1.95 µs ▃▃ █ + (1.85 µs … 2.00 µs) 1.99 µs ▆ █▆██▆▃▃█▃▃█▃▆ + ( 0.00 b … 2.00 kb) 45.69 b ▄██▁██████████████▄▄▄ + 6.39 ipc ( 1.98% stalls) 0.00% L1 data cache + 8.13k cycles 51.96k instructions 0.00% retired LD/ST ( 0.00) + +graphemer 4.60 µs/iter 4.63 µs ▃ █ + (4.51 µs … 4.74 µs) 4.68 µs ▂▂ ▂█ ▇█ ▂ ▇ + ( 0.00 b … 4.00 b) 0.11 b ▆▁██▆▁██▁██▆█▆▆▆▆▆▆█▆ + 6.69 ipc ( 1.22% stalls) 0.00% L1 data cache + 19.45k cycles 130.12k instructions 0.00% retired LD/ST ( 0.00) + +grapheme-splitter 17.44 µs/iter 17.61 µs █ █ + (16.66 µs … 19.35 µs) 18.17 µs ▅▅▅▅ ▅█ ▅▅ █ + ( 0.00 b … 0.00 b) 0.00 b ████▁██▁▁▁▁▁██▁▁▁▁▁▁█ + 4.86 ipc ( 0.31% stalls) 0.00% L1 data cache + 78.43k cycles 381.42k instructions 0.00% retired LD/ST ( 0.00) + +@formatjs/intl-segmenter 19.15 µs/iter 19.47 µs █ + (18.17 µs … 21.27 µs) 19.79 µs █ ▅ ▅▅ ▅▅ ▅▅ ▅▅ + ( 0.00 b … 0.00 b) 0.00 b █▁█▁▁▁██▁▁██▁▁▁██▁▁██ + 5.49 ipc ( 0.76% stalls) 0.00% L1 data cache + 78.98k cycles 433.87k instructions 0.00% retired LD/ST ( 0.00) + +unicode-rs/unicode-segmentation (wasm-bindgen) 4.08 µs/iter 3.88 µs █ + (3.00 µs … 1.90 ms) 
14.50 µs █ + ( 0.00 b … 32.00 kb) 311.01 b ▆█▆▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ + 5.60 ipc ( 1.14% stalls) 0.00% L1 data cache + 19.75k cycles 110.58k instructions 0.00% retired LD/ST ( 0.00) + +Intl.Segmenter 2.67 µs/iter 2.71 µs ▆█ + (2.42 µs … 3.00 µs) 2.87 µs ███▂ + ( 0.00 b … 5.20 kb) 1.20 kb ▃▆█▆▁▆▁▆███████▁▃█▃▆▆ + 5.08 ipc ( 4.02% stalls) 0.00% L1 data cache + 11.24k cycles 57.10k instructions 0.00% retired LD/ST ( 0.00) + + ┌ ┐ + unicode-segmenter/grapheme ┤ 1.92 µs + graphemer ┤■■■■■ 4.60 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 17.44 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 19.15 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■ 4.08 µs + Intl.Segmenter ┤■ 2.67 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.39x faster than Intl.Segmenter + 2.13x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 2.4x faster than graphemer + 9.08x faster than grapheme-splitter + 9.97x faster than @formatjs/intl-segmenter + +• Hindi +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 5.22 µs/iter 5.24 µs █ + (5.10 µs … 5.34 µs) 5.33 µs ▂ ▂▇ ▂▇█ ▂ ▂ + ( 0.00 b … 8.00 b) 0.36 b ▆▁▁█▆██▆▆███▆▁▁▁█▁▆█▆ + 6.33 ipc ( 1.65% stalls) 0.00% L1 data cache + 21.98k cycles 139.18k instructions 0.00% retired LD/ST ( 0.00) + +graphemer 15.98 µs/iter 15.78 µs █ + (15.65 µs … 18.34 µs) 16.02 µs █ █ █ + ( 0.00 b … 64.00 b) 5.33 b ██▁▁▁███▁▁▁▁█▁▁▁▁▁▁▁█ + 6.49 ipc ( 1.24% stalls) 0.00% L1 data cache + 66.75k cycles 432.87k instructions 0.00% retired LD/ST ( 0.00) + +grapheme-splitter 26.33 µs/iter 26.59 µs █ + (25.48 µs … 30.12 µs) 27.20 µs ██ + ( 0.00 b … 0.00 b) 0.00 b ██▁█▁██▁▁▁▁▁▁█▁█▁▁▁▁█ + 7.02 ipc ( 0.45% stalls) 0.00% L1 data cache + 112.54k cycles 789.66k instructions 0.00% retired LD/ST ( 0.00) + +@formatjs/intl-segmenter 57.08 µs/iter 58.26 µs █ █ █ + (54.85 µs … 58.62 µs) 58.60 µs █ ▅▅ █▅ █▅▅ + ( 0.00 b … 0.00 b) 0.00 b █▁██▁▁▁▁▁▁▁▁▁▁██▁▁███ + 5.67 ipc ( 0.68% 
stalls) 0.00% L1 data cache + 237.56k cycles 1.35M instructions 0.00% retired LD/ST ( 0.00) + +unicode-rs/unicode-segmentation (wasm-bindgen) 11.75 µs/iter 11.54 µs █ + (9.71 µs … 1.76 ms) 22.00 µs ▇▇█ + ( 0.00 b … 0.00 b) 0.00 b ▅███▅▃▂▁▁▁▁▁▁▁▁▁▂▁▁▂▂ + 5.55 ipc ( 1.02% stalls) 0.00% L1 data cache + 52.69k cycles 292.39k instructions 0.00% retired LD/ST ( 0.00) + +Intl.Segmenter 5.68 µs/iter 5.73 µs ▂█ + (5.53 µs … 5.79 µs) 5.78 µs ▅ ▅ ██ ▅ ▅▅ + ( 0.00 b … 3.24 kb) 461.00 b ▇▁▇█▁▁▇▁▇█▇▇██▇▇▇█▁██ + 5.25 ipc ( 2.94% stalls) 0.00% L1 data cache + 24.08k cycles 126.48k instructions 0.00% retired LD/ST ( 0.00) + + ┌ ┐ + unicode-segmenter/grapheme ┤ 5.22 µs + graphemer ┤■■■■■■■ 15.98 µs + grapheme-splitter ┤■■■■■■■■■■■■■■ 26.33 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 57.08 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■ 11.75 µs + Intl.Segmenter ┤ 5.68 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.09x faster than Intl.Segmenter + 2.25x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 3.06x faster than graphemer + 5.05x faster than grapheme-splitter + 10.94x faster than @formatjs/intl-segmenter + +• Demonic characters +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 1.68 µs/iter 1.70 µs █ + (1.55 µs … 1.79 µs) 1.76 µs ▇▆█▅ + ( 0.00 b … 2.19 kb) 43.57 b ▂▁▁▁▃▁▂▄▂▃▆███████▃▃▅ + 7.04 ipc ( 1.48% stalls) 0.00% L1 data cache + 7.04k cycles 49.56k instructions 0.00% retired LD/ST ( 0.00) + +graphemer 7.83 µs/iter 8.03 µs █ + (7.50 µs … 8.25 µs) 8.24 µs ▅ █ ▅ ▅ + ( 0.00 b … 5.70 kb) 278.67 b █▇▇█▁▁▁▇▇▁▇▁▁▁█▇▁▁▁▇█ + 6.52 ipc ( 2.06% stalls) 0.00% L1 data cache + 32.18k cycles 209.73k instructions 0.00% retired LD/ST ( 0.00) + +grapheme-splitter 6.14 µs/iter 6.35 µs ▂ █▂ ▂ + (5.86 µs … 6.46 µs) 6.43 µs ▅ █ ██ ▅ ▅ █ + ( 0.00 b … 0.00 b) 0.00 b █▁▇█▁██▇▁▇▁▁▁█▁▁▁█▇█▇ + 6.77 ipc ( 1.16% stalls) 0.00% L1 data cache + 24.90k cycles 168.52k instructions 0.00% 
retired LD/ST ( 0.00) + +@formatjs/intl-segmenter 59.52 µs/iter 60.11 µs █ + (58.05 µs … 62.22 µs) 61.60 µs █▅▅ ▅▅▅ ▅▅ ▅ ▅ + ( 0.00 b … 0.00 b) 0.00 b ███▁███▁▁▁▁██▁▁█▁▁▁▁█ + 4.20 ipc ( 0.18% stalls) 0.00% L1 data cache + 263.60k cycles 1.11M instructions 0.00% retired LD/ST ( 0.00) + +unicode-rs/unicode-segmentation (wasm-bindgen) 2.18 µs/iter 2.20 µs ▆▃█ + (2.09 µs … 2.50 µs) 2.23 µs ▇▂▂███ ▂ + ( 0.00 b … 212.00 b) 43.59 b ▃▃▁▃▅▁▃▃▃▃▅██████▆█▃▃ + 5.40 ipc ( 0.40% stalls) 0.00% L1 data cache + 9.26k cycles 49.99k instructions 0.00% retired LD/ST ( 0.00) + +Intl.Segmenter 1.34 µs/iter 1.46 µs █ ▃▅█ + (1.06 µs … 1.67 µs) 1.62 µs ██ ▆▅ ███ ▃ + ( 0.00 b … 4.36 kb) 1.62 kb ▆███▃██▃▆██████████▃▃ + 4.65 ipc ( 6.01% stalls) 0.00% L1 data cache + 5.46k cycles 25.35k instructions 0.00% retired LD/ST ( 0.00) + + ┌ ┐ + unicode-segmenter/grapheme ┤ 1.68 µs + graphemer ┤■■■■ 7.83 µs + grapheme-splitter ┤■■■ 6.14 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 59.52 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤ 2.18 µs + Intl.Segmenter ┤ 1.34 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.26x slower than Intl.Segmenter + 1.3x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 3.64x faster than grapheme-splitter + 4.65x faster than graphemer + 35.33x faster than @formatjs/intl-segmenter + +• Tweet text (combined) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 6.74 µs/iter 6.86 µs █ + (6.52 µs … 7.15 µs) 7.00 µs █ █ █ + ( 4.00 b … 44.00 b) 11.38 b █████▁█▁█████▁██▁████ + 6.87 ipc ( 2.61% stalls) 0.00% L1 data cache + 27.15k cycles 186.37k instructions 0.00% retired LD/ST ( 0.00) + +graphemer 23.74 µs/iter 24.08 µs ████ ███ ███ █ + (22.76 µs … 26.69 µs) 24.52 µs ████ ███ ███ █ + ( 0.00 b … 40.00 b) 3.33 b ████▁███▁▁▁▁▁▁███▁▁▁█ + 6.61 ipc ( 1.36% stalls) 0.00% L1 data cache + 97.37k cycles 643.90k instructions 0.00% retired LD/ST ( 0.00) + +grapheme-splitter 46.01 
µs/iter 46.38 µs ███ + (44.69 µs … 47.88 µs) 47.88 µs ▅ ███▅ ▅ ▅ ▅ + ( 0.00 b … 0.00 b) 0.00 b █▁▁▁████▁▁▁█▁█▁▁▁▁▁▁█ + 6.54 ipc ( 0.43% stalls) 0.00% L1 data cache + 200.47k cycles 1.31M instructions 0.00% retired LD/ST ( 0.00) + +@formatjs/intl-segmenter 72.66 µs/iter 74.71 µs █ + (62.92 µs … 348.38 µs) 89.63 µs ▄▄▂ ██▆ + ( 0.00 b … 0.00 b) 0.00 b ▁▄███▇█████▆▄▃▂▂▂▂▁▁▁ + 6.27 ipc ( 0.92% stalls) 0.00% L1 data cache + 303.91k cycles 1.90M instructions 0.00% retired LD/ST ( 0.00) + +unicode-rs/unicode-segmentation (wasm-bindgen) 17.08 µs/iter 17.32 µs █ + (16.50 µs … 17.70 µs) 17.41 µs ▅ ▅ ▅ ▅▅█ ▅ ▅▅▅ + ( 0.00 b … 0.00 b) 0.00 b █▁█▁▁▁▁▁█▁███▁▁█▁▁███ + 6.12 ipc ( 1.22% stalls) 0.00% L1 data cache + 68.78k cycles 420.59k instructions 0.00% retired LD/ST ( 0.00) + +Intl.Segmenter 10.14 µs/iter 10.21 µs █ + (9.92 µs … 10.47 µs) 10.39 µs █ █ █ + ( 0.00 b … 5.06 kb) 356.00 b █▁███▁▁▁▁█▁▁█▁▁▁█▁▁▁█ + 5.50 ipc ( 2.63% stalls) 0.00% L1 data cache + 40.83k cycles 224.42k instructions 0.00% retired LD/ST ( 0.00) + + ┌ ┐ + unicode-segmenter/grapheme ┤ 6.74 µs + graphemer ┤■■■■■■■■■ 23.74 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■ 46.01 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 72.66 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■ 17.08 µs + Intl.Segmenter ┤■■ 10.14 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.5x faster than Intl.Segmenter + 2.53x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 3.52x faster than graphemer + 6.83x faster than grapheme-splitter + 10.78x faster than @formatjs/intl-segmenter + +• Code snippet (combined) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 15.35 µs/iter 15.53 µs █ + (14.95 µs … 16.00 µs) 15.66 µs ▅ █ ▅ ▅ ▅ ▅ ▅▅▅ ▅ + ( 0.00 b … 0.00 b) 0.00 b █▁▁█▁█▁█▁█▁█▁▁▁███▁▁█ + 6.88 ipc ( 2.48% stalls) 0.00% L1 data cache + 63.04k cycles 433.94k instructions 0.00% retired LD/ST ( 0.00) + +graphemer 54.69 µs/iter 54.60 µs 
█ + (53.93 µs … 57.10 µs) 56.15 µs ▅ █ ▅ + ( 0.00 b … 0.00 b) 0.00 b ▇█▁█▁▁█▁▁▇▁▁▁▁▁▁▁▁▁▁▇ + 6.68 ipc ( 1.35% stalls) 0.00% L1 data cache + 226.43k cycles 1.51M instructions 0.00% retired LD/ST ( 0.00) + +grapheme-splitter 112.68 µs/iter 117.04 µs ▅▇█ ▂ + (99.13 µs … 403.79 µs) 140.92 µs ▅█████▆█▄ + ( 0.00 b … 0.00 b) 0.00 b ▅███████████▆▄▅▃▄▃▂▂▁ + 6.46 ipc ( 0.48% stalls) 0.00% L1 data cache + 471.06k cycles 3.04M instructions 0.00% retired LD/ST ( 0.00) + +@formatjs/intl-segmenter 168.48 µs/iter 172.67 µs █▇ + (148.92 µs … 476.58 µs) 202.08 µs ▂▄▆▅███▆▂ + ( 0.00 b … 0.00 b) 0.00 b ▂▆██████████▆▄▃▂▂▂▂▁▁ + 6.28 ipc ( 0.88% stalls) 0.00% L1 data cache + 691.16k cycles 4.34M instructions 0.00% retired LD/ST ( 0.00) + +unicode-rs/unicode-segmentation (wasm-bindgen) 39.15 µs/iter 39.53 µs █ + (38.27 µs … 39.99 µs) 39.82 µs █ + ( 0.00 b … 0.00 b) 0.00 b █▁█▁▁▁▁██▁▁▁▁██▁█▁▁██ + 6.18 ipc ( 1.25% stalls) 0.00% L1 data cache + 159.92k cycles 987.69k instructions 0.00% retired LD/ST ( 0.00) + +Intl.Segmenter 21.65 µs/iter 21.80 µs █ █ + (21.29 µs … 22.14 µs) 22.01 µs ▅▅▅ █ ▅▅ █ ▅ ▅ + ( 28.00 b … 520.00 b) 122.67 b ███▁▁█▁▁▁██▁▁▁█▁█▁▁▁█ + 5.72 ipc ( 2.60% stalls) 0.00% L1 data cache + 90.13k cycles 515.22k instructions 0.00% retired LD/ST ( 0.00) + + ┌ ┐ + unicode-segmenter/grapheme ┤ 15.35 µs + graphemer ┤■■■■■■■■■ 54.69 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■ 112.68 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 168.48 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■ 39.15 µs + Intl.Segmenter ┤■ 21.65 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.41x faster than Intl.Segmenter + 2.55x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 3.56x faster than graphemer + 7.34x faster than grapheme-splitter + 10.97x faster than @formatjs/intl-segmenter diff --git a/benchmark/grapheme/_records/20250307-apple_m4_pro-macos_15.3-chrome_134.0.6998.45.txt 
b/benchmark/grapheme/_records/20250307-apple_m4_pro-macos_15.3-chrome_134.0.6998.45.txt new file mode 100644 index 0000000..e08b673 --- /dev/null +++ b/benchmark/grapheme/_records/20250307-apple_m4_pro-macos_15.3-chrome_134.0.6998.45.txt @@ -0,0 +1,197 @@ +clk: ~4.35 GHz +cpu: null +runtime: null (null) + +benchmark avg (min … max) p75 / p99 (min … top 1%) +------------------------------------------------------------- ------------------------------- +• Lorem ipsum (ascii) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 3.34 µs/iter 3.39 µs █▂ + (3.22 µs … 3.61 µs) 3.61 µs ▄██▄▆▇▄▃▁▄▄▃▁▁▆▃▃▃▁▁▃ +graphemer 23.69 µs/iter 0.00 ps █ + (0.00 ps … 200.00 µs) 100.00 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▅ +grapheme-splitter 35.99 µs/iter 100.00 µs █ + (0.00 ps … 600.00 µs) 100.00 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ +@formatjs/intl-segmenter 32.08 µs/iter 100.00 µs █ + (0.00 ps … 200.00 µs) 100.00 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ +unicode-rs/unicode-segmentation (wasm-bindgen) 17.73 µs/iter 0.00 ps █ + (0.00 ps … 200.00 µs) 100.00 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ +Intl.Segmenter 6.43 µs/iter 6.67 µs █▄ + (5.74 µs … 6.93 µs) 6.79 µs █▅▁▁▁▁▁▁▁▁▅▁██▁█▁██▅█ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 3.34 µs + graphemer ┤■■■■■■■■■■■■■■■■■■■■■ 23.69 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 35.99 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 32.08 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■■■■■■■■ 17.73 µs + Intl.Segmenter ┤■■■ 6.43 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.93x faster than Intl.Segmenter + 5.3x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 7.09x faster than graphemer + 9.6x faster than @formatjs/intl-segmenter + 10.77x faster than grapheme-splitter + +• Emojis +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 1.41 µs/iter 1.44 µs ▄ ▇ █ + (1.34 µs … 1.51 µs) 1.49 µs ▃▁▁█▁▁▁█▁▁█▁▁▆▁▁▁▇▁▁▅ 
+graphemer 6.93 µs/iter 6.98 µs ▂█ + (6.79 µs … 7.18 µs) 7.15 µs ▄▄▁██▇▁▁▁▄▁▄▄▁▁▁▄▄▁▁▄ +grapheme-splitter 14.77 µs/iter 0.00 ps █ + (0.00 ps … 300.00 µs) 100.00 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃ +@formatjs/intl-segmenter 10.24 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 100.00 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂ +unicode-rs/unicode-segmentation (wasm-bindgen) 5.43 µs/iter 5.47 µs ▂ █ ▂ ▂ ▂ + (5.35 µs … 5.57 µs) 5.57 µs █▁█▁█▁▁▇▁█▁█▁▇▁▁▁▁▇▁▄ +Intl.Segmenter 3.51 µs/iter 3.13 µs █ + (2.59 µs … 6.25 µs) 6.25 µs ▅█▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄▄ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 1.41 µs + graphemer ┤■■■■■■■■■■■■■■ 6.93 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 14.77 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■ 10.24 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■■■ 5.43 µs + Intl.Segmenter ┤■■■■■ 3.51 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 2.49x faster than Intl.Segmenter + 3.85x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 4.91x faster than graphemer + 7.25x faster than @formatjs/intl-segmenter + 10.46x faster than grapheme-splitter + +• Hindi +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 4.33 µs/iter 4.35 µs ▂ █ + (4.25 µs … 4.42 µs) 4.39 µs ▄▁▁▄▁▁▁█▁▁█▁▁█▁▁▁█▁▁▄ +graphemer 26.32 µs/iter 26.76 µs █ █ + (25.59 µs … 27.39 µs) 26.95 µs ██▁▁█▁▁▁██▁▁█▁▁▁▁██▁█ +grapheme-splitter 46.68 µs/iter 47.27 µs █ + (44.02 µs … 52.08 µs) 49.73 µs ██▁▁███▁██▁█▁▁█▁▁▁▁▁█ +@formatjs/intl-segmenter 33.39 µs/iter 100.00 µs █ + (0.00 ps … 600.00 µs) 100.00 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ +unicode-rs/unicode-segmentation (wasm-bindgen) 15.84 µs/iter 15.82 µs ▃█▃ + (15.63 µs … 16.33 µs) 15.99 µs ▆▁▁▁▁▁▁███▁▆▁▁▁▁▁▁▁▆▆ +Intl.Segmenter 6.54 µs/iter 6.91 µs ▄ ▄▄▄█ + (5.49 µs … 7.03 µs) 6.98 µs ▅▁█▁▁▁█▁▁▁▁▁▁▁▁▁▅████ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 4.33 µs + graphemer ┤■■■■■■■■■■■■■■■■■■ 26.32 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 46.68 µs + @formatjs/intl-segmenter 
┤■■■■■■■■■■■■■■■■■■■■■■■ 33.39 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■■ 15.84 µs + Intl.Segmenter ┤■■ 6.54 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.51x faster than Intl.Segmenter + 3.66x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 6.07x faster than graphemer + 7.71x faster than @formatjs/intl-segmenter + 10.77x faster than grapheme-splitter + +• Demonic characters +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 1.25 µs/iter 1.27 µs ▃ █ + (1.20 µs … 1.37 µs) 1.34 µs ▅▁▁█▁▁▁█▁▁▅▁▁▄▁▁▁▄▁▁▂ +graphemer 13.58 µs/iter 13.67 µs █ + (13.23 µs … 14.23 µs) 14.09 µs ▆█▆▁▁▆▆▁▁▆▆▁▁▁▁▁▁▁▁▆▆ +grapheme-splitter 10.33 µs/iter 10.35 µs █▃ + (10.11 µs … 10.62 µs) 10.57 µs ▆▁▁▆▁▁██▁▆▁▆▁▁▁▁▁▆▁▁▆ +@formatjs/intl-segmenter 12.33 µs/iter 12.45 µs █ █ █ ███ █ █ + (12.13 µs … 12.65 µs) 12.50 µs █▁▁█▁█▁███▁▁▁▁▁▁▁█▁▁█ +unicode-rs/unicode-segmentation (wasm-bindgen) 2.57 µs/iter 2.56 µs ▅ █ + (2.49 µs … 2.95 µs) 2.91 µs ▂▆█▁█▅▂▁▁▁▁▁▂▁▁▁▁▁▁▂▂ +Intl.Segmenter 1.98 µs/iter 1.56 µs █ + (1.37 µs … 14.04 µs) 13.79 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 1.25 µs + graphemer ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 13.58 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■ 10.33 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 12.33 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■ 2.57 µs + Intl.Segmenter ┤■■ 1.98 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.59x faster than Intl.Segmenter + 2.06x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 8.27x faster than grapheme-splitter + 9.87x faster than @formatjs/intl-segmenter + 10.87x faster than graphemer + +• Tweet text (combined) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 5.79 µs/iter 5.81 µs █ ▆ ▃ + (5.69 µs … 5.86 µs) 5.86 µs ▄▁▁▁▁▁█▁▁█▁█▁▁▄▁▁▄▁▁█ +graphemer 35.46 µs/iter 35.89 µs █ █ + 
(34.79 µs … 36.33 µs) 36.18 µs █▁▁▁████▁▁▁▁▁█▁▁█▁█▁█ +grapheme-splitter 100.80 µs/iter 100.00 µs █ + (0.00 ps … 300.00 µs) 200.00 µs ▅▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▅ +@formatjs/intl-segmenter 50.18 µs/iter 50.46 µs █ █ + (49.27 µs … 51.39 µs) 51.29 µs ██▁█▁▁███▁▁▁█▁▁█▁▁▁▁█ +unicode-rs/unicode-segmentation (wasm-bindgen) 26.33 µs/iter 26.54 µs █ █ █ + (25.68 µs … 27.39 µs) 26.78 µs █▁▁▁▁▁█▁█▁█▁▁▁███▁▁▁█ +Intl.Segmenter 9.64 µs/iter 9.81 µs █ ██ + (9.30 µs … 9.99 µs) 9.96 µs █▁▁▁▁▁█████▁▁▁▁▁██▁▁█ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 5.79 µs + graphemer ┤■■■■■■■■■■■ 35.46 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 100.80 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■ 50.18 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■ 26.33 µs + Intl.Segmenter ┤■ 9.64 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.66x faster than Intl.Segmenter + 4.55x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 6.13x faster than graphemer + 8.67x faster than @formatjs/intl-segmenter + 17.41x faster than grapheme-splitter + +• Code snippet (combined) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 14.13 µs/iter 0.00 ps █ + (0.00 ps … 200.00 µs) 100.00 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃ +graphemer 83.54 µs/iter 83.54 µs █ + (82.76 µs … 85.69 µs) 84.89 µs ▅▁▅█▁▅▅▅▅▁▁▁▁▁▁▁▁▁▁▁▅ +grapheme-splitter 233.89 µs/iter 300.00 µs █ ▂ + (100.00 µs … 400.00 µs) 400.00 µs ▂▁▁▁▁▁▁█▁▁▁▁▁█▁▁▁▁▁▁▁ +@formatjs/intl-segmenter 117.64 µs/iter 200.00 µs █ + (0.00 ps … 600.00 µs) 200.00 µs ▃▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▇ +unicode-rs/unicode-segmentation (wasm-bindgen) 62.89 µs/iter 100.00 µs ▅ █ + (0.00 ps … 300.00 µs) 200.00 µs █▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▂ +Intl.Segmenter 18.61 µs/iter 18.65 µs █ █ █ + (18.41 µs … 19.04 µs) 18.75 µs ██▁▁▁▁██▁▁▁█▁▁█▁█▁▁▁█ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 14.13 µs + graphemer ┤■■■■■■■■■■■ 83.54 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 233.89 µs + @formatjs/intl-segmenter 
┤■■■■■■■■■■■■■■■■ 117.64 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■ 62.89 µs + Intl.Segmenter ┤■ 18.61 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.32x faster than Intl.Segmenter + 4.45x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 5.91x faster than graphemer + 8.32x faster than @formatjs/intl-segmenter + 16.55x faster than grapheme-splitter diff --git a/benchmark/grapheme/_records/20250307-apple_m4_pro-macos_15.3-nodejs_23.9.0.txt b/benchmark/grapheme/_records/20250307-apple_m4_pro-macos_15.3-nodejs_23.9.0.txt new file mode 100644 index 0000000..13142e4 --- /dev/null +++ b/benchmark/grapheme/_records/20250307-apple_m4_pro-macos_15.3-nodejs_23.9.0.txt @@ -0,0 +1,335 @@ +clk: ~4.44 GHz +cpu: Apple M4 Pro +runtime: node 23.9.0 (arm64-darwin) + +benchmark avg (min … max) p75 / p99 (min … top 1%) +------------------------------------------------------------- ------------------------------- +• Lorem ipsum (ascii) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 3.63 µs/iter 3.65 µs ▃▃▃ ▃ █ ▃ + (3.57 µs … 3.67 µs) 3.66 µs ███▇█ ▇ █▇█▂ + ( 7.90 b … 38.65 b) 16.02 b ▆▁▁▁▆▁▆▆▆█████▆█▆████ + 7.95 ipc ( 4.13% stalls) 0.00% L1 data cache + 14.85k cycles 118.11k instructions 0.00% retired LD/ST ( 0.00) + +graphemer 29.52 µs/iter 29.58 µs █ + (24.75 µs … 119.96 µs) 37.17 µs ██▂ + ( 5.81 kb … 1.51 mb) 78.28 kb ▁▁▂▃▃████▅▃▃▂▂▂▁▁▁▁▁▁ + 6.85 ipc ( 1.56% stalls) 0.00% L1 data cache + 121.03k cycles 828.74k instructions 0.00% retired LD/ST ( 0.00) + +grapheme-splitter 45.22 µs/iter 45.71 µs █ + (38.21 µs … 648.46 µs) 57.46 µs ▅▇ ██ + ( 10.14 kb … 1.96 mb) 47.17 kb ▃██▅▆███▆▅▄▄▃▃▃▃▂▃▂▂▁ + 8.31 ipc ( 0.50% stalls) 0.00% L1 data cache + 186.53k cycles 1.55M instructions 0.00% retired LD/ST ( 0.00) + +@formatjs/intl-segmenter 29.50 µs/iter 29.78 µs ██ █ █ + (28.80 µs … 31.01 µs) 30.69 µs ██▅ █ █▅ ▅ + ( 3.62 kb … 3.66 kb) 3.64 kb ███▁▁█▁▁▁▁██▁▁▁▁▁▁▁▁█ + 7.02 ipc ( 1.41% 
stalls) 0.00% L1 data cache + 118.04k cycles 828.23k instructions 0.00% retired LD/ST ( 0.00) + +unicode-rs/unicode-segmentation (wasm-bindgen) 12.58 µs/iter 12.79 µs █ + (11.04 µs … 104.54 µs) 14.50 µs ▃█▆ + ( 24.00 b … 369.70 kb) 16.18 kb ▁▅▅▄▂▂▁▂████▂▁▁▁▂▂▁▁▁ + 5.73 ipc ( 0.81% stalls) 0.00% L1 data cache + 54.73k cycles 313.66k instructions 0.00% retired LD/ST ( 0.00) + +Intl.Segmenter 6.33 µs/iter 6.37 µs █ + (6.19 µs … 6.48 µs) 6.45 µs ▇ ▂ █▇ ▂ + ( 1.91 kb … 1.92 kb) 1.92 kb ▆▆█▆▁█▁▁▁▆▁▁██▆▁▆▆▆▁█ + 6.92 ipc ( 2.85% stalls) 0.00% L1 data cache + 25.65k cycles 177.63k instructions 0.00% retired LD/ST ( 0.00) + + ┌ ┐ + unicode-segmenter/grapheme ┤ 3.63 µs + graphemer ┤■■■■■■■■■■■■■■■■■■■■■ 29.52 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 45.22 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■ 29.50 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■ 12.58 µs + Intl.Segmenter ┤■■ 6.33 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.74x faster than Intl.Segmenter + 3.47x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 8.12x faster than @formatjs/intl-segmenter + 8.13x faster than graphemer + 12.45x faster than grapheme-splitter + +• Emojis +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 1.36 µs/iter 1.37 µs ▄ █ + (1.34 µs … 1.38 µs) 1.38 µs ▂▇▄█ ▄▄█ + ( 2.60 kb … 2.60 kb) 2.60 kb ▄▆▅▄▂▆██████▅█████▅▄▅ + 7.36 ipc ( 3.53% stalls) 0.00% L1 data cache + 5.50k cycles 40.47k instructions 0.00% retired LD/ST ( 0.00) + +graphemer 8.36 µs/iter 8.39 µs █ █ + (8.21 µs … 8.55 µs) 8.52 µs █ █ █ █ + ( 3.51 kb … 3.55 kb) 3.52 kb █▁▁▁█▁█▁█████▁▁▁█▁▁▁█ + 6.69 ipc ( 1.58% stalls) 0.00% L1 data cache + 33.44k cycles 223.62k instructions 0.00% retired LD/ST ( 0.00) + +grapheme-splitter 17.12 µs/iter 17.17 µs █▆ + (14.42 µs … 266.33 µs) 22.33 µs ██ ██▄ + ( 88.00 b … 1.20 mb) 13.81 kb ▃██▅▆███▅▅▄▃▃▂▂▂▃▃▂▁▁ + 7.67 ipc ( 0.31% stalls) 0.00% L1 data cache + 73.05k cycles 
560.29k instructions 0.00% retired LD/ST ( 0.00) + +@formatjs/intl-segmenter 9.88 µs/iter 9.95 µs ██ █ + (9.66 µs … 10.35 µs) 10.16 µs ▅██ █ ▅ ▅ ▅ ▅ ▅ ▅ + ( 1.40 kb … 1.51 kb) 1.50 kb ███▁█▁▁█▁▁█▁█▁▁█▁█▁▁█ + 6.62 ipc ( 1.58% stalls) 0.00% L1 data cache + 39.83k cycles 263.55k instructions 0.00% retired LD/ST ( 0.00) + +unicode-rs/unicode-segmentation (wasm-bindgen) 4.13 µs/iter 4.15 µs █▅ + (4.01 µs … 4.46 µs) 4.39 µs ▃ ██▃▃ + (794.74 b … 1.02 kb) 950.96 b █████████▄██▁▄▁▁▁▁▁▁▄ + 5.92 ipc ( 0.64% stalls) 0.00% L1 data cache + 16.84k cycles 99.71k instructions 0.00% retired LD/ST ( 0.00) + +Intl.Segmenter 3.13 µs/iter 3.20 µs ▄█ ██ ▄ + (2.89 µs … 3.73 µs) 3.55 µs ██▅██▅█ █ + (457.98 b … 466.41 b) 465.71 b █▅████████▅▁█▅▅▅▁▅▁▅▅ + 5.94 ipc ( 3.08% stalls) 0.00% L1 data cache + 12.81k cycles 76.05k instructions 0.00% retired LD/ST ( 0.00) + + ┌ ┐ + unicode-segmenter/grapheme ┤ 1.36 µs + graphemer ┤■■■■■■■■■■■■■■■ 8.36 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 17.12 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■ 9.88 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■ 4.13 µs + Intl.Segmenter ┤■■■■ 3.13 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 2.3x faster than Intl.Segmenter + 3.03x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 6.15x faster than graphemer + 7.27x faster than @formatjs/intl-segmenter + 12.59x faster than grapheme-splitter + +• Hindi +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 4.37 µs/iter 4.37 µs █ + (4.35 µs … 4.43 µs) 4.42 µs ▃▆▆█ + ( 3.52 kb … 3.53 kb) 3.53 kb ▆█████▆▄▁▁▄▄▁▁▁▁▁▄▁▁▄ + 7.13 ipc ( 2.90% stalls) 0.00% L1 data cache + 17.72k cycles 126.35k instructions 0.00% retired LD/ST ( 0.00) + +graphemer 31.65 µs/iter 31.93 µs █ █ ███ ██ ██ █ █ + (30.74 µs … 32.87 µs) 32.43 µs █ █ ███ ██ ██ █ █ + ( 3.34 kb … 3.40 kb) 3.37 kb █▁█▁▁███▁▁██▁██▁█▁▁▁█ + 6.50 ipc ( 1.57% stalls) 0.00% L1 data cache + 127.76k cycles 829.88k 
instructions 0.00% retired LD/ST ( 0.00) + +grapheme-splitter 49.12 µs/iter 50.88 µs ▂█ + (42.38 µs … 256.29 µs) 62.63 µs ▂██ ██▇▂ + (552.00 b … 360.54 kb) 48.72 kb ▂█████████▆▅▄▃▃▃▃▃▂▂▂ + 6.32 ipc ( 0.49% stalls) 0.00% L1 data cache + 205.43k cycles 1.30M instructions 0.00% retired LD/ST ( 0.00) + +@formatjs/intl-segmenter 33.11 µs/iter 33.57 µs █ █ + (32.34 µs … 34.65 µs) 33.72 µs █ ▅ █▅ ▅ ▅ ▅▅▅ + ( 2.50 kb … 2.60 kb) 2.58 kb █▁█▁██▁▁▁▁▁█▁▁▁▁█▁███ + 6.24 ipc ( 1.20% stalls) 0.00% L1 data cache + 132.44k cycles 826.34k instructions 0.00% retired LD/ST ( 0.00) + +unicode-rs/unicode-segmentation (wasm-bindgen) 13.17 µs/iter 13.24 µs █ + (13.02 µs … 13.31 µs) 13.26 µs ▅ ▅▅ ▅ ▅ ▅ █▅ + ( 1.60 kb … 1.60 kb) 1.60 kb █▁▁██▁▁█▁▁▁▁█▁▁▁█▁▁██ + 5.40 ipc ( 0.59% stalls) 0.00% L1 data cache + 54.48k cycles 294.22k instructions 0.00% retired LD/ST ( 0.00) + +Intl.Segmenter 6.36 µs/iter 6.41 µs ▄ █ + (6.13 µs … 6.62 µs) 6.57 µs █ █ + ( 3.46 kb … 3.47 kb) 3.47 kb ▅▅▁▁▁▁██▅▅█▁▁▅▅▁▁█▅▁▅ + 5.92 ipc ( 2.47% stalls) 0.00% L1 data cache + 25.76k cycles 152.40k instructions 0.00% retired LD/ST ( 0.00) + + ┌ ┐ + unicode-segmenter/grapheme ┤ 4.37 µs + graphemer ┤■■■■■■■■■■■■■■■■■■■■■ 31.65 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 49.12 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■ 33.11 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■ 13.17 µs + Intl.Segmenter ┤■■ 6.36 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.46x faster than Intl.Segmenter + 3.02x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 7.25x faster than graphemer + 7.58x faster than @formatjs/intl-segmenter + 11.25x faster than grapheme-splitter + +• Demonic characters +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 1.27 µs/iter 1.28 µs ▅ █▇ + (1.24 µs … 1.37 µs) 1.36 µs █▂██ + (849.20 b … 857.63 b) 857.05 b ▃▆█████▇█▁▃▁▁▁▂▁▂▁▁▂▂ + 7.75 ipc ( 1.52% stalls) 0.00% L1 data cache + 5.12k cycles 
39.67k instructions 0.00% retired LD/ST ( 0.00) + +graphemer 17.84 µs/iter 18.04 µs █ █ █ + (17.50 µs … 18.19 µs) 18.17 µs ▅█▅ █ ▅ █ ▅▅ + (710.14 b … 751.80 b) 714.38 b ███▁▁▁█▁▁▁▁▁▁█▁▁█▁▁██ + 6.42 ipc ( 1.69% stalls) 0.00% L1 data cache + 70.86k cycles 454.68k instructions 0.00% retired LD/ST ( 0.00) + +grapheme-splitter 12.57 µs/iter 12.65 µs █ ██ ██ ██ █ █ + (12.24 µs … 12.93 µs) 12.88 µs █ ██ ██ ██ █ █ + ( 2.79 kb … 3.01 kb) 2.83 kb █▁▁██▁▁▁██▁▁██▁█▁▁▁▁█ + 6.44 ipc ( 1.35% stalls) 0.00% L1 data cache + 49.85k cycles 320.90k instructions 0.00% retired LD/ST ( 0.00) + +@formatjs/intl-segmenter 11.86 µs/iter 11.95 µs █ + (11.58 µs … 12.27 µs) 12.10 µs █ ▅ ▅▅▅ ▅ ▅▅ ▅ + ( 3.55 kb … 3.57 kb) 3.56 kb █▁▁█▁▁▁▁███▁█▁██▁▁▁▁█ + 6.28 ipc ( 0.64% stalls) 0.00% L1 data cache + 47.82k cycles 300.54k instructions 0.00% retired LD/ST ( 0.00) + +unicode-rs/unicode-segmentation (wasm-bindgen) 2.49 µs/iter 2.51 µs █ + (2.40 µs … 2.53 µs) 2.53 µs █▅██▂▇█▅ + ( 1.20 kb … 1.31 kb) 1.28 kb ▃▁▃▆▁▁▃▆▃▆▁█████████▃ + 5.23 ipc ( 0.35% stalls) 0.00% L1 data cache + 10.47k cycles 54.72k instructions 0.00% retired LD/ST ( 0.00) + +Intl.Segmenter 1.87 µs/iter 3.25 µs █ + (1.07 µs … 3.84 µs) 3.67 µs █ + ( 1.22 kb … 1.23 kb) 1.23 kb ▄█▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▇▂ + 4.76 ipc ( 4.37% stalls) 0.00% L1 data cache + 7.96k cycles 37.92k instructions 0.00% retired LD/ST ( 0.00) + + ┌ ┐ + unicode-segmenter/grapheme ┤ 1.27 µs + graphemer ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 17.84 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■ 12.57 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■ 11.86 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■ 2.49 µs + Intl.Segmenter ┤■ 1.87 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.47x faster than Intl.Segmenter + 1.95x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 9.31x faster than @formatjs/intl-segmenter + 9.87x faster than grapheme-splitter + 14.01x faster than graphemer + +• Tweet text (combined) 
+------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 5.76 µs/iter 5.79 µs █ + (5.67 µs … 5.81 µs) 5.80 µs █▃ + (307.43 b … 317.19 b) 315.03 b █▁▄▁▄▁█▁▄▁█▁▁▁▄▄█▄██▄ + 7.72 ipc ( 4.20% stalls) 0.00% L1 data cache + 23.29k cycles 179.88k instructions 0.00% retired LD/ST ( 0.00) + +graphemer 42.40 µs/iter 42.29 µs ▄█ + (37.00 µs … 105.92 µs) 51.67 µs ██ + ( 2.88 kb … 1.62 mb) 111.19 kb ▁▁▁▁▁▃██▆▁▂▂▁▁▁▁▁▁▁▁▁ + 6.61 ipc ( 1.60% stalls) 0.00% L1 data cache + 174.90k cycles 1.16M instructions 0.00% retired LD/ST ( 0.00) + +grapheme-splitter 91.44 µs/iter 96.67 µs █▆ + (80.75 µs … 236.42 µs) 118.00 µs ██▆ ▅ + (432.00 b … 505.51 kb) 63.55 kb ▆████▆▆▅█▇▆▆▅▄▃▃▃▂▂▁▁ + 5.46 ipc ( 0.40% stalls) 0.00% L1 data cache + 381.95k cycles 2.09M instructions 0.00% retired LD/ST ( 0.00) + +@formatjs/intl-segmenter 46.96 µs/iter 47.19 µs █ + (46.22 µs … 48.30 µs) 47.64 µs ▅▅▅ █ ▅ ▅▅▅ ▅ ▅ + ( 1.10 kb … 1.43 kb) 1.39 kb ███▁▁█▁▁█▁▁▁███▁█▁▁▁█ + 6.60 ipc ( 1.35% stalls) 0.00% L1 data cache + 187.33k cycles 1.24M instructions 0.00% retired LD/ST ( 0.00) + +unicode-rs/unicode-segmentation (wasm-bindgen) 18.62 µs/iter 18.72 µs █ █ + (18.34 µs … 18.95 µs) 18.81 µs █ ▅ ▅▅ █▅ ▅ ▅ ▅ + (132.58 b … 134.32 b) 133.07 b █▁▁▁█▁▁▁▁██▁▁██▁█▁█▁█ + 5.79 ipc ( 0.85% stalls) 0.00% L1 data cache + 76.26k cycles 441.48k instructions 0.00% retired LD/ST ( 0.00) + +Intl.Segmenter 9.48 µs/iter 9.50 µs █ + (9.41 µs … 9.59 µs) 9.55 µs █ ▅ ▅▅▅ ▅▅▅▅▅ ▅▅ ▅ + (183.14 b … 192.57 b) 190.81 b █▁█▁███▁▁█████▁▁▁██▁█ + 6.48 ipc ( 2.67% stalls) 0.00% L1 data cache + 38.37k cycles 248.54k instructions 0.00% retired LD/ST ( 0.00) + + ┌ ┐ + unicode-segmenter/grapheme ┤ 5.76 µs + graphemer ┤■■■■■■■■■■■■■■■ 42.40 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 91.44 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■ 46.96 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■ 18.62 µs + Intl.Segmenter ┤■ 9.48 µs + └ ┘ + +summary + 
unicode-segmenter/grapheme + 1.65x faster than Intl.Segmenter + 3.24x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 7.37x faster than graphemer + 8.16x faster than @formatjs/intl-segmenter + 15.89x faster than grapheme-splitter + +• Code snippet (combined) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 13.84 µs/iter 13.87 µs █ + (13.81 µs … 13.88 µs) 13.87 µs █ █ + ( 2.30 kb … 2.31 kb) 2.30 kb ██▁▁▁▁▁█▁█▁▁▁▁▁▁▁▁▁▁█ + 7.51 ipc ( 3.62% stalls) 0.00% L1 data cache + 55.75k cycles 418.85k instructions 0.00% retired LD/ST ( 0.00) + +graphemer 103.06 µs/iter 104.29 µs █▄ + (88.42 µs … 230.17 µs) 160.42 µs ██ + ( 64.04 kb … 2.11 mb) 264.75 kb ▁▂████▅▂▁▁▁▁▁▁▁▁▁▁▁▁▁ + 6.62 ipc ( 1.62% stalls) 0.00% L1 data cache + 412.33k cycles 2.73M instructions 0.00% retired LD/ST ( 0.00) + +grapheme-splitter 211.69 µs/iter 217.75 µs █▇▇▅ + (188.21 µs … 378.63 µs) 275.17 µs ▇████▅▃ + ( 22.77 kb … 786.17 kb) 151.19 kb ▇███████▇▅▅▅▄▄▃▃▂▂▂▂▁ + 5.46 ipc ( 0.44% stalls) 0.00% L1 data cache + 890.07k cycles 4.86M instructions 0.00% retired LD/ST ( 0.00) + +@formatjs/intl-segmenter 109.64 µs/iter 111.08 µs █ + (93.92 µs … 248.63 µs) 183.96 µs █▄ + ( 22.49 kb … 776.33 kb) 300.25 kb ▃▇▇██▅▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁ + 6.47 ipc ( 1.46% stalls) 0.00% L1 data cache + 443.15k cycles 2.87M instructions 0.00% retired LD/ST ( 0.00) + +unicode-rs/unicode-segmentation (wasm-bindgen) 43.99 µs/iter 44.10 µs █ █ + (43.31 µs … 45.11 µs) 44.54 µs ▅ ▅ █ ▅▅█ ▅▅ ▅ + ( 2.71 kb … 2.72 kb) 2.72 kb █▁▁▁█▁█▁▁███▁██▁▁▁▁▁█ + 5.78 ipc ( 1.02% stalls) 0.00% L1 data cache + 180.13k cycles 1.04M instructions 0.00% retired LD/ST ( 0.00) + +Intl.Segmenter 20.71 µs/iter 20.67 µs █ █ + (20.49 µs … 21.50 µs) 21.20 µs ██ █ + ( 1.12 kb … 1.13 kb) 1.13 kb ██▁█▁█▁▁█▁▁▁▁▁▁▁▁▁▁▁█ + 6.70 ipc ( 2.48% stalls) 0.00% L1 data cache + 83.27k cycles 557.47k instructions 0.00% retired LD/ST ( 0.00) + + ┌ ┐ + unicode-segmenter/grapheme ┤ 13.84 µs + graphemer 
┤■■■■■■■■■■■■■■■ 103.06 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 211.69 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■ 109.64 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■ 43.99 µs + Intl.Segmenter ┤■ 20.71 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.5x faster than Intl.Segmenter + 3.18x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 7.45x faster than graphemer + 7.92x faster than @formatjs/intl-segmenter + 15.3x faster than grapheme-splitter diff --git a/benchmark/grapheme/_records/20250307-apple_m4_pro-macos_15.3-safari_18.3.txt b/benchmark/grapheme/_records/20250307-apple_m4_pro-macos_15.3-safari_18.3.txt new file mode 100644 index 0000000..1854598 --- /dev/null +++ b/benchmark/grapheme/_records/20250307-apple_m4_pro-macos_15.3-safari_18.3.txt @@ -0,0 +1,197 @@ +clk: ~4.25 GHz +cpu: null +runtime: null (null) + +benchmark avg (min … max) p75 / p99 (min … top 1%) +------------------------------------------------------------- ------------------------------- +• Lorem ipsum (ascii) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 2.95 µs/iter 2.93 µs █ + (2.69 µs … 3.42 µs) 3.17 µs ▂▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▃ +graphemer 13.65 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ +grapheme-splitter 68.97 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂ +@formatjs/intl-segmenter 67.99 µs/iter 0.00 ps █ + (0.00 ps … 24.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂ +unicode-rs/unicode-segmentation (wasm-bindgen) 30.27 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ +Intl.Segmenter 6.18 µs/iter 6.35 µs █ ▂ + (5.86 µs … 6.59 µs) 6.35 µs ▄▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁█ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 2.95 µs + graphemer ┤■■■■■■ 13.65 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 68.97 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 67.99 µs +unicode-rs/unicode-segmentation 
(wasm-bindgen) ┤■■■■■■■■■■■■■■ 30.27 µs + Intl.Segmenter ┤■■ 6.18 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 2.1x faster than Intl.Segmenter + 4.63x faster than graphemer + 10.26x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 23.05x faster than @formatjs/intl-segmenter + 23.38x faster than grapheme-splitter + +• Emojis +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 1.86 µs/iter 1.95 µs ▃ █ + (1.71 µs … 1.95 µs) 1.95 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ +graphemer 4.19 µs/iter 4.15 µs █ + (4.15 µs … 4.39 µs) 4.39 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ +grapheme-splitter 17.42 µs/iter 17.82 µs ▃ █ ▃ + (16.60 µs … 19.04 µs) 18.55 µs █▁▁█▁█▁▁▆▁▁▁▁▆▁▁▁▆▁▁▆ +@formatjs/intl-segmenter 19.80 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ +unicode-rs/unicode-segmentation (wasm-bindgen) 8.86 µs/iter 8.79 µs █ + (8.79 µs … 9.03 µs) 9.03 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▆ +Intl.Segmenter 2.57 µs/iter 2.69 µs ▇ █ + (2.20 µs … 2.93 µs) 2.93 µs ▂▁▁▁▁▁▁█▁▁▁▁▁█▁▁▁▁▁▁▂ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 1.86 µs + graphemer ┤■■■■ 4.19 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 17.42 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 19.80 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■■■■■■ 8.86 µs + Intl.Segmenter ┤■ 2.57 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.38x faster than Intl.Segmenter + 2.26x faster than graphemer + 4.77x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 9.38x faster than grapheme-splitter + 10.66x faster than @formatjs/intl-segmenter + +• Hindi +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 5.40 µs/iter 5.37 µs █ + (5.13 µs … 5.62 µs) 5.62 µs ▃▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▄ +graphemer 15.61 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ +grapheme-splitter 25.70 µs/iter 26.12 µs █ + (24.90 µs … 27.34 µs) 26.61 µs 
▅▁▁█▁▁█▁▁▅▁▁▁▁▅▁▁▅▁▁▅ +@formatjs/intl-segmenter 62.24 µs/iter 62.74 µs ▃ █ + (59.57 µs … 63.72 µs) 63.48 µs ▆▁▁▁▁▁▁▆▁▁▁▆█▁▁█▆▆▁▁▆ +unicode-rs/unicode-segmentation (wasm-bindgen) 25.61 µs/iter 25.63 µs █ █ + (25.39 µs … 26.37 µs) 25.88 µs █▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▄ +Intl.Segmenter 5.19 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 0.00 ps █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 5.40 µs + graphemer ┤■■■■■■ 15.61 µs + grapheme-splitter ┤■■■■■■■■■■■■ 25.70 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 62.24 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■■■■■ 25.61 µs + Intl.Segmenter ┤ 5.19 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.04x slower than Intl.Segmenter + 2.89x faster than graphemer + 4.74x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 4.76x faster than grapheme-splitter + 11.52x faster than @formatjs/intl-segmenter + +• Demonic characters +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 1.70 µs/iter 1.71 µs █ + (1.46 µs … 1.71 µs) 1.71 µs ▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ +graphemer 6.84 µs/iter 6.84 µs █ + (6.59 µs … 7.32 µs) 7.08 µs █▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▆ +grapheme-splitter 5.52 µs/iter 5.62 µs █ + (5.37 µs … 5.86 µs) 5.86 µs █▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▄ +@formatjs/intl-segmenter 62.42 µs/iter 63.96 µs █ █ █ █ + (60.55 µs … 65.92 µs) 64.94 µs ██▁█▁▁█▁█▁▁▁▁▁▁▁█▁▁▁█ +unicode-rs/unicode-segmentation (wasm-bindgen) 3.32 µs/iter 3.42 µs ▃ █ + (3.17 µs … 3.42 µs) 3.42 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ +Intl.Segmenter 1.30 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 0.00 ps █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 1.70 µs + graphemer ┤■■■ 6.84 µs + grapheme-splitter ┤■■ 5.52 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 62.42 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■ 3.32 µs + Intl.Segmenter ┤ 1.30 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.31x slower than Intl.Segmenter + 1.96x faster than 
unicode-rs/unicode-segmentation (wasm-bindgen) + 3.25x faster than grapheme-splitter + 4.03x faster than graphemer + 36.76x faster than @formatjs/intl-segmenter + +• Tweet text (combined) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 7.11 µs/iter 7.08 µs █ + (6.84 µs … 7.57 µs) 7.32 µs ▅▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▅ +graphemer 21.34 µs/iter 21.48 µs ▃ █ █ + (20.26 µs … 23.68 µs) 21.97 µs ▆▁▁▁▁▁█▁▁█▁▆▁▁█▁▁▁▁▁▆ +grapheme-splitter 45.21 µs/iter 44.92 µs █ + (44.19 µs … 48.34 µs) 46.14 µs ▅▁▅▅▁█▁█▁▁▁▁▁▁▁▁▁▅▁▁▅ +@formatjs/intl-segmenter 83.55 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂ +unicode-rs/unicode-segmentation (wasm-bindgen) 44.33 µs/iter 44.68 µs █ █ █ + (43.21 µs … 45.17 µs) 45.17 µs █▁▁█▁▁▁▁█▁▁▁██▁█▁▁█▁█ +Intl.Segmenter 9.24 µs/iter 9.52 µs █ + (9.03 µs … 9.77 µs) 9.52 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 7.11 µs + graphemer ┤■■■■■■ 21.34 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■ 45.21 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 83.55 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■■■■■■■■■■ 44.33 µs + Intl.Segmenter ┤■ 9.24 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.3x faster than Intl.Segmenter + 3x faster than graphemer + 6.24x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 6.36x faster than grapheme-splitter + 11.76x faster than @formatjs/intl-segmenter + +• Code snippet (combined) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 15.88 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ +graphemer 47.69 µs/iter 47.61 µs █ + (46.88 µs … 50.29 µs) 48.83 µs ▄▁▁█▁▇▁▁▄▁▁▁▁▁▁▄▁▁▁▁▄ +grapheme-splitter 105.55 µs/iter 106.69 µs █ █ █ + (101.32 µs … 113.04 µs) 111.33 µs █▁██▁▁█▁█▁▁█▁▁▁▁▁▁▁██ +@formatjs/intl-segmenter 195.56 µs/iter 199.22 µs █ ▃ + (191.16 µs … 201.90 µs) 199.46 µs 
▆▁█▁▁▁▆▁▆▆▁▁▁▁▆▁▁▁▁█▆ +unicode-rs/unicode-segmentation (wasm-bindgen) 104.02 µs/iter 104.98 µs █ █ + (102.05 µs … 105.71 µs) 105.47 µs █▁▁▁█▁█▁▁███▁▁▁▁▁█▁██ +Intl.Segmenter 20.12 µs/iter 20.51 µs █ ▄ + (19.53 µs … 21.00 µs) 20.51 µs ▅▁▁▁▁█▁▁▁▁█▁▁▁▁▅▁▁▁▁█ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 15.88 µs + graphemer ┤■■■■■■ 47.69 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■ 105.55 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 195.56 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■■■■■■■■■■ 104.02 µs + Intl.Segmenter ┤■ 20.12 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.27x faster than Intl.Segmenter + 3x faster than graphemer + 6.55x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 6.65x faster than grapheme-splitter + 12.32x faster than @formatjs/intl-segmenter diff --git a/benchmark/grapheme/_records/20250307-intel_x86_64-linux-bun_1.2.4.txt b/benchmark/grapheme/_records/20250307-intel_x86_64-linux-bun_1.2.4.txt new file mode 100644 index 0000000..53d201e --- /dev/null +++ b/benchmark/grapheme/_records/20250307-intel_x86_64-linux-bun_1.2.4.txt @@ -0,0 +1,335 @@ +clk: ~4.49 GHz +cpu: Intel(R) Core(TM) Ultra 7 258V +runtime: bun 1.2.4 (x64-linux) + +benchmark avg (min … max) p75 / p99 (min … top 1%) +------------------------------------------------------------- ------------------------------- +• Lorem ipsum (ascii) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 4.49 µs/iter 4.62 µs ▂ ██▂ + (4.22 µs … 4.70 µs) 4.69 µs ▅ ▅▅▅█▅ ▅ ███ ▅ + ( 0.00 b … 1.94 kb) 73.14 b █▁▇█████▇▁▇▁▇█▁▇███▇█ + 6.48 ipc ( 53.55% cache) 10.63 branch misses + 17.01k cycles 110.27k instructions 265.44 c-refs 123.30 c-misses + +graphemer 18.16 µs/iter 17.81 µs █▆ + (12.99 µs … 1.22 ms) 28.48 µs ▇██▆ + ( 0.00 b … 1.00 mb) 1.43 kb ▁▂▄████▇▄▃▂▂▂▂▂▂▂▂▁▁▁ + 5.81 ipc ( 53.90% cache) 37.45 branch misses + 69.27k cycles 402.78k instructions 1.25k c-refs 574.10 c-misses + 
+grapheme-splitter 113.66 µs/iter 116.92 µs █ + (94.14 µs … 448.14 µs) 168.95 µs ▃▂█▆▇ + ( 0.00 b … 101.85 mb) 112.23 kb ▂▅██████▅▃▂▂▁▁▁▁▁▁▁▁▁ + 2.77 ipc ( 62.24% cache) 84.42 branch misses + 462.88k cycles 1.28M instructions 731.00 c-refs 276.03 c-misses + +@formatjs/intl-segmenter 90.25 µs/iter 72.53 µs ▇█▃ + (58.34 µs … 76.99 ms) 96.33 µs ████▇▂ + ( 0.00 b … 179.58 mb) 93.58 kb ▁▂▅██████▆▄▃▄▄▃▂▂▂▁▁▁ + 5.10 ipc ( 61.62% cache) 597.04 branch misses + 267.25k cycles 1.36M instructions 1.11k c-refs 427.39 c-misses + +unicode-rs/unicode-segmentation (wasm-bindgen) 16.82 µs/iter 16.91 µs ▇█▂ + (11.74 µs … 1.25 ms) 23.95 µs ▃███▅ + ( 0.00 b … 3.25 mb) 4.92 kb ▁▁▁▄██████▇▅▄▃▃▂▂▁▁▁▁ + 4.50 ipc ( 52.41% cache) 59.91 branch misses + 62.19k cycles 279.95k instructions 258.29 c-refs 122.92 c-misses + +Intl.Segmenter 7.92 µs/iter 8.12 µs █ + (7.63 µs … 8.20 µs) 8.19 µs ██ ██ + ( 0.00 b … 3.88 kb) 260.00 b █▁███▁▁█▁██▁█▁█▁▁█▁██ + 4.89 ipc ( 40.06% cache) 35.87 branch misses + 30.50k cycles 149.20k instructions 461.78 c-refs 276.77 c-misses + + ┌ ┐ + unicode-segmenter/grapheme ┤ 4.49 µs + graphemer ┤■■■■ 18.16 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 113.66 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■ 90.25 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■ 16.82 µs + Intl.Segmenter ┤■ 7.92 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.77x faster than Intl.Segmenter + 3.75x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 4.05x faster than graphemer + 20.12x faster than @formatjs/intl-segmenter + 25.33x faster than grapheme-splitter + +• Emojis +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 2.41 µs/iter 2.47 µs █ ▂▂ + (2.17 µs … 3.07 µs) 2.86 µs ▆█ ▆██ ▃ + ( 0.00 b … 5.38 kb) 148.48 b ▃▇██▅▇█████▃▅▅▅▃▁▁▁▁▃ + 5.14 ipc ( 52.63% cache) 19.66 branch misses + 9.23k cycles 47.47k instructions 171.55 c-refs 81.26 c-misses + +graphemer 5.72 µs/iter 5.92 
µs ██ + (5.48 µs … 6.06 µs) 6.03 µs ▅██ ▅▅ ▅ + ( 0.00 b … 9.48 kb) 423.04 b ▇███▇▇▁▁▇▇▁▁▁▇▁██▇▁▇█ + 5.68 ipc ( 59.52% cache) 44.95 branch misses + 21.58k cycles 122.63k instructions 397.16 c-refs 160.77 c-misses + +grapheme-splitter 28.83 µs/iter 29.18 µs █ █ + (27.62 µs … 30.44 µs) 29.71 µs ▅ ▅ ▅ █▅ ▅ ▅█ ▅ + ( 0.00 b … 3.81 kb) 714.11 b █▁█▁▁█▁▁▁██▁█▁██▁▁▁▁█ + 2.34 ipc ( 66.35% cache) 85.68 branch misses + 123.87k cycles 289.43k instructions 218.89 c-refs 73.65 c-misses + +@formatjs/intl-segmenter 22.73 µs/iter 23.05 µs █ + (21.54 µs … 24.17 µs) 23.14 µs ▅ █▅ + ( 0.00 b … 6.85 kb) 755.00 b █▁▁▇▁▁▁▁▁▁▇▁▁▁▁▁▁▁▇██ + 4.76 ipc ( 57.73% cache) 184.13 branch misses + 89.71k cycles 427.16k instructions 354.00 c-refs 149.63 c-misses + +unicode-rs/unicode-segmentation (wasm-bindgen) 5.34 µs/iter 5.40 µs █ ██ + (4.82 µs … 6.55 µs) 6.53 µs █ ██ + ( 0.00 b … 4.59 kb) 835.55 b █▅▁▅██▁█▁▁▅▅▁▁▁▁▅▁▁▁█ + 4.87 ipc ( 49.84% cache) 17.21 branch misses + 19.89k cycles 96.84k instructions 113.91 c-refs 57.14 c-misses + +Intl.Segmenter 3.30 µs/iter 3.42 µs ▄▄ ▄█▄█ + (2.96 µs … 4.23 µs) 4.13 µs ██ ████▅█▅ + ( 0.00 b … 10.13 kb) 413.88 b ▅██████████▅▁▁▁▁▁▁▁▁▅ + 3.90 ipc ( 45.06% cache) 13.07 branch misses + 12.95k cycles 50.46k instructions 243.15 c-refs 133.60 c-misses + + ┌ ┐ + unicode-segmenter/grapheme ┤ 2.41 µs + graphemer ┤■■■■ 5.72 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 28.83 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■ 22.73 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■ 5.34 µs + Intl.Segmenter ┤■ 3.30 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.37x faster than Intl.Segmenter + 2.22x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 2.38x faster than graphemer + 9.44x faster than @formatjs/intl-segmenter + 11.97x faster than grapheme-splitter + +• Hindi +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 6.31 µs/iter 6.34 µs █ ▂ ▂█ + (6.20 µs … 6.67 
µs) 6.64 µs █▅ █▅██ ▅ + ( 0.00 b … 8.63 kb) 448.00 b ██▇████▁█▁▁▁▁▁▁▁▁▁▁▁▇ + 5.34 ipc ( 56.22% cache) 62.60 branch misses + 24.62k cycles 131.40k instructions 360.66 c-refs 157.90 c-misses + +graphemer 20.78 µs/iter 20.59 µs █ █████ █ ███ █ + (19.51 µs … 27.41 µs) 21.25 µs █ █████ █ ███ █ + ( 0.00 b … 17.01 kb) 2.13 kb █▁█████▁█▁▁███▁▁▁▁▁▁█ + 5.39 ipc ( 62.77% cache) 222.11 branch misses + 77.42k cycles 417.58k instructions 1.27k c-refs 473.04 c-misses + +grapheme-splitter 36.35 µs/iter 35.92 µs █ + (35.20 µs … 43.97 µs) 37.02 µs ▅ █▅ + ( 0.00 b … 5.74 kb) 988.00 b █▁██▁▁▁▁▇▁▁▁▇▁▁▁▁▁▁▁▇ + 4.08 ipc ( 66.63% cache) 174.28 branch misses + 154.61k cycles 630.70k instructions 734.46 c-refs 245.08 c-misses + +@formatjs/intl-segmenter 69.82 µs/iter 70.58 µs █ █ █ + (67.66 µs … 73.07 µs) 72.22 µs ▅█ ▅▅ █ █ ▅ ▅ + ( 0.00 b … 8.08 kb) 933.67 b ██▁▁▁██▁█▁▁▁▁█▁█▁▁▁▁█ + 4.91 ipc ( 55.64% cache) 609.17 branch misses + 277.26k cycles 1.36M instructions 985.26 c-refs 437.07 c-misses + +unicode-rs/unicode-segmentation (wasm-bindgen) 16.92 µs/iter 16.75 µs █ + (15.74 µs … 20.18 µs) 20.01 µs █ ██ + ( 0.00 b … 9.09 kb) 1.91 kb ██▁████▁▁▁▁▁▁▁▁▁▁▁▁▁█ + 4.36 ipc ( 49.78% cache) 102.60 branch misses + 65.10k cycles 283.98k instructions 270.56 c-refs 135.88 c-misses + +Intl.Segmenter 7.49 µs/iter 7.54 µs █ ██ █ + (7.38 µs … 7.62 µs) 7.61 µs █▅▅▅▅ ▅ ██▅ ▅▅█ ▅ ▅ + ( 0.00 b … 7.16 kb) 595.25 b █████▁▁█▁███▁███▁▁█▁█ + 3.81 ipc ( 34.71% cache) 66.06 branch misses + 30.37k cycles 115.57k instructions 542.79 c-refs 354.39 c-misses + + ┌ ┐ + unicode-segmenter/grapheme ┤ 6.31 µs + graphemer ┤■■■■■■■■ 20.78 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■ 36.35 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 69.82 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■ 16.92 µs + Intl.Segmenter ┤■ 7.49 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.19x faster than Intl.Segmenter + 2.68x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 3.29x faster than graphemer + 5.76x 
faster than grapheme-splitter + 11.06x faster than @formatjs/intl-segmenter + +• Demonic characters +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 2.08 µs/iter 2.11 µs █ + (1.91 µs … 3.26 µs) 3.09 µs █▃▅▃ + ( 0.00 b … 5.81 kb) 236.56 b ▅████▃▂▁▁▁▁▂▁▁▁▁▁▁▁▁▂ + 5.92 ipc ( 43.37% cache) 10.43 branch misses + 7.68k cycles 45.42k instructions 122.39 c-refs 69.31 c-misses + +graphemer 9.95 µs/iter 10.22 µs █ + (9.53 µs … 10.36 µs) 10.36 µs █ █ + ( 0.00 b … 26.18 kb) 2.18 kb █▁██▁▁▁▁█▁▁▁▁▁▁████▁█ + 5.53 ipc ( 57.53% cache) 32.35 branch misses + 36.94k cycles 204.24k instructions 796.27 c-refs 338.14 c-misses + +grapheme-splitter 7.69 µs/iter 7.94 µs ▂ █ + (7.33 µs … 8.01 µs) 7.97 µs ▅ █ ▅ █▅ + ( 0.00 b … 4.50 kb) 230.40 b █▁█▇▇▁▇▁▁▁▁▁▁█▇▁▇▁▁██ + 5.13 ipc ( 63.10% cache) 17.87 branch misses + 29.97k cycles 153.85k instructions 406.30 c-refs 149.91 c-misses + +@formatjs/intl-segmenter 48.00 µs/iter 48.14 µs █ + (45.33 µs … 51.26 µs) 50.68 µs ▅█ + ( 0.00 b … 5.25 kb) 697.44 b ▇▁▇▁▁▁▇▁▁██▇▁▁▁▁▁▁▁▁▇ + 4.62 ipc ( 40.99% cache) 254.19 branch misses + 196.70k cycles 907.89k instructions 162.45 c-refs 95.87 c-misses + +unicode-rs/unicode-segmentation (wasm-bindgen) 3.47 µs/iter 3.57 µs █▅ █▅ + (3.30 µs … 3.75 µs) 3.72 µs ▃██▃ ██ + ( 0.00 b … 1.47 kb) 510.75 b ████▄▄▄▄▁▁▄▄████▁▄▁▄▄ + 4.01 ipc ( 52.96% cache) 9.99 branch misses + 13.43k cycles 53.79k instructions 24.30 c-refs 11.43 c-misses + +Intl.Segmenter 1.74 µs/iter 1.95 µs █▇ + (1.34 µs … 2.69 µs) 2.66 µs ▄███ + ( 0.00 b … 5.34 kb) 605.68 b █████▁▂▄▆██▄▅▁▁▆▅▁▁▄▅ + 2.79 ipc ( 41.39% cache) 4.73 branch misses + 6.61k cycles 18.48k instructions 135.79 c-refs 79.59 c-misses + + ┌ ┐ + unicode-segmenter/grapheme ┤ 2.08 µs + graphemer ┤■■■■■■ 9.95 µs + grapheme-splitter ┤■■■■ 7.69 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 48.00 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■ 3.47 µs + Intl.Segmenter ┤ 1.74 µs + └ ┘ + +summary + 
unicode-segmenter/grapheme + 1.19x slower than Intl.Segmenter + 1.67x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 3.7x faster than grapheme-splitter + 4.79x faster than graphemer + 23.13x faster than @formatjs/intl-segmenter + +• Tweet text (combined) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 8.04 µs/iter 8.31 µs █ █ + (7.75 µs … 8.40 µs) 8.40 µs █ █ █ █ █ + ( 0.00 b … 5.13 kb) 286.32 b █▁████▁██▁▁▁▁▁▁▁▁█▁██ + 5.78 ipc ( 55.79% cache) 29.26 branch misses + 30.77k cycles 177.79k instructions 557.76 c-refs 246.60 c-misses + +graphemer 30.06 µs/iter 30.23 µs █ + (28.24 µs … 36.66 µs) 30.27 µs █ + ( 0.00 b … 14.74 kb) 1.85 kb ██▁▁█▁▁▁▁▁█▁███▁▁▁▁██ + 5.57 ipc ( 53.42% cache) 249.14 branch misses + 109.48k cycles 610.12k instructions 1.93k c-refs 900.79 c-misses + +grapheme-splitter 68.69 µs/iter 69.66 µs █▃▅ + (58.25 µs … 762.55 µs) 95.25 µs ▅███▅ + ( 0.00 b … 256.00 kb) 2.55 kb ▁▄█████▆▄▆▅▄▃▂▂▁▁▁▁▁▁ + 3.54 ipc ( 63.91% cache) 338.86 branch misses + 291.48k cycles 1.03M instructions 1.14k c-refs 410.20 c-misses + +@formatjs/intl-segmenter 104.02 µs/iter 106.59 µs ▄▅██ + (80.57 µs … 996.75 µs) 134.73 µs ▂█████▂ + ( 0.00 b … 128.00 kb) 4.47 kb ▁▁▁▂▃███████▆▃▂▁▁▁▁▁▁ + 4.98 ipc ( 59.76% cache) 984.69 branch misses + 397.71k cycles 1.98M instructions 1.67k c-refs 671.06 c-misses + +unicode-rs/unicode-segmentation (wasm-bindgen) 24.17 µs/iter 24.61 µs █ █ + (22.61 µs … 28.31 µs) 24.89 µs ▅▅ █ ▅ █▅ ▅▅▅ + ( 0.00 b … 11.87 kb) 1.36 kb ██▁█▁▁▁▁▁▁█▁▁██▁▁▁███ + 4.68 ipc ( 49.08% cache) 97.79 branch misses + 89.11k cycles 417.27k instructions 492.37 c-refs 250.73 c-misses + +Intl.Segmenter 12.69 µs/iter 12.77 µs █ + (12.29 µs … 12.97 µs) 12.93 µs ▅ ▅ ▅ ▅▅ █ ▅ ▅ + ( 0.00 b … 10.45 kb) 1.17 kb █▁█▁▁▁▁▁▁█▁▁██▁█▁▁█▁█ + 4.43 ipc ( 33.87% cache) 79.36 branch misses + 46.57k cycles 206.44k instructions 778.52 c-refs 514.82 c-misses + + ┌ ┐ + unicode-segmenter/grapheme ┤ 8.04 µs + graphemer 
┤■■■■■■■■ 30.06 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■ 68.69 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 104.02 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■ 24.17 µs + Intl.Segmenter ┤■■ 12.69 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.58x faster than Intl.Segmenter + 3.01x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 3.74x faster than graphemer + 8.55x faster than grapheme-splitter + 12.94x faster than @formatjs/intl-segmenter + +• Code snippet (combined) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 19.51 µs/iter 19.86 µs █ █ + (18.70 µs … 20.87 µs) 20.21 µs ▅▅▅ █ █▅ ▅ ▅ ▅ + ( 64.00 b … 3.91 kb) 635.43 b ███▁▁▁█▁▁▁██▁▁▁█▁▁█▁█ + 5.74 ipc ( 51.88% cache) 105.21 branch misses + 72.12k cycles 413.89k instructions 1.21k c-refs 581.29 c-misses + +graphemer 72.33 µs/iter 71.25 µs █▆ + (50.92 µs … 766.96 µs) 139.82 µs ▃██▃ + ( 0.00 b … 512.00 kb) 5.72 kb ▂▃████▅▂▁▁▁▁▁▁▁▁▂▂▂▁▁ + 5.39 ipc ( 57.99% cache) 603.49 branch misses + 268.69k cycles 1.45M instructions 4.54k c-refs 1.91k c-misses + +grapheme-splitter 160.86 µs/iter 161.87 µs █ + (133.35 µs … 941.92 µs) 208.62 µs ▄█▇ + ( 0.00 b … 256.00 kb) 5.69 kb ▁▁▃▄▅███▆▄▄▃▃▂▂▁▁▁▁▁▁ + 3.61 ipc ( 62.76% cache) 707.52 branch misses + 659.44k cycles 2.38M instructions 2.66k c-refs 991.46 c-misses + +@formatjs/intl-segmenter 248.60 µs/iter 250.84 µs █ + (194.98 µs … 1.10 ms) 402.79 µs ██ + ( 0.00 b … 128.00 kb) 10.21 kb ▁▁▁▇██▇▄▃▂▁▁▁▁▁▁▁▁▁▁▁ + 5.00 ipc ( 59.32% cache) 2.25k branch misses + 904.86k cycles 4.52M instructions 3.98k c-refs 1.62k c-misses + +unicode-rs/unicode-segmentation (wasm-bindgen) 54.73 µs/iter 55.95 µs █ █ █ + (52.32 µs … 57.63 µs) 56.16 µs █▅ █ ▅ ▅▅▅█ + ( 0.00 b … 13.72 kb) 1.25 kb ██▁▁▁▁▁▁█▁▁█▁▁▁▁▁████ + 4.64 ipc ( 50.03% cache) 214.52 branch misses + 211.18k cycles 980.44k instructions 1.21k c-refs 607.07 c-misses + +Intl.Segmenter 27.40 µs/iter 27.91 µs █ █ + (26.42 
µs … 29.26 µs) 28.64 µs ▅ ▅▅█ █▅ ▅▅ ▅ + ( 32.00 b … 10.87 kb) 1.27 kb █▁███▁██▁▁▁▁▁██▁▁▁▁▁█ + 4.74 ipc ( 32.43% cache) 194.55 branch misses + 102.85k cycles 487.97k instructions 1.52k c-refs 1.03k c-misses + + ┌ ┐ + unicode-segmenter/grapheme ┤ 19.51 µs + graphemer ┤■■■■■■■■ 72.33 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■ 160.86 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 248.60 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■ 54.73 µs + Intl.Segmenter ┤■ 27.40 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.4x faster than Intl.Segmenter + 2.81x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 3.71x faster than graphemer + 8.25x faster than grapheme-splitter + 12.74x faster than @formatjs/intl-segmenter diff --git a/benchmark/grapheme/_records/20250307-intel_x86_64-linux-chrome_133.0.6943.141.txt b/benchmark/grapheme/_records/20250307-intel_x86_64-linux-chrome_133.0.6943.141.txt new file mode 100644 index 0000000..ea588c9 --- /dev/null +++ b/benchmark/grapheme/_records/20250307-intel_x86_64-linux-chrome_133.0.6943.141.txt @@ -0,0 +1,197 @@ +clk: ~4.40 GHz +cpu: null +runtime: null (null) + +benchmark avg (min … max) p75 / p99 (min … top 1%) +------------------------------------------------------------- ------------------------------- +• Lorem ipsum (ascii) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 3.75 µs/iter 3.76 µs ▅ ▅ █ ▄ + (3.69 µs … 3.81 µs) 3.81 µs ▄▁▁▁█▁▁▁█▁▁▁█▁▁▁█▁▁▁▄ +graphemer 34.41 µs/iter 100.00 µs █ + (0.00 ps … 300.00 µs) 100.00 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ +grapheme-splitter 46.89 µs/iter 100.00 µs █ ▄ + (0.00 ps … 900.00 µs) 200.00 µs █▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁ +@formatjs/intl-segmenter 29.61 µs/iter 100.00 µs █ + (0.00 ps … 900.00 µs) 100.00 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▇ +unicode-rs/unicode-segmentation (wasm-bindgen) 27.03 µs/iter 100.00 µs █ + (0.00 ps … 300.00 µs) 100.00 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▆ +Intl.Segmenter 10.60 µs/iter 
10.86 µs █▂ + (9.25 µs … 10.91 µs) 10.89 µs ▄▁▁▁▄▁▁▁▁▁▁▁▁▁▁▁▁▁▄██ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 3.75 µs + graphemer ┤■■■■■■■■■■■■■■■■■■■■■■■■ 34.41 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 46.89 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■ 29.61 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■■■■■■■■■■■ 27.03 µs + Intl.Segmenter ┤■■■■■ 10.60 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 2.83x faster than Intl.Segmenter + 7.21x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 7.9x faster than @formatjs/intl-segmenter + 9.18x faster than graphemer + 12.51x faster than grapheme-splitter + +• Emojis +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 1.49 µs/iter 1.51 µs ▂ █ ▇ + (1.44 µs … 1.56 µs) 1.56 µs ▆▁▁▁█▁▁▁█▁▁▁█▁▁▁▅▁▁▁▂ +graphemer 10.01 µs/iter 10.03 µs █ ▄ + (9.96 µs … 10.06 µs) 10.06 µs ▅▁▁▁▁█▁▁▁▁█▁▁▁▁█▁▁▁▁▅ +grapheme-splitter 20.04 µs/iter 0.00 ps █ + (0.00 ps … 1.30 ms) 100.00 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ +@formatjs/intl-segmenter 12.55 µs/iter 12.92 µs █ █ + (12.13 µs … 13.06 µs) 12.99 µs █▁██▁▁▁▁▁▁█▁▁▁▁▁▁▁█▁█ +unicode-rs/unicode-segmentation (wasm-bindgen) 8.03 µs/iter 8.25 µs ▄█ + (7.79 µs … 8.47 µs) 8.42 µs ██▅▁▁▁▁▅▁▁▁▅▁▁▅█▁▅▁▁▅ +Intl.Segmenter 6.49 µs/iter 5.71 µs █ + (5.08 µs … 11.50 µs) 11.45 µs █▁▂▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▄ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 1.49 µs + graphemer ┤■■■■■■■■■■■■■■■■ 10.01 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 20.04 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■ 12.55 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■■■■■ 8.03 µs + Intl.Segmenter ┤■■■■■■■■■ 6.49 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 4.35x faster than Intl.Segmenter + 5.38x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 6.7x faster than graphemer + 8.4x faster than @formatjs/intl-segmenter + 13.42x faster than grapheme-splitter + +• Hindi 
+------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 5.06 µs/iter 5.08 µs █ + (4.96 µs … 5.15 µs) 5.13 µs ▃▁▁▁▁▁█▁▁█▁█▁▁█▁▁▁▁▁▆ +graphemer 39.85 µs/iter 39.84 µs █ + (39.65 µs … 40.41 µs) 39.99 µs ▅▅▁█▁▁▁▅▁▁▁█▁▁▁▁▅▁▁▁▅ +grapheme-splitter 89.86 µs/iter 100.00 µs █ + (0.00 ps … 700.00 µs) 200.00 µs ▆▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▃ +@formatjs/intl-segmenter 41.65 µs/iter 100.00 µs █ ▂ + (0.00 ps … 2.40 ms) 200.00 µs █▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁ +unicode-rs/unicode-segmentation (wasm-bindgen) 24.26 µs/iter 24.66 µs █ █ █ + (23.73 µs … 24.78 µs) 24.76 µs ██▁▁▁▁████▁▁▁▁▁▁▁▁█▁█ +Intl.Segmenter 10.68 µs/iter 11.38 µs █ █ + (9.13 µs … 11.57 µs) 11.57 µs ██▁█▁▁▁▁▁▁▁▁▁███▁▁█▁█ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 5.06 µs + graphemer ┤■■■■■■■■■■■■■■ 39.85 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 89.86 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■ 41.65 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■ 24.26 µs + Intl.Segmenter ┤■■ 10.68 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 2.11x faster than Intl.Segmenter + 4.8x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 7.88x faster than graphemer + 8.23x faster than @formatjs/intl-segmenter + 17.76x faster than grapheme-splitter + +• Demonic characters +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 1.35 µs/iter 1.37 µs ▂ █ ▇ + (1.29 µs … 1.44 µs) 1.42 µs ▅▁▁▁█▁▁▁█▁▁▁█▁▁▁█▁▁▁▄ +graphemer 20.34 µs/iter 20.26 µs ███ + (20.02 µs … 21.78 µs) 20.48 µs █▁▁▁▁███▁█▁█▁▁█▁▁▁▁▁█ +grapheme-splitter 16.35 µs/iter 16.33 µs █ + (16.26 µs … 16.89 µs) 16.36 µs █▁▁▁▁█▁▁▁▁█▁▁▁▁█▁▁▁▁▅ +@formatjs/intl-segmenter 14.63 µs/iter 14.99 µs █ █ + (13.94 µs … 15.19 µs) 15.14 µs █▁▁█▁▁██▁▁▁▁▁█▁▁███▁█ +unicode-rs/unicode-segmentation (wasm-bindgen) 4.12 µs/iter 4.15 µs █▃ + (3.86 µs … 4.66 µs) 4.49 µs ▄▁█▃▃▃▁▃██▆▃▄▃▁▁▁▁▁▁▃ +Intl.Segmenter 3.57 µs/iter 3.44 µs ▂█▂ + (3.32 µs … 8.11 µs) 
4.15 µs ████▄▁▁▁▁▁▁▁▁▁▁▃▁▁▁▁▃ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 1.35 µs + graphemer ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 20.34 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■ 16.35 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■ 14.63 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■ 4.12 µs + Intl.Segmenter ┤■■■■ 3.57 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 2.64x faster than Intl.Segmenter + 3.04x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 10.81x faster than @formatjs/intl-segmenter + 12.08x faster than grapheme-splitter + 15.04x faster than graphemer + +• Tweet text (combined) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 6.52 µs/iter 6.54 µs █ █ + (6.47 µs … 6.62 µs) 6.57 µs █▁▁▁▁▁▁▁▁▁█▁▁▁▁▅▁▁▁▁▇ +graphemer 53.47 µs/iter 100.00 µs █ ▇ + (0.00 ps … 300.00 µs) 200.00 µs █▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▂ +grapheme-splitter 132.69 µs/iter 200.00 µs █ ▂ + (0.00 ps … 600.00 µs) 300.00 µs ▂▁▁▁▁▁▁█▁▁▁▁▁█▁▁▁▁▁▁▁ +@formatjs/intl-segmenter 55.09 µs/iter 100.00 µs █ ▇ + (0.00 ps … 1.50 ms) 200.00 µs █▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▂ +unicode-rs/unicode-segmentation (wasm-bindgen) 39.26 µs/iter 39.62 µs █ + (38.53 µs … 40.19 µs) 39.99 µs ████▁▁▁▁▁█▁▁▁████▁▁▁█ +Intl.Segmenter 14.71 µs/iter 14.53 µs █ + (13.77 µs … 15.92 µs) 15.89 µs ▅▁▁▁▅▅██▁▁▁▁▁▁▁▁▁▁▁▅▅ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 6.52 µs + graphemer ┤■■■■■■■■■■■■■ 53.47 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 132.69 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■ 55.09 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■■ 39.26 µs + Intl.Segmenter ┤■■ 14.71 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 2.26x faster than Intl.Segmenter + 6.02x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 8.21x faster than graphemer + 8.45x faster than @formatjs/intl-segmenter + 20.36x faster than grapheme-splitter + +• Code snippet (combined) 
+------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 15.16 µs/iter 15.11 µs █ █ + (14.84 µs … 16.67 µs) 15.21 µs █▁▁▁▁█▁█▁█▁██▁▁█▁█▁▁█ +graphemer 117.15 µs/iter 200.00 µs █ + (0.00 ps … 400.00 µs) 200.00 µs ▃▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▇ +grapheme-splitter 293.48 µs/iter 300.00 µs █ + (200.00 µs … 600.00 µs) 400.00 µs ▆▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▄ +@formatjs/intl-segmenter 125.99 µs/iter 200.00 µs █ + (0.00 ps … 1.70 ms) 200.00 µs ▃▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁█ +unicode-rs/unicode-segmentation (wasm-bindgen) 98.47 µs/iter 100.00 µs █ + (0.00 ps … 700.00 µs) 200.00 µs ▅▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▄ +Intl.Segmenter 30.34 µs/iter 30.35 µs █ ██ + (30.00 µs … 30.76 µs) 30.57 µs █▁▁▁▁▁▁█▁████▁▁▁▁▁█▁█ + + ┌ ┐ + unicode-segmenter/grapheme ┤ 15.16 µs + graphemer ┤■■■■■■■■■■■■ 117.15 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 293.48 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■ 125.99 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■■■ 98.47 µs + Intl.Segmenter ┤■■ 30.34 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 2x faster than Intl.Segmenter + 6.5x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 7.73x faster than graphemer + 8.31x faster than @formatjs/intl-segmenter + 19.36x faster than grapheme-splitter diff --git a/benchmark/grapheme/_records/20250307-intel_x86_64-linux-firefox_136.0.txt b/benchmark/grapheme/_records/20250307-intel_x86_64-linux-firefox_136.0.txt new file mode 100644 index 0000000..8d1ee9b --- /dev/null +++ b/benchmark/grapheme/_records/20250307-intel_x86_64-linux-firefox_136.0.txt @@ -0,0 +1,197 @@ +clk: ~3.95 GHz +cpu: null +runtime: null (null) + +benchmark avg (min … max) p75 / p99 (min … top 1%) +------------------------------------------------------------- ------------------------------- +• Lorem ipsum (ascii) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 12.97 µs/iter 0.00 ps █ + (0.00 ps … 
1.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ +graphemer 26.64 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ +grapheme-splitter 82.59 µs/iter 0.00 ps █ + (0.00 ps … 3.00 ms) 2.00 ms █▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁ +@formatjs/intl-segmenter 45.43 µs/iter 45.41 µs █ + (45.17 µs … 45.90 µs) 45.65 µs ▅▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▃ +unicode-rs/unicode-segmentation (wasm-bindgen) 23.14 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ +Intl.Segmenter 4.05 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 0.00 ps █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ + + ┌ ┐ + unicode-segmenter/grapheme ┤■■■■ 12.97 µs + graphemer ┤■■■■■■■■■■ 26.64 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 82.59 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■ 45.43 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■ 23.14 µs + Intl.Segmenter ┤ 4.05 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 3.2x slower than Intl.Segmenter + 1.78x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 2.05x faster than graphemer + 3.5x faster than @formatjs/intl-segmenter + 6.37x faster than grapheme-splitter + +• Emojis +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 3.06 µs/iter 3.17 µs ▇ █ + (2.93 µs … 3.17 µs) 3.17 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ +graphemer 8.01 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 0.00 ps █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ +grapheme-splitter 60.37 µs/iter 0.00 ps █ + (0.00 ps … 2.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂ +@formatjs/intl-segmenter 13.73 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ +unicode-rs/unicode-segmentation (wasm-bindgen) 7.81 µs/iter 7.81 µs █ + (7.57 µs … 8.06 µs) 8.06 µs ▃▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▂ +Intl.Segmenter 1.56 µs/iter 0.00 ps █ + (0.00 ps … 1.00 ms) 0.00 ps █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ + + ┌ ┐ + unicode-segmenter/grapheme ┤■ 3.06 µs + graphemer ┤■■■■ 8.01 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 60.37 µs + @formatjs/intl-segmenter ┤■■■■■■■ 13.73 µs 
+unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■ 7.81 µs + Intl.Segmenter ┤ 1.56 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.96x slower than Intl.Segmenter + 2.56x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 2.62x faster than graphemer + 4.49x faster than @formatjs/intl-segmenter + 19.75x faster than grapheme-splitter + +• Hindi +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 9.87 µs/iter 10.01 µs █ ▃ + (9.52 µs … 10.25 µs) 10.01 µs ▄▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁█ +graphemer 28.04 µs/iter 28.08 µs ▃ █ + (27.83 µs … 28.32 µs) 28.32 µs █▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▄ +grapheme-splitter 88.97 µs/iter 88.87 µs █ + (88.62 µs … 90.33 µs) 89.11 µs █▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▆ +@formatjs/intl-segmenter 45.31 µs/iter 45.41 µs ▆ █ + (45.17 µs … 45.41 µs) 45.41 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ +unicode-rs/unicode-segmentation (wasm-bindgen) 22.16 µs/iter 22.22 µs ▂ █ + (21.97 µs … 22.46 µs) 22.22 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ +Intl.Segmenter 3.76 µs/iter 3.91 µs █ + (3.42 µs … 4.15 µs) 4.15 µs ▂▁▁▁▁▁▁█▁▁▁▁▁▇▁▁▁▁▁▁▂ + + ┌ ┐ + unicode-segmenter/grapheme ┤■■ 9.87 µs + graphemer ┤■■■■■■■■■■ 28.04 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 88.97 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■ 45.31 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■ 22.16 µs + Intl.Segmenter ┤ 3.76 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 2.63x slower than Intl.Segmenter + 2.25x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 2.84x faster than graphemer + 4.59x faster than @formatjs/intl-segmenter + 9.02x faster than grapheme-splitter + +• Demonic characters +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 2.52 µs/iter 2.69 µs █ + (2.44 µs … 2.69 µs) 2.69 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ +graphemer 13.37 µs/iter 13.43 µs ▂ █ + (13.18 µs … 13.67 µs) 13.43 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ +grapheme-splitter 11.94 µs/iter 11.96 µs 
█ + (11.72 µs … 11.96 µs) 11.96 µs ▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ +@formatjs/intl-segmenter 14.97 µs/iter 14.89 µs █ + (14.65 µs … 15.87 µs) 15.14 µs ▅▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▅ +unicode-rs/unicode-segmentation (wasm-bindgen) 4.07 µs/iter 4.15 µs ▂ █ + (3.91 µs … 4.39 µs) 4.15 µs █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█ +Intl.Segmenter 731.19 ns/iter 732.42 ns █ + (488.28 ns … 976.56 ns) 976.56 ns ▅▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▅ + + ┌ ┐ + unicode-segmenter/grapheme ┤■■■■ 2.52 µs + graphemer ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 13.37 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■ 11.94 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 14.97 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■ 4.07 µs + Intl.Segmenter ┤ 731.19 ns + └ ┘ + +summary + unicode-segmenter/grapheme + 3.45x slower than Intl.Segmenter + 1.61x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 4.74x faster than grapheme-splitter + 5.3x faster than graphemer + 5.94x faster than @formatjs/intl-segmenter + +• Tweet text (combined) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 14.30 µs/iter 14.40 µs █ ▂ + (13.67 µs … 16.36 µs) 14.40 µs ▇▁▁▁▁▁▁▄▁▁▁▁▁█▁▁▁▁▁▁█ +graphemer 43.36 µs/iter 43.46 µs █ █ + (42.97 µs … 43.95 µs) 43.70 µs █▁▁▁▁▁▁█▁▁▁▁▁█▁▁▁▁▁▁▅ +grapheme-splitter 139.12 µs/iter 0.00 ps █ + (0.00 ps … 2.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃ +@formatjs/intl-segmenter 66.16 µs/iter 66.16 µs █ ▂ + (65.67 µs … 66.89 µs) 66.65 µs ▄▁▁▁▁█▁▁▁▁█▁▁▁▁▁▁▁▁▁▇ +unicode-rs/unicode-segmentation (wasm-bindgen) 37.19 µs/iter 0.00 ps █ + (0.00 ps … 3.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ +Intl.Segmenter 6.80 µs/iter 6.84 µs █ + (6.59 µs … 7.08 µs) 7.08 µs ▇▁▁▁▁▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▂ + + ┌ ┐ + unicode-segmenter/grapheme ┤■■ 14.30 µs + graphemer ┤■■■■■■■■■ 43.36 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 139.12 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■ 66.16 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■■ 37.19 µs + 
Intl.Segmenter ┤ 6.80 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 2.1x slower than Intl.Segmenter + 2.6x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 3.03x faster than graphemer + 4.63x faster than @formatjs/intl-segmenter + 9.73x faster than grapheme-splitter + +• Code snippet (combined) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 31.01 µs/iter 31.25 µs █ + (30.52 µs … 31.74 µs) 31.49 µs ▄▁▁▁▁█▁▁▁▁▁▁▁▁▁█▁▁▁▁▄ +graphemer 102.88 µs/iter 103.03 µs █ ▃ █ ▃ + (102.29 µs … 104.49 µs) 103.76 µs █▁▁█▁▁▁█▁▁█▁▁▁▁▁▁▁▁▁▆ +grapheme-splitter 323.45 µs/iter 323.97 µs █ █ + (320.80 µs … 326.66 µs) 325.44 µs █▁▁▁▁▁█▁██▁██▁██▁▁▁▁█ +@formatjs/intl-segmenter 151.20 µs/iter 155.27 µs █ ▃ █ + (144.04 µs … 156.49 µs) 155.52 µs █▁▁▁▁▁▆▁▆▁▁▁▁▁▁▁▁▁█▆█ +unicode-rs/unicode-segmentation (wasm-bindgen) 82.59 µs/iter 0.00 ps █ + (0.00 ps … 3.00 ms) 1.00 ms █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂ +Intl.Segmenter 15.63 µs/iter 15.87 µs █ ▅ + (14.89 µs … 15.87 µs) 15.87 µs ▄▁▁▁▁▄▁▁▁▁▁▁▁▁▁█▁▁▁▁█ + + ┌ ┐ + unicode-segmenter/grapheme ┤■■ 31.01 µs + graphemer ┤■■■■■■■■■■ 102.88 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 323.45 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■ 151.20 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■■ 82.59 µs + Intl.Segmenter ┤ 15.63 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.98x slower than Intl.Segmenter + 2.66x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 3.32x faster than graphemer + 4.88x faster than @formatjs/intl-segmenter + 10.43x faster than grapheme-splitter diff --git a/benchmark/grapheme/_records/20250307-intel_x86_64-linux-nodejs_23.9.0.txt b/benchmark/grapheme/_records/20250307-intel_x86_64-linux-nodejs_23.9.0.txt new file mode 100644 index 0000000..063d225 --- /dev/null +++ b/benchmark/grapheme/_records/20250307-intel_x86_64-linux-nodejs_23.9.0.txt @@ -0,0 +1,335 @@ +clk: ~4.50 GHz +cpu: Intel(R) Core(TM) Ultra 7 258V 
+runtime: node 23.9.0 (x64-linux) + +benchmark avg (min … max) p75 / p99 (min … top 1%) +------------------------------------------------------------- ------------------------------- +• Lorem ipsum (ascii) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 4.26 µs/iter 4.28 µs █ ▂ █ + (4.18 µs … 4.32 µs) 4.31 µs ▅▅▅▅█▅▅█▅█ ▅▅▅▅ + ( 8.08 b … 38.93 b) 16.06 b ▇▁▁▁██████████▇████▁▇ + 7.77 ipc ( 60.02% cache) 8.36 branch misses + 15.45k cycles 120.05k instructions 275.46 c-refs 110.13 c-misses + +graphemer 36.50 µs/iter 36.39 µs █ + (29.75 µs … 201.07 µs) 45.27 µs ▅██ + ( 5.96 kb … 1.40 mb) 78.40 kb ▁▁▁▁▂▄████▄▃▃▂▂▁▁▁▁▁▁ + 5.67 ipc ( 46.11% cache) 26.89 branch misses + 148.90k cycles 843.81k instructions 1.34k c-refs 722.03 c-misses + +grapheme-splitter 66.01 µs/iter 65.60 µs █ + (54.38 µs … 943.86 µs) 82.34 µs ▄█ ▃ + (456.00 b … 2.36 mb) 48.87 kb ▁▁▂▃▃███▆█▃▂▁▁▁▁▁▁▁▁▁ + 5.20 ipc ( 61.20% cache) 25.52 branch misses + 288.63k cycles 1.50M instructions 769.18 c-refs 298.42 c-misses + +@formatjs/intl-segmenter 34.42 µs/iter 34.49 µs █ █ + (34.23 µs … 34.74 µs) 34.55 µs █▅ ▅ ▅ ▅▅▅█ ▅ + ( 3.60 kb … 3.66 kb) 3.63 kb ██▁▁█▁▁▁▁█▁▁▁████▁▁▁█ + 5.84 ipc ( 39.57% cache) 51.87 branch misses + 136.45k cycles 796.52k instructions 1.51k c-refs 912.55 c-misses + +unicode-rs/unicode-segmentation (wasm-bindgen) 15.51 µs/iter 15.51 µs █ + (13.38 µs … 224.69 µs) 21.88 µs ▇█▂ + ( 96.00 b … 394.71 kb) 16.24 kb ▁▁▇████▃▂▁▁▁▁▁▁▁▁▁▁▁▁ + 4.84 ipc ( 41.15% cache) 46.84 branch misses + 62.12k cycles 300.60k instructions 256.90 c-refs 151.19 c-misses + +Intl.Segmenter 9.17 µs/iter 9.32 µs █ + (8.85 µs … 9.45 µs) 9.42 µs ▂▂ █ + ( 1.92 kb … 1.92 kb) 1.92 kb ▆▁▁██▆▆▁▁▁▁▁▁▁▁▁█▁▁▆▆ + 4.74 ipc ( 51.84% cache) 33.38 branch misses + 34.57k cycles 163.99k instructions 580.66 c-refs 279.64 c-misses + + ┌ ┐ + unicode-segmenter/grapheme ┤ 4.26 µs + graphemer ┤■■■■■■■■■■■■■■■■■■ 36.50 µs + grapheme-splitter 
┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 66.01 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■ 34.42 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■■ 15.51 µs + Intl.Segmenter ┤■■■ 9.17 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 2.15x faster than Intl.Segmenter + 3.64x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 8.09x faster than @formatjs/intl-segmenter + 8.58x faster than graphemer + 15.51x faster than grapheme-splitter + +• Emojis +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 1.68 µs/iter 1.69 µs █ + (1.63 µs … 1.72 µs) 1.72 µs ▅ ▄▅▂█ █ + ( 2.60 kb … 2.60 kb) 2.60 kb ▃▃▃▄▆▇▄▇█▇███████▄▇▇▄ + 6.67 ipc ( 47.06% cache) 16.15 branch misses + 6.06k cycles 40.43k instructions 113.34 c-refs 60.00 c-misses + +graphemer 11.17 µs/iter 11.19 µs █ █ + (11.12 µs … 11.21 µs) 11.20 µs ▅ ▅▅ █ ▅ ▅ █▅▅ + ( 3.51 kb … 3.58 kb) 3.54 kb █▁▁██▁█▁▁█▁▁█▁▁▁▁▁███ + 5.56 ipc ( 52.93% cache) 37.80 branch misses + 41.37k cycles 230.13k instructions 406.46 c-refs 191.31 c-misses + +grapheme-splitter 24.99 µs/iter 24.52 µs █ + (21.63 µs … 416.69 µs) 28.95 µs █▂ + (312.00 b … 940.88 kb) 14.28 kb ▁▁▂▂▅█▆██▃▇▃▁▁▁▁▁▁▁▁▁ + 4.84 ipc ( 61.47% cache) 54.45 branch misses + 109.82k cycles 531.16k instructions 210.85 c-refs 81.24 c-misses + +@formatjs/intl-segmenter 13.30 µs/iter 13.31 µs █ + (13.27 µs … 13.35 µs) 13.33 µs ▅ ▅ █▅ ▅▅ ▅ + ( 1.41 kb … 1.51 kb) 1.50 kb █▁▁▁▁▁█▁██▁▁██▁▁▁▁▁▁█ + 5.12 ipc ( 48.55% cache) 66.68 branch misses + 50.08k cycles 256.43k instructions 441.29 c-refs 227.03 c-misses + +unicode-rs/unicode-segmentation (wasm-bindgen) 5.17 µs/iter 5.19 µs ▂█ ▂ + (5.11 µs … 5.28 µs) 5.20 µs ▅ ▅▅ ██ ▅ ▅ █▅ + (947.55 b … 1.03 kb) 960.42 b ▇▁█▇▁██▇▁▇▁██▁▇█▇█▇██ + 5.08 ipc ( 54.60% cache) 21.41 branch misses + 19.67k cycles 99.86k instructions 84.14 c-refs 38.20 c-misses + +Intl.Segmenter 5.14 µs/iter 5.16 µs █ + (4.86 µs … 6.44 µs) 6.19 µs ▃████ + (458.04 b … 466.41 b) 465.59 b 
██████▄▁▁▁▁▁▁▁▁▁▁▁▁▁▄ + 3.45 ipc ( 37.60% cache) 16.08 branch misses + 20.35k cycles 70.29k instructions 466.12 c-refs 290.84 c-misses + + ┌ ┐ + unicode-segmenter/grapheme ┤ 1.68 µs + graphemer ┤■■■■■■■■■■■■■■ 11.17 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 24.99 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■ 13.30 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■■ 5.17 µs + Intl.Segmenter ┤■■■■■ 5.14 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 3.06x faster than Intl.Segmenter + 3.07x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 6.64x faster than graphemer + 7.91x faster than @formatjs/intl-segmenter + 14.87x faster than grapheme-splitter + +• Hindi +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 5.21 µs/iter 5.23 µs █ + (5.16 µs … 5.29 µs) 5.26 µs ▂ ▂ ▇█▂ ▂▂▂▂▂ + ( 3.52 kb … 3.53 kb) 3.53 kb █▆▆▁▁▆█▆███▁█████▁▁▁▆ + 6.65 ipc ( 53.94% cache) 49.11 branch misses + 18.87k cycles 125.51k instructions 271.56 c-refs 125.08 c-misses + +graphemer 43.96 µs/iter 44.27 µs ▃█▃ + (37.41 µs … 236.28 µs) 61.99 µs ███▄ + ( 23.52 kb … 1.49 mb) 88.17 kb ▁▃█████▇▄▃▂▂▁▁▁▁▁▁▁▁▁ + 5.23 ipc ( 56.75% cache) 207.00 branch misses + 164.50k cycles 860.76k instructions 1.51k c-refs 652.09 c-misses + +grapheme-splitter 99.20 µs/iter 100.09 µs █ + (86.09 µs … 516.40 µs) 125.25 µs ▃█ + ( 4.66 kb … 376.24 kb) 49.16 kb ▁▂▃▅▆▇██▇▃▂▁▁▁▁▁▁▁▁▁▁ + 3.01 ipc ( 60.15% cache) 267.31 branch misses + 419.68k cycles 1.26M instructions 854.09 c-refs 340.34 c-misses + +@formatjs/intl-segmenter 41.90 µs/iter 41.93 µs ██ + (41.72 µs … 42.24 µs) 42.01 µs ▅ ▅ ▅ ██▅ ▅ ▅ ▅ + ( 2.49 kb … 2.61 kb) 2.58 kb █▁▁█▁█▁▁▁███▁▁▁█▁▁█▁█ + 5.19 ipc ( 58.01% cache) 162.08 branch misses + 155.96k cycles 808.70k instructions 1.22k c-refs 514.17 c-misses + +unicode-rs/unicode-segmentation (wasm-bindgen) 16.97 µs/iter 17.03 µs █ ███ █ █ █ ███ █ + (16.86 µs … 17.10 µs) 17.06 µs █ ███ █ █ █ ███ █ + ( 1.59 kb … 1.60 
kb) 1.60 kb █▁▁███▁█▁█▁▁█▁▁▁███▁█ + 4.48 ipc ( 47.83% cache) 95.56 branch misses + 66.68k cycles 298.76k instructions 234.12 c-refs 122.13 c-misses + +Intl.Segmenter 9.03 µs/iter 9.13 µs █ + (8.48 µs … 9.20 µs) 9.19 µs ████ + ( 3.46 kb … 3.47 kb) 3.47 kb █▁▁▁▁▁▁▁▁▁▁████▁█████ + 4.14 ipc ( 51.31% cache) 70.60 branch misses + 34.23k cycles 141.61k instructions 567.62 c-refs 276.37 c-misses + + ┌ ┐ + unicode-segmenter/grapheme ┤ 5.21 µs + graphemer ┤■■■■■■■■■■■■■■ 43.96 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 99.20 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■ 41.90 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■ 16.97 µs + Intl.Segmenter ┤■ 9.03 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.73x faster than Intl.Segmenter + 3.26x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 8.04x faster than @formatjs/intl-segmenter + 8.44x faster than graphemer + 19.04x faster than grapheme-splitter + +• Demonic characters +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 1.54 µs/iter 1.56 µs ▂ █▂ + (1.49 µs … 1.64 µs) 1.61 µs ▆▃▅█▅███▆▃▆ ▅ ▃ + (849.53 b … 857.63 b) 857.04 b ▅███████████▃█▅█▇▃▅▃▅ + 7.10 ipc ( 56.81% cache) 6.99 branch misses + 5.50k cycles 39.06k instructions 82.93 c-refs 35.82 c-misses + +graphemer 23.77 µs/iter 23.71 µs █ + (23.57 µs … 24.89 µs) 23.75 µs ▅ ▅▅▅ ▅ ▅ ▅▅█ ▅ + (710.10 b … 751.18 b) 714.29 b █▁▁███▁▁▁█▁█▁▁███▁▁▁█ + 5.38 ipc ( 57.09% cache) 88.97 branch misses + 87.89k cycles 472.67k instructions 911.63 c-refs 391.15 c-misses + +grapheme-splitter 19.25 µs/iter 19.29 µs █ + (19.13 µs … 19.49 µs) 19.32 µs ▅ ▅█▅ ▅ ▅▅ ▅▅ ▅ + ( 2.79 kb … 3.03 kb) 2.84 kb █▁███▁▁▁▁▁▁█▁▁██▁██▁█ + 4.39 ipc ( 50.93% cache) 67.47 branch misses + 74.70k cycles 328.02k instructions 465.06 c-refs 228.19 c-misses + +@formatjs/intl-segmenter 13.33 µs/iter 13.35 µs █ + (13.29 µs … 13.37 µs) 13.35 µs ▅ ▅▅▅ ▅ ▅ █ + ( 3.55 kb … 3.57 kb) 3.56 kb █▁▁▁▁███▁▁▁▁▁▁▁█▁▁█▁█ + 
5.54 ipc ( 59.54% cache) 102.61 branch misses + 53.32k cycles 295.46k instructions 198.43 c-refs 80.28 c-misses + +unicode-rs/unicode-segmentation (wasm-bindgen) 3.48 µs/iter 3.49 µs ▃ ▃▃█ ▃ ▃ + (3.43 µs … 3.57 µs) 3.56 µs █ ███▇▇█▂█▂▇ ▂ + ( 1.28 kb … 1.32 kb) 1.28 kb ▆█▆██████████▆█▁▁▁▁▆▆ + 3.92 ipc ( 21.25% cache) 20.62 branch misses + 15.09k cycles 59.17k instructions 22.56 c-refs 17.77 c-misses + +Intl.Segmenter 3.49 µs/iter 7.91 µs █ + (1.53 µs … 8.37 µs) 8.33 µs █ + ( 1.22 kb … 1.23 kb) 1.23 kb █▂▁▁▃▁▁▂▁▁▁▁▁▁▁▁▁▁▁▅▆ + 2.07 ipc ( 19.15% cache) 2.91 branch misses + 15.52k cycles 32.10k instructions 417.88 c-refs 337.88 c-misses + + ┌ ┐ + unicode-segmenter/grapheme ┤ 1.54 µs + graphemer ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 23.77 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■ 19.25 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■■■■■■■ 13.33 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■ 3.48 µs + Intl.Segmenter ┤■■■ 3.49 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 2.26x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 2.26x faster than Intl.Segmenter + 8.64x faster than @formatjs/intl-segmenter + 12.48x faster than grapheme-splitter + 15.41x faster than graphemer + +• Tweet text (combined) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 6.42 µs/iter 6.44 µs ▂ █ + (6.38 µs … 6.47 µs) 6.45 µs ▅ ▅ █▅ ▅█ + (307.66 b … 317.25 b) 314.99 b █▁▇▁▁▇▇▁▇█▁██▇▁▇██▁▁▇ + 7.41 ipc ( 52.25% cache) 11.29 branch misses + 24.50k cycles 181.40k instructions 421.01 c-refs 201.03 c-misses + +graphemer 54.94 µs/iter 54.93 µs █ + (47.91 µs … 215.08 µs) 79.95 µs ██ + ( 6.75 kb … 1.67 mb) 111.38 kb ▁▂████▅▂▂▁▁▁▁▁▁▁▁▁▁▁▁ + 5.42 ipc ( 54.12% cache) 205.68 branch misses + 217.56k cycles 1.18M instructions 1.90k c-refs 873.52 c-misses + +grapheme-splitter 142.32 µs/iter 143.85 µs ▅█ + (129.21 µs … 569.93 µs) 175.32 µs ▇███ ▅ + ( 20.41 kb … 422.81 kb) 63.94 kb ▁▂▆████▇█▃▂▂▁▁▁▁▁▁▁▁▁ + 3.23 ipc 
( 62.50% cache) 229.67 branch misses + 627.35k cycles 2.02M instructions 1.11k c-refs 415.15 c-misses + +@formatjs/intl-segmenter 56.33 µs/iter 56.35 µs █ + (49.28 µs … 235.72 µs) 87.01 µs ██▆ + ( 13.79 kb … 312.59 kb) 125.88 kb ▁▃███▅▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁ + 5.33 ipc ( 48.73% cache) 223.25 branch misses + 224.32k cycles 1.20M instructions 2.16k c-refs 1.11k c-misses + +unicode-rs/unicode-segmentation (wasm-bindgen) 21.97 µs/iter 21.93 µs ██ + (21.83 µs … 22.64 µs) 22.00 µs ▅ ▅▅ ▅▅ ▅██ ▅ + (132.73 b … 134.32 b) 133.08 b █▁▁▁██▁██▁███▁▁▁▁▁▁▁█ + 4.90 ipc ( 40.82% cache) 83.80 branch misses + 89.70k cycles 439.38k instructions 414.36 c-refs 245.20 c-misses + +Intl.Segmenter 12.58 µs/iter 12.59 µs █ + (12.51 µs … 12.70 µs) 12.63 µs █▅▅ ▅▅ ▅ ▅▅ + (173.45 b … 192.57 b) 190.34 b ███▁▁▁▁▁▁██▁█▁▁▁▁▁▁██ + 4.67 ipc ( 55.69% cache) 91.52 branch misses + 49.50k cycles 230.95k instructions 659.35 c-refs 292.16 c-misses + + ┌ ┐ + unicode-segmenter/grapheme ┤ 6.42 µs + graphemer ┤■■■■■■■■■■■■ 54.94 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 142.32 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■ 56.33 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■ 21.97 µs + Intl.Segmenter ┤■■ 12.58 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.96x faster than Intl.Segmenter + 3.42x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 8.55x faster than graphemer + 8.77x faster than @formatjs/intl-segmenter + 22.16x faster than grapheme-splitter + +• Code snippet (combined) +------------------------------------------------------------- ------------------------------- +unicode-segmenter/grapheme 15.58 µs/iter 15.62 µs █ + (15.50 µs … 15.73 µs) 15.67 µs ▅ ▅█▅▅▅ ▅ ▅ ▅ ▅ + ( 2.30 kb … 2.31 kb) 2.30 kb █▁█████▁█▁▁▁▁▁▁█▁▁█▁█ + 7.14 ipc ( 52.15% cache) 66.59 branch misses + 59.11k cycles 421.86k instructions 1.01k c-refs 483.84 c-misses + +graphemer 132.90 µs/iter 132.82 µs ▅█ + (119.32 µs … 329.62 µs) 251.85 µs ██ + ( 57.99 kb … 1.74 mb) 265.11 kb ▂██▅▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ + 5.38 
ipc ( 56.85% cache) 522.67 branch misses + 521.56k cycles 2.81M instructions 4.53k c-refs 1.96k c-misses + +grapheme-splitter 331.14 µs/iter 332.62 µs ▇█ + (310.38 µs … 627.35 µs) 412.60 µs ███ + ( 51.14 kb … 732.12 kb) 151.91 kb ▂▅████▆▄▁▁▁▁▁▁▁▁▁▁▁▁▁ + 3.25 ipc ( 60.87% cache) 536.03 branch misses + 1.46M cycles 4.73M instructions 2.63k c-refs 1.03k c-misses + +@formatjs/intl-segmenter 129.90 µs/iter 129.44 µs █ + (112.87 µs … 509.67 µs) 268.83 µs ▆█ + ( 35.13 kb … 624.73 kb) 300.40 kb ▁██▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ + 5.40 ipc ( 49.37% cache) 450.96 branch misses + 510.81k cycles 2.76M instructions 5.22k c-refs 2.64k c-misses + +unicode-rs/unicode-segmentation (wasm-bindgen) 52.30 µs/iter 52.38 µs █ █ + (52.01 µs … 52.77 µs) 52.40 µs ▅ ▅ ▅ ▅▅ █▅ █▅ + ( 2.71 kb … 2.72 kb) 2.72 kb █▁▁▁█▁▁▁▁▁█▁██▁██▁▁██ + 4.83 ipc ( 34.71% cache) 171.85 branch misses + 214.40k cycles 1.04M instructions 1.01k c-refs 660.82 c-misses + +Intl.Segmenter 28.72 µs/iter 28.80 µs █ + (27.81 µs … 29.94 µs) 28.97 µs █ █ + ( 1.12 kb … 1.13 kb) 1.13 kb █▁▁▁▁▁▁▁▁▁▁██▁██▁██▁█ + 4.98 ipc ( 52.67% cache) 263.23 branch misses + 104.96k cycles 522.23k instructions 1.15k c-refs 544.33 c-misses + + ┌ ┐ + unicode-segmenter/grapheme ┤ 15.58 µs + graphemer ┤■■■■■■■■■■■■■ 132.90 µs + grapheme-splitter ┤■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■ 331.14 µs + @formatjs/intl-segmenter ┤■■■■■■■■■■■■ 129.90 µs +unicode-rs/unicode-segmentation (wasm-bindgen) ┤■■■■ 52.30 µs + Intl.Segmenter ┤■ 28.72 µs + └ ┘ + +summary + unicode-segmenter/grapheme + 1.84x faster than Intl.Segmenter + 3.36x faster than unicode-rs/unicode-segmentation (wasm-bindgen) + 8.34x faster than @formatjs/intl-segmenter + 8.53x faster than graphemer + 21.26x faster than grapheme-splitter diff --git a/benchmark/grapheme/perf.js b/benchmark/grapheme/perf.js index 134674e..3a0858d 100644 --- a/benchmark/grapheme/perf.js +++ b/benchmark/grapheme/perf.js @@ -123,6 +123,9 @@ await run({ self.postMessage({ type: 'log', message: s }); }, }, + ...isSystemRuntime && { + 
colors: !process.env.NO_COLOR, + }, }); if (isWebWorker) { diff --git a/package.json b/package.json index f6af094..9ba4a04 100644 --- a/package.json +++ b/package.json @@ -37,6 +37,9 @@ "require": "./intl-adapter.cjs" } }, + "imports": { + "#src/*": "./src/*" + }, "sideEffects": [ "./intl-polyfill.js", "./intl-polyfill.cjs" diff --git a/scripts/lib/encoding.js b/scripts/lib/encoding.js new file mode 100644 index 0000000..dc9b763 --- /dev/null +++ b/scripts/lib/encoding.js @@ -0,0 +1,25 @@ +// @ts-check + +/** + * @import { + * UnicodeDataEncoding, + * UnicodeRange, + * CategorizedUnicodeRange, + * } from '#src/core.js'; + */ + +/** + * @param {CategorizedUnicodeRange[]} ranges + * @returns {UnicodeDataEncoding} + */ +export function encodeUnicodeData(ranges) { + /** @type {number[]} */ + let buf = []; + for (let [from, to] of ranges) { + let pad = to - from; + buf.push(from, pad); + } + return /** @type {UnicodeDataEncoding} */ ( + buf.map(x => x ? x.toString(36) : '').join(',') + ); +} diff --git a/scripts/unicode.js b/scripts/unicode.js index 7473b0d..22d710e 100755 --- a/scripts/unicode.js +++ b/scripts/unicode.js @@ -23,9 +23,11 @@ /** * @import { WriteStream } from 'node:fs'; - * @import { UnicodeRange, CategorizedUnicodeRange } from '../src/core.js'; + * @import { UnicodeRange } from '#src/core.js'; * * @typedef {number[]} UnicodeValues + * + * @typedef {[from: number, to: number, categoryName: string]} CategorizedUnicodeRange */ import * as assert from 'node:assert/strict'; @@ -34,6 +36,8 @@ import { existsSync, createWriteStream } from 'node:fs'; import * as fs from 'node:fs/promises'; import { fileURLToPath } from 'node:url'; +import { encodeUnicodeData } from './lib/encoding.js'; + let __dirname = path.dirname(fileURLToPath(import.meta.url)); let srcPath = path.resolve(__dirname, '../src'); let testPath = path.resolve(__dirname, '../test'); @@ -52,7 +56,7 @@ const UNICODE_VERSION_STRING = UNICODE_VERSION.join('.'); // these are the surrogate codepoints, 
which are not valid rust characters /** @type {UnicodeRange} */ -let surrogateCodepoints = [0xd800, 0xdfff]; +let surrogateCodepoints = [0xd800, 0xdfff, 0]; /** @type {Record} */ let expandedCategories = { @@ -133,38 +137,6 @@ let isSurrogate = n => { return surrogateCodepoints[0] <= n && n <= surrogateCodepoints[1]; }; -/** - * @template T - * @param {Set} a - * @param {Set} b - * @return {Set} - */ -let difference = (a, b) => { - const result = new Set(a); - if (a.size <= b.size) { - for (const elem of a) { - if (b.has(elem)) { - result.delete(elem); - } - } - } else { - for (const elem of b.keys()) { - if (result.has(elem)) { - result.delete(elem); - } - } - } - return result; -}; - -/** - * @param {UnicodeRange[]} ranges - * @return {string} - */ -let encodeRanges = ranges => { - return ranges.map(x => `${x[0] ? x[0].toString(36) : ''},${x[1] ? x[1].toString(36) : ''}`).join(','); -}; - /** * @param {UnicodeRange[]} ranges * @return {UnicodeValues} @@ -172,9 +144,9 @@ let encodeRanges = ranges => { let ungroupCat = ranges => { /** @type {UnicodeValues} */ let catOut = []; - for (let [point, padding] of ranges) { - let cur = point; - while (cur <= point + padding) { + for (let [from, to] of ranges) { + let cur = from; + while (cur <= to) { catOut.push(cur); cur += 1; } @@ -201,11 +173,11 @@ let groupCat = values => { if (letter === curEnd + 1) { curEnd = letter; } else { - catOut.push([curStart, curEnd - curStart]); + catOut.push([curStart, curEnd, 0]); curStart = curEnd = letter; } } - catOut.push([curStart, curEnd - curStart]); + catOut.push([curStart, curEnd, 0]); return catOut; }; @@ -262,7 +234,7 @@ let parseGencats = (data) => { old, iso, upcase, lowcase, titlecase] = data; // place letter in categories as appropriate - for (let cat of [gencat, "Assigned"].concat(expandedCategories[gencat] || [])) { + for (let cat of [gencat, 'Assigned'].concat(expandedCategories[gencat] || [])) { gencats[cat] ||= []; gencats[cat].push(Number.parseInt(code)); } @@ -304,7 
+276,7 @@ let parseProperties = (data, interestingProps) => { let hi = Number.parseInt(d_hi, 16); props[propValue] ||= []; - props[propValue].push([lo, hi - lo]); + props[propValue].push([lo, hi, 0]); } for (let [key, ranges] of Object.entries(props)) { @@ -458,54 +430,26 @@ let printTableRaw = (f, name, table, format) => { /** * @param {WriteStream} f - * @param {CategorizedUnicodeRange[]} breakTable + * @param {CategorizedUnicodeRange[]} breakTable * @param {string[]} breakCats * @param {string} name * @returns */ let printBreakModule = (f, breakTable, breakCats, name) => { - let cats = [...breakCats, 'Any'].toSorted(); + let cats = ['Any', ...breakCats.toSorted()]; let capitalName = capitalize(name); let typeName = `${capitalName}Category`; let keyTypeName = `${typeName}Key`; let numTypeName = `${typeName}Num`; - - // We don't want the lookup table to be too large so choose a reasonable - // cutoff. 0x20000 is selected because most of the range table entries are - // within the interval of [0x0, 0x20000] - let lookupValueCutoff = 0x20000; - - // Length of lookup table. It has to be a divisor of `lookup_value_cutoff`. 
- let lookupTableLen = 0x80; - - let lookupInterval = Math.round(lookupValueCutoff / lookupTableLen); - - let lookupTable = Array.from({ length: lookupTableLen }, _ => 0); - let j = 0; - for (let i of range(0, lookupTableLen)) { - let lookupFrom = i * lookupInterval; - while (j < breakTable.length) { - let [entryPoint, entryPadding] = breakTable[j]; - if (entryPoint + entryPadding >= lookupFrom) { - break; - } - j += 1; - } - lookupTable[i] = j; - } + let rangeTypeName = `${typeName}Range`; f.write(preamble); f.write(` -import { - searchUnicodeRange, - initLookupTableBuffer, - initUnicodeRangeBuffer, -} from './core.js'; +import { decodeUnicodeData } from './core.js'; /** - * @typedef {import('./core.js').LookupTableEncoding} LookupTableEncoding - * @typedef {import('./core.js').UnicodeRangeEncoding} UnicodeRangeEncoding + * @typedef {import('./core.js').UnicodeDataEncoding} UnicodeDataEncoding */ /** @@ -527,7 +471,7 @@ import { f.write(` /** - * @typedef {import('./core.js').CategorizedUnicodeRange<${numTypeName}>} ${typeName}Range + * @typedef {import('./core.js').CategorizedUnicodeRange<${numTypeName}>} ${rangeTypeName} */ `, ); @@ -561,47 +505,14 @@ export const ${typeName} = { f.write('};\n'); f.write(` -export const ${name}_buffer = initUnicodeRangeBuffer( - Array(${breakTable.length * 2}), - /** @type {UnicodeRangeEncoding} */ - ('${breakTable.map(x => `${x[0] === 0 ? '' : x[0].toString(36)},${x[1] === 0 ? '' : x[1].toString(36)}`).join(',')}') -); - -export const ${name}_cats = initLookupTableBuffer( - Array(${breakTable.length}), - /** @type {LookupTableEncoding} */ - ('${breakTable.map(x => inversed[x[2]].toString(36)).join('')}') -); - -const ${name}_lookup = initLookupTableBuffer( - Array(${lookupTable.length}), - /** @type {LookupTableEncoding} */ - ('${lookupTable.map(x => x === 0 ? '' : x.toString(36)).join(',')}'), - ',' -); - /** - * @param {number} cp - * @return Index of {@link ${name}_ranges} if found, or negation of last visited low cursor. 
+ * @type {${rangeTypeName}[]} */ -export function find${capitalName}Index(cp) { - // Perform a quick O(1) lookup in a precomputed table to determine - // the slice of the range table to search in. - let lookup_table = ${name}_lookup; - let lookup_interval = ${lookupInterval}; - - let idx = cp / lookup_interval | 0; - // If the \`idx\` is outside of the precomputed table - use the slice - // starting from the last covered index in the precomputed table and - // ending with the length of the range table. - let sliceFrom = ${j}, sliceTo = ${breakTable.length}; - if (idx + 1 < lookup_table.length) { - sliceFrom = lookup_table[idx]; - sliceTo = lookup_table[idx + 1] + 1; - } - - return searchUnicodeRange(cp, ${name}_buffer, sliceFrom * 2, sliceTo * 2); -} +export const ${name}_ranges = decodeUnicodeData( + /** @type {UnicodeDataEncoding} */ + ('${encodeUnicodeData(breakTable.map(row => [row[0], row[1], 0]))}'), + '${breakTable.map(row => inversed[row[2]].toString(36)).join('')}', +); `, ); }; @@ -616,19 +527,21 @@ let printIncbModule = async f => { f.write(preamble); f.write(` -import { initUnicodeRangeBuffer } from './core.js'; +import { decodeUnicodeData } from './core.js'; /** - * @typedef {import('./core.js').UnicodeRangeEncoding} UnicodeRangeEncoding + * @typedef {import('./core.js').UnicodeRange} UnicodeRange + * @typedef {import('./core.js').UnicodeDataEncoding} UnicodeDataEncoding */ /** * The Unicode \`Indic_Conjunct_Break=Consonant\` derived property table + * + * @type {UnicodeRange[]} */ -export const consonant_buffer = initUnicodeRangeBuffer( - Array(${table.length * 2}), - /** @type {UnicodeRangeEncoding} */ - ('${table.map(x => `${x[0] ? x[0].toString(36) : ''},${x[1] ? 
x[1].toString(36) : ''}`).join(',')}') +export const consonant_ranges = decodeUnicodeData( + /** @type {UnicodeDataEncoding} */ + ('${encodeUnicodeData(table)}') ); `, ); @@ -651,44 +564,41 @@ let printGeneralModule = async f => { f.write(preamble); f.write(` -import { initUnicodeRangeBuffer } from './core.js'; +import { decodeUnicodeData } from './core.js'; /** - * @typedef {import('./core.js').UnicodeRangeBuffer} UnicodeRangeBuffer - * @typedef {import('./core.js').UnicodeRangeEncoding} UnicodeRangeEncoding + * @typedef {import('./core.js').UnicodeRange} UnicodeRange + * @typedef {import('./core.js').UnicodeDataEncoding} UnicodeDataEncoding */ /** * The Unicode \`L\` (Letter) properties data * - * @type {UnicodeRangeBuffer} + * @type {UnicodeRange[]} */ -export const letter_buffer = initUnicodeRangeBuffer( - Array(${gencats['L'].length * 2}), - /** @type {UnicodeRangeEncoding} */ - ('${encodeRanges(gencats['L'])}') +export const letter_ranges = decodeUnicodeData( + /** @type {UnicodeDataEncoding} */ + ('${encodeUnicodeData(gencats['L'])}') ); /** * The Unicode \`N\` (Numeric) properties data * - * @type {UnicodeRangeBuffer} + * @type {UnicodeRange[]} */ -export const numeric_buffer = initUnicodeRangeBuffer( - Array(${gencats['N'].length * 2}), - /** @type {UnicodeRangeEncoding} */ - ('${encodeRanges(gencats['N'])}') +export const numeric_ranges = decodeUnicodeData( + /** @type {UnicodeDataEncoding} */ + ('${encodeUnicodeData(gencats['N'])}') ); /** * The Unicode \`Alphabetic\` properties data * - * @type {UnicodeRangeBuffer} + * @type {UnicodeRange[]} */ -export const alphabetic_buffer = initUnicodeRangeBuffer( - Array(${derived['Alphabetic'].length * 2}), - /** @type {UnicodeRangeEncoding} */ - ('${encodeRanges(derived['Alphabetic'])}') +export const alphabetic_ranges = decodeUnicodeData( + /** @type {UnicodeDataEncoding} */ + ('${encodeUnicodeData(derived['Alphabetic'])}') ); `, ); @@ -703,33 +613,31 @@ let printEmojiModule = async f => { f.write(preamble); 
f.write(` -import { initUnicodeRangeBuffer } from './core.js'; +import { decodeUnicodeData } from './core.js'; /** - * @typedef {import('./core.js').UnicodeRangeBuffer} UnicodeRangeBuffer - * @typedef {import('./core.js').UnicodeRangeEncoding} UnicodeRangeEncoding + * @typedef {import('./core.js').UnicodeRange} UnicodeRange + * @typedef {import('./core.js').UnicodeDataEncoding} UnicodeDataEncoding */ /** * The Unicode \`Emoji_Presentation\` properties data * - * @type {UnicodeRangeBuffer} + * @type {UnicodeRange[]} */ -export const emoji_presentation_buffer = initUnicodeRangeBuffer( - Array(${emojiProps['Emoji_Presentation'].length * 2}), - /** @type {UnicodeRangeEncoding} */ - ('${encodeRanges(emojiProps['Emoji_Presentation'])}') +export const emoji_presentation_ranges = decodeUnicodeData( + /** @type {UnicodeDataEncoding} */ + ('${encodeUnicodeData(emojiProps['Emoji_Presentation'])}') ); /** * The Unicode \`Extended_Pictographic\` properties data * - * @type {UnicodeRangeBuffer} + * @type {UnicodeRange[]} */ -export const extended_pictographic_buffer = initUnicodeRangeBuffer( - Array(${emojiProps['Extended_Pictographic'].length * 2}), - /** @type {UnicodeRangeEncoding} */ - ('${encodeRanges(emojiProps['Extended_Pictographic'])}') +export const extended_pictographic_ranges = decodeUnicodeData( + /** @type {UnicodeDataEncoding} */ + ('${encodeUnicodeData(emojiProps['Extended_Pictographic'])}') ); `, ); @@ -885,16 +793,15 @@ let graphemeCats = parseProperties(graphemeData); // Note: // This category also includes Cs (surrogate codepoints). 
// We have to remove Cs from the Control category -graphemeCats["Control"] = groupCat(Array.from( - difference( - new Set(ungroupCat(graphemeCats["Control"])), - new Set(ungroupCat([[surrogateCodepoints[0], surrogateCodepoints[1] - surrogateCodepoints[0]]])), +graphemeCats['Control'] = groupCat(Array.from( + new Set(ungroupCat(graphemeCats['Control'])).difference( + new Set(ungroupCat([[surrogateCodepoints[0], surrogateCodepoints[1], 0]])), ), )); let emojiProps = parseProperties(emojiData, ['Extended_Pictographic']); -/** @type {CategorizedUnicodeRange[]} */ +/** @type {CategorizedUnicodeRange[]} */ let graphemeTable = []; for (let [cat, ranges] of Object.entries(graphemeCats)) { for (let [from, to] of ranges) { @@ -917,7 +824,7 @@ for (let chars of graphemeTable) { } // let wordCats = parseProperties(wordData); -// /** @type {CategorizedUnicodeRange[]} */ +// /** @type {CategorizedUnicodeRange[]} */ // let wordTable = []; // for (let [cat, ranges] of Object.entries(wordCats)) { // for (let [from, to] of ranges) { @@ -927,7 +834,7 @@ for (let chars of graphemeTable) { // wordTable.sort((a, b) => a[0] - b[0]); // let sentenceCats = parseProperties(sentenceData); -// /** @type {CategorizedUnicodeRange[]} */ +// /** @type {CategorizedUnicodeRange[]} */ // let sentenceTable = []; // for (let [cat, ranges] of Object.entries(sentenceCats)) { // for (let [from, to] of ranges) { diff --git a/src/_emoji_data.js b/src/_emoji_data.js index b89f8bb..6a3cf84 100644 --- a/src/_emoji_data.js +++ b/src/_emoji_data.js @@ -3,31 +3,29 @@ // // @ts-check -import { initUnicodeRangeBuffer } from './core.js'; +import { decodeUnicodeData } from './core.js'; /** - * @typedef {import('./core.js').UnicodeRangeBuffer} UnicodeRangeBuffer - * @typedef {import('./core.js').UnicodeRangeEncoding} UnicodeRangeEncoding + * @typedef {import('./core.js').UnicodeRange} UnicodeRange + * @typedef {import('./core.js').UnicodeDataEncoding} UnicodeDataEncoding */ /** * The Unicode `Emoji_Presentation` 
properties data * - * @type {UnicodeRangeBuffer} + * @type {UnicodeRange[]} */ -export const emoji_presentation_buffer = initUnicodeRangeBuffer( - Array(160), - /** @type {UnicodeRangeEncoding} */ +export const emoji_presentation_ranges = decodeUnicodeData( + /** @type {UnicodeDataEncoding} */ ('6xm,1,73d,3,73k,,73n,,7i5,1,7is,1,7k8,b,7lr,,7mb,,7mp,,7my,1,7nh,1,7no,1,7ny,,7o4,,7oq,,7oy,1,7p1,,7p6,,7p9,,7ph,,7pm,1,7qg,,7rg,,7ri,,7rn,2,7rr,,7th,2,7u8,,7un,,8ij,1,8k0,,8k5,,2pz8,,2q4v,,2qa6,,2qa9,9,2qcm,p,2qdd,,2qe2,,2qen,,2qeq,4,2qew,2,2qfk,1,2qkg,w,2qlp,8,2qlz,1x,2qny,l,2qow,16,2qq7,4,2qqo,g,2qr8,,2qrc,1y,2qtc,,2qte,56,2qyn,1q,2r0r,3,2r0w,n,2r22,,2r2t,1,2r38,,2r5n,2c,2r9c,1x,2rbg,,2rbk,2,2rbp,2,2rbw,3,2rcb,1,2rck,8,2rj4,b,2rjk,,2rrg,1a,2rss,9,2rt3,54,2s1c,c,2s1s,9,2s27,1j,2s3y,e,2s4f,a,2s4w,8') ); /** * The Unicode `Extended_Pictographic` properties data * - * @type {UnicodeRangeBuffer} + * @type {UnicodeRange[]} */ -export const extended_pictographic_buffer = initUnicodeRangeBuffer( - Array(156), - /** @type {UnicodeRangeEncoding} */ +export const extended_pictographic_ranges = decodeUnicodeData( + /** @type {UnicodeDataEncoding} */ ('4p,,4u,,6d8,,6dl,,6jm,,6k9,,6ms,5,6nd,1,6xm,1,6y0,,70o,,72n,,73d,a,73s,2,79e,,7fu,1,7g6,,7gg,,7i3,3,7i8,5,7if,b,7is,35,7m8,39,7pk,a,7pw,,7py,,7q5,,7q9,,7qg,,7qr,1,7r8,,7rb,,7rg,,7ri,,7rn,2,7rr,,7s3,4,7th,2,7tt,,7u8,,7un,,850,1,8hx,2,8ij,1,8k0,,8k5,,9io,,9j1,,9zr,,9zt,,2pz4,73,2q6l,2,2q7j,,2q98,5,2q9q,1,2qa6,,2qa9,9,2qb1,1k,2qdd,e,2qe2,,2qen,,2qeq,8,2qf0,3,2qfd,c1,2qrk,8t,2r0m,7d,2r9c,3j,2rg4,b,2rit,16,2rkc,3,2rm0,7,2rmi,5,2rns,7,2rou,29,2rrg,1a,2rss,9,2rt3,c8,2scg,sd') ); diff --git a/src/_general_data.js b/src/_general_data.js index 1ec699d..4258ccd 100644 --- a/src/_general_data.js +++ b/src/_general_data.js @@ -3,42 +3,39 @@ // // @ts-check -import { initUnicodeRangeBuffer } from './core.js'; +import { decodeUnicodeData } from './core.js'; /** - * @typedef {import('./core.js').UnicodeRangeBuffer} UnicodeRangeBuffer - * @typedef 
{import('./core.js').UnicodeRangeEncoding} UnicodeRangeEncoding + * @typedef {import('./core.js').UnicodeRange} UnicodeRange + * @typedef {import('./core.js').UnicodeDataEncoding} UnicodeDataEncoding */ /** * The Unicode `L` (Letter) properties data * - * @type {UnicodeRangeBuffer} + * @type {UnicodeRange[]} */ -export const letter_buffer = initUnicodeRangeBuffer( - Array(1354), - /** @type {UnicodeRangeEncoding} */ +export const letter_ranges = decodeUnicodeData( + /** @type {UnicodeDataEncoding} */ ('1t,p,2p,p,4q,,51,,56,,5c,m,60,u,6w,cp,jq,b,kg,4,ks,,ku,,og,4,om,1,oq,3,ov,,p2,,p4,2,p8,,pa,j,pv,2a,s7,3u,wa,4l,10x,11,121,,128,14,15c,q,167,3,17k,16,19q,1,19t,2q,1cl,,1d1,1,1da,1,1dm,2,1dr,,1e8,,1ea,t,1fx,2g,1ip,,1je,w,1kk,1,1kq,,1kw,l,1lm,,1lw,,1m0,,1mo,o,1nk,a,1o0,n,1op,5,1pc,15,1s4,1h,1tp,,1u8,,1ug,9,1v5,f,1vp,7,1vz,1,1w3,l,1wq,6,1wy,,1x2,3,1x9,,1xq,,1y4,1,1y7,2,1yo,1,1z0,,1z9,5,1zj,1,1zn,l,20a,6,20i,1,20l,1,20o,1,21l,3,21q,,22a,2,22t,8,233,2,237,l,23u,6,242,1,245,4,24d,,24w,,25c,1,261,,26d,7,26n,1,26r,l,27e,6,27m,1,27p,4,27x,,28s,1,28v,2,29d,,29v,,29x,5,2a6,2,2aa,3,2ah,1,2ak,,2am,1,2ar,1,2aw,2,2b2,b,2c0,,2dh,7,2dq,2,2du,m,2ei,f,2f1,,2fs,2,2fx,,2g0,1,2gw,,2h1,7,2ha,2,2he,m,2i2,9,2id,4,2il,,2jh,1,2jk,1,2k1,1,2kk,8,2ku,2,2ky,14,2m5,,2mm,,2ms,2,2n3,2,2nu,5,2o5,h,2oq,n,2pf,8,2pp,,2ps,6,2rl,1b,2sy,1,2tc,6,2v5,1,2v8,,2va,4,2vg,n,2w5,,2w7,9,2wi,1,2wt,,2ww,4,2x2,,2xo,3,2yo,,30g,7,30p,z,32g,4,35s,16,37j,,380,5,38a,3,38h,,38l,1,38u,2,391,c,39q,,3a8,11,3bb,,3bh,,3bk,16,3cs,98,3m2,3,3m8,6,3mg,,3mi,3,3mo,14,3nu,3,3o0,w,3oy,3,3p4,6,3pc,,3pe,3,3pk,e,3q0,1k,3rm,3,3rs,1u,3uo,f,3vk,2d,3y0,5,3y9,h7,4fj,g,4g1,p,4gw,22,4j5,7,4jk,h,4kf,i,4lc,h,4m8,c,4mm,2,4n4,1f,4pj,,4po,,4rk,2g,4u8,4,4uf,x,4ve,,4vk,1x,4xs,u,500,t,50w,4,51c,17,52o,p,54w,m,55s,1g,59j,,5c5,1a,5dx,7,5fn,t,5gu,1,5h6,17,5j4,z,5l9,2,5lm,z,5mo,a,5n4,16,5od,2,5pl,3,5pq,5,5px,1,5q2,,5q8,5b,5xc,7p,654,5,65c,11,66g,5,66o,7,66x,,66z,,671,,673,u,680,1g,69i,6,69q,,69u,2,69y,6,6a8,3,6ae,5,6ao,c,6b6,2,6ba,6,6ep,,6f3,,6fk,c,6iq,,6iv,,6i
y,9,6j9,,6jd,4,6jo,,6jq,,6js,,6ju,3,6jz,a,6kc,3,6kl,4,6ku,,6mb,1,8ow,6c,8vf,3,8vm,1,8w0,11,8x3,,8x9,,8xc,1j,8z3,,8zk,m,90g,6,90o,6,90w,6,914,6,91c,6,91k,6,91s,6,920,6,94f,,9hh,1,9ip,4,9iz,1,9j5,2d,9lp,2,9lt,2h,9oc,3,9ol,16,9pt,2l,9sw,v,9v4,f,a9s,533,feo,h3g,wk0,19,wlc,7g,wsw,f,wtm,1,wu8,1a,wvz,u,www,1x,x07,8,x0i,2u,x3f,1u,x5c,1,x5f,,x5h,7,x6a,f,x6r,2,x6v,3,x70,m,x8g,1f,xaa,1d,xde,5,xdn,,xdp,1,xe2,r,xf4,m,xgg,s,xhg,1a,xjj,,xk0,4,xk6,9,xkq,4,xkw,14,xmo,2,xms,7,xnk,m,xoa,,xoe,1d,xpt,,xpx,1,xq1,4,xq8,,xqa,,xqz,2,xr4,a,xrm,2,xs1,5,xs9,5,xsh,5,xsw,6,xt4,6,xtc,16,xuk,d,xv4,36,xz4,8mb,16ls,m,16mj,1c,1d6o,a5,1dgw,2x,1dkw,6,1dlf,4,1dlp,,1dlr,9,1dm2,c,1dmg,4,1dmm,,1dmo,1,1dmr,1,1dmu,2z,1dqr,a2,1e1c,1r,1e36,1h,1e5s,b,1e9c,4,1e9i,3q,1ee9,p,1ef5,p,1eg6,2g,1eiq,5,1eiy,5,1ej6,5,1eje,2,1ekg,b,1ekt,p,1elk,i,1em4,1,1em7,e,1emo,d,1eo0,3e,1f28,s,1f34,1c,1f5s,v,1f71,j,1f7m,7,1f80,11,1f9c,t,1fa8,z,1fbc,7,1fcw,4d,1fhs,z,1fiw,z,1fk0,13,1flc,1f,1fn4,a,1fng,e,1fnw,6,1fo4,1,1fo7,a,1foj,e,1foz,6,1fp7,1,1fpc,1f,1fr4,8m,1g00,l,1g0w,7,1g1s,5,1g1z,15,1g36,8,1g5c,5,1g5k,,1g5m,17,1g6v,1,1g70,,1g73,m,1g80,m,1g8w,u,1gbk,i,1gc4,1,1gcg,l,1gdc,p,1gg0,1j,1ghq,1,1gjk,,1gk0,3,1gk5,2,1gk9,s,1gm8,s,1gn4,s,1gow,7,1gp5,r,1gqo,1h,1gsg,l,1gtc,i,1gu8,h,1gxs,20,1h1c,1e,1h34,1e,1h4w,z,1h6y,r,1h7z,m,1hfk,15,1hgw,1,1hhe,2,1hj4,s,1hk7,,1hkg,l,1hm8,h,1ho0,k,1hpc,m,1hqb,1g,1htd,1,1hth,,1htv,18,1hw0,o,1hxf,z,1hz8,,1hzb,,1hzk,y,1i0m,,1i0z,1b,1i2p,3,1i3e,,1i3g,,1i4g,h,1i4z,o,1i67,1,1i80,6,1i88,,1i8a,3,1i8f,e,1i8v,9,1i9c,1a,1ibp,7,1ibz,1,1ic3,l,1icq,6,1icy,1,1id1,4,1id9,,1ids,,1ie5,4,1if4,9,1iff,,1ifi,,1ifk,11,1ign,,1ihd,,1ihf,,1iio,1g,1ikn,3,1ilb,2,1im8,1b,1io4,1,1io7,,1itc,1a,1ivs,3,1iww,1b,1iys,,1j0g,16,1j20,,1j40,q,1j5s,6,1jb4,17,1jfk,1r,1ji7,7,1jih,,1jik,7,1jit,1,1jiw,n,1jjz,,1jk1,,1jmo,7,1jmy,12,1joh,,1joj,,1jpc,,1jpn,13,1jqy,,1jrk,,1jrw,19,1jtp,,1ju8,20,1k1s,w,1k3k,8,1k3u,10,1k5c,,1k6q,t,1kao,6,1kaw,1,1kaz,11,1kcm,,1kdc,5,1kdj,1,1kdm,v,1kew,,1ko0,i,1koy,,1kp0,c,1kpe,x,1kts,,1kw0,pl,1ls0,5f,1nyo,2o,1o1s,tr,1ow1,5,1oww,3
2y,1s00,g6,1xq8,t,1z40,fs,1zk0,u,1zlc,26,1zo0,t,1zpc,1b,1zr4,3,1zs3,k,1zst,i,205c,18,20cg,1r,20hs,22,20k0,,20lv,c,20o0,1,20o3,,20ow,4qf,25fk,yd,26f3,9,2dbk,3,2dbp,6,2dbx,1,2dc0,82,2dki,,2dlc,2,2dlh,,2dlw,3,2dm8,az,2fpc,2y,2fsg,c,2fsw,8,2ftc,9,2kg0,2c,2kie,1y,2kke,1,2kki,,2kkl,1,2kkp,3,2kku,b,2kl7,,2kl9,6,2klh,1s,2knb,3,2knh,7,2knq,6,2kny,r,2kor,3,2kow,4,2kp2,,2kp6,6,2kpe,9f,2kyw,o,2kzm,o,2l0c,u,2l18,o,2l1y,u,2l2u,o,2l3k,u,2l4g,o,2l56,u,2l62,o,2l6s,7,2mm8,u,2mn9,5,2muo,1p,2n0g,18,2n1z,6,2n2m,,2nbk,t,2ncw,17,2nrk,r,2nyo,t,2nzk,,2odc,6,2odk,3,2odp,1,2ods,e,2oe8,5g,2olc,1v,2onf,,2pkw,3,2pl1,q,2plt,1,2plw,,2plz,,2pm1,9,2pmc,3,2pmh,,2pmj,,2pmq,,2pmv,,2pmx,,2pmz,,2pn1,2,2pn5,1,2pn8,,2pnb,,2pnd,,2pnf,,2pnh,,2pnj,,2pnl,1,2pno,,2pnr,3,2pnw,6,2po4,3,2po9,3,2poe,,2pog,9,2por,g,2ppd,2,2pph,4,2ppn,g,2t4w,wyn,3q4g,37d,3tc0,65,3ti8,4g1,3xyo,5rk,43qo,h9,464g,f1,47pc,3t6,4bio,38f') ); /** * The Unicode `N` (Numeric) properties data * - * @type {UnicodeRangeBuffer} + * @type {UnicodeRange[]} */ -export const numeric_buffer = initUnicodeRangeBuffer( - Array(288), - /** @type {UnicodeRangeEncoding} */ +export const numeric_ranges = decodeUnicodeData( + /** @type {UnicodeDataEncoding} */ 
('1c,9,4y,1,55,,58,2,19c,9,1dc,9,1j4,9,1uu,9,1ye,9,1ys,5,21y,9,25i,9,292,9,29e,5,2cm,c,2g6,9,2go,6,2jq,9,2mw,6,2na,i,2qu,9,2ts,9,2xc,9,2zk,j,37k,9,39s,9,3u1,j,4j2,2,4ps,9,4q8,9,4r4,9,4zq,9,53k,a,58g,9,58w,9,5e8,9,5gw,9,5kw,9,5lc,9,6eo,,6es,5,6f4,9,6kw,1e,6md,4,76o,1n,7ai,l,7sm,t,8vx,,9hj,,9i9,8,9iw,2,9si,3,9wg,9,9xk,7,9xt,e,9z4,9,a0h,e,wtc,9,wyu,9,x80,5,xcg,9,xds,9,xjk,9,xkg,9,xn4,9,xyo,9,1eds,9,1err,18,1etc,1k,1eve,1,1f4x,q,1f6o,3,1f7l,,1f7u,,1fbl,4,1fhc,9,1g7s,7,1g8p,6,1g9z,8,1gcb,4,1gd2,5,1gho,1,1ghs,f,1gia,19,1glc,8,1gn1,1,1gnx,2,1gq3,4,1gt4,7,1gu0,7,1gvd,6,1h4q,5,1h68,9,1h6o,9,1heo,u,1hjx,9,1hld,3,1hol,6,1hsi,t,1hww,9,1hyu,9,1i34,9,1i3l,j,1ib4,9,1ikw,9,1iog,9,1iz4,9,1j28,9,1j2o,j,1j5c,b,1jhc,i,1jkg,9,1k34,9,1k5s,s,1kcw,9,1kf4,9,1kr4,9,1ku8,k,1log,32,1xrk,9,1zkw,9,1znk,9,1zrk,9,1zrv,6,206o,9,20e8,m,2j1s,9,2k74,j,2k80,j,2kbk,o,2l72,1d,2n28,9,2ne8,9,2nsg,9,2nzl,9,2ojr,8,2onk,9,2p9t,1m,2pbh,2,2pbl,3,2pdt,18,2pf3,e,2q68,c,2sc0,9') ); /** * The Unicode `Alphabetic` properties data * - * @type {UnicodeRangeBuffer} + * @type {UnicodeRange[]} */ -export const alphabetic_buffer = initUnicodeRangeBuffer( - Array(1514), - /** @type {UnicodeRangeEncoding} */ +export const alphabetic_ranges = decodeUnicodeData( + /** @type {UnicodeDataEncoding} */ 
('1t,p,2p,p,4q,,51,,56,,5c,m,60,u,6w,cp,jq,b,kg,4,ks,,ku,,n9,,o3,h,om,1,oq,3,ov,,p2,,p4,2,p8,,pa,j,pv,2a,s7,3u,wa,4l,10x,11,121,,128,14,14g,d,14v,,14x,1,150,1,153,,15c,q,167,3,174,a,17k,1j,195,6,19q,2t,1cl,7,1cx,7,1d9,2,1dm,2,1dr,,1e8,1b,1fx,2s,1je,w,1kk,1,1kq,,1kw,n,1lm,i,1mo,o,1nk,a,1o0,n,1op,5,1p3,,1pc,15,1qs,b,1r7,6,1rk,23,1tp,f,1u6,2,1ud,e,1v5,i,1vp,7,1vz,1,1w3,l,1wq,6,1wy,,1x2,3,1x9,7,1xj,1,1xn,1,1xq,,1xz,,1y4,1,1y7,4,1yo,1,1z0,,1z5,2,1z9,5,1zj,1,1zn,l,20a,6,20i,1,20l,1,20o,1,20u,4,213,1,217,1,21d,,21l,3,21q,,228,5,22p,2,22t,8,233,2,237,l,23u,6,242,1,245,4,24d,8,24n,2,24r,1,24w,,25c,3,261,3,269,2,26d,7,26n,1,26r,l,27e,6,27m,1,27p,4,27x,7,287,1,28b,1,28m,1,28s,1,28v,4,29d,,29u,1,29x,5,2a6,2,2aa,3,2ah,1,2ak,,2am,1,2ar,1,2aw,2,2b2,b,2bi,4,2bq,2,2bu,2,2c0,,2c7,,2dc,c,2dq,2,2du,m,2ei,f,2f1,7,2fa,2,2fe,2,2fp,1,2fs,2,2fx,,2g0,3,2gw,3,2h1,7,2ha,2,2he,m,2i2,9,2id,4,2il,7,2iu,2,2iy,2,2j9,1,2jh,1,2jk,3,2k1,2,2kg,c,2ku,2,2ky,14,2m5,7,2me,2,2mi,2,2mm,,2ms,3,2n3,4,2nu,5,2o1,2,2o5,h,2oq,n,2pf,8,2pp,,2ps,6,2q7,5,2qe,,2qg,7,2r6,1,2rl,1l,2tc,6,2tp,,2v5,1,2v8,,2va,4,2vg,n,2w5,,2w7,i,2wr,2,2ww,4,2x2,,2x9,,2xo,3,2yo,,30g,7,30p,z,31t,i,32g,f,32x,z,35s,1i,37c,,37f,4,380,1r,3a2,3,3a8,11,3bb,,3bh,,3bk,16,3cs,98,3m2,3,3m8,6,3mg,,3mi,3,3mo,14,3nu,3,3o0,w,3oy,3,3p4,6,3pc,,3pe,3,3pk,e,3q0,1k,3rm,3,3rs,1u,3uo,f,3vk,2d,3y0,5,3y9,h7,4fj,g,4g1,p,4gw,22,4j2,a,4jk,j,4kf,k,4lc,j,4m8,c,4mm,2,4mq,1,4n4,1f,4om,i,4pj,,4po,,4rk,2g,4u8,16,4vk,1x,4xs,u,4yo,b,4z4,8,500,t,50w,4,51c,17,52o,p,54w,r,55s,1q,57l,j,59j,,5a7,1,5ak,2,5c0,1f,5dh,e,5dx,7,5fk,15,5gs,3,5h6,17,5if,a,5j4,1i,5l9,2,5lm,z,5mo,a,5n4,16,5od,2,5pl,3,5pq,5,5px,1,5q2,,5q8,5b,5w3,x,5xc,7p,654,5,65c,11,66g,5,66o,7,66x,,66z,,671,,673,u,680,1g,69i,6,69q,,69u,2,69y,6,6a8,3,6ae,5,6ao,c,6b6,2,6ba,6,6ep,,6f3,,6fk,c,6iq,,6iv,,6iy,9,6j9,,6jd,4,6jo,,6jq,,6js,,6ju,3,6jz,a,6kc,3,6kl,4,6ku,,6lc,14,792,1f,8ow,6c,8vf,3,8vm,1,8w0,11,8x3,,8x9,,8xc,1j,8z3,,8zk,m,90g,6,90o,6,90w,6,914,6,91c,6,91k,6,91s,6,920,6,928,v,94f,,9hh,2,9i9,8,9ip,4,9iw,4,9j5,2d,9lp,2,9lt,
2h,9oc,3,9ol,16,9pt,2l,9sw,v,9v4,f,a9s,533,feo,h3g,wk0,19,wlc,7g,wsw,f,wtm,1,wu8,1a,wvo,7,wvz,34,x07,8,x0i,2u,x3f,1u,x5c,1,x5f,,x5h,7,x6a,j,x6v,w,x8g,1f,xa8,1v,xc5,,xde,5,xdn,,xdp,2,xe2,w,xf4,y,xgg,s,xhc,1e,xis,b,xjj,,xk0,f,xkq,4,xkw,1i,xmo,d,xnk,m,xoa,1w,xq8,,xqa,,xqz,2,xr4,f,xrm,3,xs1,5,xs9,5,xsh,5,xsw,6,xt4,6,xtc,16,xuk,d,xv4,3e,xz4,8mb,16ls,m,16mj,1c,1d6o,a5,1dgw,2x,1dkw,6,1dlf,4,1dlp,b,1dm2,c,1dmg,4,1dmm,,1dmo,1,1dmr,1,1dmu,2z,1dqr,a2,1e1c,1r,1e36,1h,1e5s,b,1e9c,4,1e9i,3q,1ee9,p,1ef5,p,1eg6,2g,1eiq,5,1eiy,5,1ej6,5,1eje,2,1ekg,b,1ekt,p,1elk,i,1em4,1,1em7,e,1emo,d,1eo0,3e,1etc,1g,1f28,s,1f34,1c,1f5s,v,1f71,t,1f80,16,1f9c,t,1fa8,z,1fbc,7,1fbl,4,1fcw,4d,1fhs,z,1fiw,z,1fk0,13,1flc,1f,1fn4,a,1fng,e,1fnw,6,1fo4,1,1fo7,a,1foj,e,1foz,6,1fp7,1,1fpc,1f,1fr4,8m,1g00,l,1g0w,7,1g1s,5,1g1z,15,1g36,8,1g5c,5,1g5k,,1g5m,17,1g6v,1,1g70,,1g73,m,1g80,m,1g8w,u,1gbk,i,1gc4,1,1gcg,l,1gdc,p,1gg0,1j,1ghq,1,1gjk,3,1gjp,1,1gjw,7,1gk5,2,1gk9,s,1gm8,s,1gn4,s,1gow,7,1gp5,r,1gqo,1h,1gsg,l,1gtc,i,1gu8,h,1gxs,20,1h1c,1e,1h34,1e,1h4w,13,1h6y,r,1h7t,,1h7z,m,1hfk,15,1hgr,1,1hgw,1,1hhe,2,1hj0,,1hj4,s,1hk7,,1hkg,l,1hm8,h,1ho0,k,1hpc,m,1hq8,1x,1htd,4,1hts,1k,1hvm,,1hw0,o,1hxc,1e,1hz8,3,1hzk,y,1i0m,,1i0w,1r,1i2p,3,1i32,1,1i3e,,1i3g,,1i4g,h,1i4z,x,1i5z,,1i66,3,1i80,6,1i88,,1i8a,3,1i8f,e,1i8v,9,1i9c,1k,1ibk,3,1ibp,7,1ibz,1,1ic3,l,1icq,6,1icy,1,1id1,4,1id9,7,1idj,1,1idn,1,1ids,,1idz,,1ie5,6,1if4,9,1iff,,1ifi,,1ifk,11,1ign,9,1igy,,1ih1,,1ih3,3,1ih8,1,1ihd,,1ihf,,1iio,1t,1ikj,2,1ikn,3,1ilb,2,1im8,1t,1io4,1,1io7,,1itc,1h,1iuw,6,1ivs,5,1iww,1q,1iyo,,1iys,,1j0g,1h,1j20,,1j40,q,1j4t,d,1j5s,6,1jb4,1k,1jfk,1r,1ji7,7,1jih,,1jik,7,1jit,1,1jiw,t,1jjr,1,1jjv,1,1jjz,3,1jmo,7,1jmy,19,1joa,5,1joh,,1joj,1,1jpc,1e,1jqt,9,1jrk,1z,1jtp,,1ju8,20,1k1s,w,1k3k,8,1k3u,18,1k54,6,1k5c,,1k6q,t,1k7m,l,1k89,d,1kao,6,1kaw,1,1kaz,17,1kca,,1kcc,1,1kcf,2,1kcj,,1kcm,1,1kdc,5,1kdj,1,1kdm,10,1keo,1,1ker,3,1kew,,1ko0,m,1kow,g,1kpe,14,1kqm,2,1kts,,1kw0,pl,1log,32,1ls0,5f,1nyo,2o,1o1s,tr,1ow1,5,1oww,32y,1s00,g6,1xq8,1a,1z40,fs,1zk0,u,1zlc,26,
1zo0,t,1zpc,1b,1zr4,3,1zs3,k,1zst,i,205c,18,20cg,1r,20hs,22,20jz,1k,20lr,g,20o0,1,20o3,,20og,1,20ow,4qf,25fk,yd,26f3,9,2dbk,3,2dbp,6,2dbx,1,2dc0,82,2dki,,2dlc,2,2dlh,,2dlw,3,2dm8,az,2fpc,2y,2fsg,c,2fsw,8,2ftc,9,2ftq,,2kg0,2c,2kie,1y,2kke,1,2kki,,2kkl,1,2kkp,3,2kku,b,2kl7,,2kl9,6,2klh,1s,2knb,3,2knh,7,2knq,6,2kny,r,2kor,3,2kow,4,2kp2,,2kp6,6,2kpe,9f,2kyw,o,2kzm,o,2l0c,u,2l18,o,2l1y,u,2l2u,o,2l3k,u,2l4g,o,2l56,u,2l62,o,2l6s,7,2mm8,u,2mn9,5,2mtc,6,2mtk,g,2mu3,6,2mub,1,2mue,4,2muo,1p,2mxb,,2n0g,18,2n1z,6,2n2m,,2nbk,t,2ncw,17,2nrk,r,2nyo,t,2nzk,,2odc,6,2odk,3,2odp,1,2ods,e,2oe8,5g,2olc,1v,2onb,,2onf,,2pkw,3,2pl1,q,2plt,1,2plw,,2plz,,2pm1,9,2pmc,3,2pmh,,2pmj,,2pmq,,2pmv,,2pmx,,2pmz,,2pn1,2,2pn5,1,2pn8,,2pnb,,2pnd,,2pnf,,2pnh,,2pnj,,2pnl,1,2pno,,2pnr,3,2pnw,6,2po4,3,2po9,3,2poe,,2pog,9,2por,g,2ppd,2,2pph,4,2ppn,g,2q7k,p,2q8g,p,2q9c,p,2t4w,wyn,3q4g,37d,3tc0,65,3ti8,4g1,3xyo,5rk,43qo,h9,464g,f1,47pc,3t6,4bio,38f') ); diff --git a/src/_grapheme_data.js b/src/_grapheme_data.js index a54ff57..5b257f9 100644 --- a/src/_grapheme_data.js +++ b/src/_grapheme_data.js @@ -3,15 +3,10 @@ // // @ts-check -import { - searchUnicodeRange, - initLookupTableBuffer, - initUnicodeRangeBuffer, -} from './core.js'; +import { decodeUnicodeData } from './core.js'; /** - * @typedef {import('./core.js').LookupTableEncoding} LookupTableEncoding - * @typedef {import('./core.js').UnicodeRangeEncoding} UnicodeRangeEncoding + * @typedef {import('./core.js').UnicodeDataEncoding} UnicodeDataEncoding */ /** @@ -76,8 +71,9 @@ import { /** * Grapheme category enum * - * Note: The enum object is not actually `Object.freeze` - * because it increases 800 bytes of Brotli compression... Not sure why :P + * Note: + * The object isn't actually frozen + * because using `Object.freeze` increases 800 bytes on Brotli compression. 
* * @type {Readonly>} */ @@ -99,44 +95,11 @@ export const GraphemeCategory = { ZWJ: 14, }; -export const grapheme_buffer = initUnicodeRangeBuffer( - Array(2908), - /** @type {UnicodeRangeEncoding} */ - (',9,a,,b,1,d,,e,h,3j,w,4p,,4t,,4u,,lc,33,w3,6,13l,18,14v,,14x,1,150,1,153,,16o,5,174,a,17g,,18r,k,19s,,1cm,6,1ct,,1cv,5,1d3,1,1d6,3,1e7,,1e9,,1f4,q,1ie,a,1kb,8,1kt,,1li,3,1ln,8,1lx,2,1m1,4,1nd,2,1ow,1,1p3,8,1qi,n,1r6,,1r7,v,1s3,,1tm,,1tn,,1to,,1tq,2,1tt,7,1u1,3,1u5,,1u6,1,1u9,6,1uq,1,1vl,,1vm,1,1x8,,1xa,,1xb,1,1xd,3,1xj,1,1xn,1,1xp,,1xz,,1ya,1,1z2,,1z5,1,1z7,,20s,,20u,2,20x,1,213,1,217,2,21d,,228,1,22d,,22p,1,22r,,24c,,24e,2,24h,4,24n,1,24p,,24r,1,24t,,25e,1,262,5,269,,26a,1,27w,,27y,1,280,,281,3,287,1,28b,1,28d,,28l,2,28y,1,29u,,2bi,,2bj,,2bk,,2bl,1,2bq,2,2bu,2,2bx,,2c7,,2dc,,2dd,2,2dg,,2f0,,2f2,2,2f5,3,2fa,2,2fe,3,2fp,1,2g2,1,2gx,,2gy,1,2ik,,2im,,2in,1,2ip,,2iq,,2ir,1,2iu,2,2iy,3,2j9,1,2jm,1,2k3,,2kg,1,2ki,1,2m3,1,2m6,,2m7,1,2m9,3,2me,2,2mi,2,2ml,,2mm,,2mv,,2n6,1,2o1,,2o2,1,2q2,,2q7,,2q8,1,2qa,2,2qe,,2qg,6,2qn,,2r6,1,2sx,,2sz,,2t0,6,2tj,7,2wh,,2wj,,2wk,8,2x4,6,2zc,1,305,,307,,309,,30e,1,31t,d,327,,328,4,32e,1,32l,a,32x,z,346,,371,3,375,,376,5,37d,1,37f,1,37h,1,386,1,388,1,38e,2,38x,3,39e,,39g,,39h,1,39p,,3a5,,3cw,2n,3fk,1z,3hk,2f,3tp,2,4k2,3,4ky,2,4lu,1,4mq,1,4ok,1,4om,,4on,6,4ou,7,4p2,,4p3,1,4p5,a,4pp,,4qz,2,4r2,,4r3,,4ud,1,4vd,,4yo,2,4yr,3,4yv,1,4yx,2,4z4,1,4z6,,4z7,5,4zd,2,55j,1,55l,1,55n,,579,,57a,,57b,,57c,6,57k,,57m,,57p,7,57x,5,583,9,58f,,59s,u,5c0,3,5c4,,5dg,9,5dq,3,5du,2,5ez,8,5fk,1,5fm,,5gh,,5gi,3,5gm,1,5go,5,5ie,,5if,,5ig,1,5ii,2,5il,,5im,,5in,4,5k4,7,5kc,7,5kk,1,5km,1,5ow,2,5p0,c,5pd,,5pe,6,5pp,,5pw,,5pz,,5q0,1,5vk,1r,6bv,,6bw,,6bx,,6by,1,6co,6,6d8,,6dl,,6e8,f,6hc,w,6jm,,6k9,,6ms,5,6nd,1,6xm,1,6y0,,70o,,72n,,73d,a,73s,2,79e,,7fu,1,7g6,,7gg,,7i3,3,7i8,5,7if,b,7is,35,7m8,39,7pk,a,7pw,,7py,,7q5,,7q9,,7qg,,7qr,1,7r8,,7rb,,7rg,,7ri,,7rn,2,7rr,,7s3,4,7th,2,7tt,,7u8,,7un,,850,1,8hx,2,8ij,1,8k0,,8k5,,8vj,2,8zj,,928,v,9ii,5,9io,,9j1,,9ll,1,9zr,,9zt,,wvj,3,wvo,9,
wwu,1,wz4,1,x6q,,x6u,,x6z,,x7n,1,x7p,1,x7r,,x7w,,xa8,1,xbo,f,xc4,1,xcw,h,xdr,,xeu,7,xfr,a,xg2,,xg3,,xgg,s,xhc,2,xhf,,xir,,xis,1,xiu,3,xiy,1,xj0,1,xj2,1,xj4,,xk5,,xm1,5,xm7,1,xm9,1,xmb,1,xmd,1,xmr,,xn0,,xn1,,xoc,,xps,,xpu,2,xpz,1,xq6,1,xq9,,xrf,,xrg,1,xri,1,xrp,,xrq,,xyb,1,xyd,,xye,1,xyg,,xyh,1,xyk,,xyl,,xz4,,xz5,q,xzw,,xzx,q,y0o,,y0p,q,y1g,,y1h,q,y28,,y29,q,y30,,y31,q,y3s,,y3t,q,y4k,,y4l,q,y5c,,y5d,q,y64,,y65,q,y6w,,y6x,q,y7o,,y7p,q,y8g,,y8h,q,y98,,y99,q,ya0,,ya1,q,yas,,yat,q,ybk,,ybl,q,ycc,,ycd,q,yd4,,yd5,q,ydw,,ydx,q,yeo,,yep,q,yfg,,yfh,q,yg8,,yg9,q,yh0,,yh1,q,yhs,,yht,q,yik,,yil,q,yjc,,yjd,q,yk4,,yk5,q,ykw,,ykx,q,ylo,,ylp,q,ymg,,ymh,q,yn8,,yn9,q,yo0,,yo1,q,yos,,yot,q,ypk,,ypl,q,yqc,,yqd,q,yr4,,yr5,q,yrw,,yrx,q,yso,,ysp,q,ytg,,yth,q,yu8,,yu9,q,yv0,,yv1,q,yvs,,yvt,q,ywk,,ywl,q,yxc,,yxd,q,yy4,,yy5,q,yyw,,yyx,q,yzo,,yzp,q,z0g,,z0h,q,z18,,z19,q,z20,,z21,q,z2s,,z2t,q,z3k,,z3l,q,z4c,,z4d,q,z54,,z55,q,z5w,,z5x,q,z6o,,z6p,q,z7g,,z7h,q,z88,,z89,q,z90,,z91,q,z9s,,z9t,q,zak,,zal,q,zbc,,zbd,q,zc4,,zc5,q,zcw,,zcx,q,zdo,,zdp,q,zeg,,zeh,q,zf8,,zf9,q,zg0,,zg1,q,zgs,,zgt,q,zhk,,zhl,q,zic,,zid,q,zj4,,zj5,q,zjw,,zjx,q,zko,,zkp,q,zlg,,zlh,q,zm8,,zm9,q,zn0,,zn1,q,zns,,znt,q,zok,,zol,q,zpc,,zpd,q,zq4,,zq5,q,zqw,,zqx,q,zro,,zrp,q,zsg,,zsh,q,zt8,,zt9,q,zu0,,zu1,q,zus,,zut,q,zvk,,zvl,q,zwc,,zwd,q,zx4,,zx5,q,zxw,,zxx,q,zyo,,zyp,q,zzg,,zzh,q,1008,,1009,q,1010,,1011,q,101s,,101t,q,102k,,102l,q,103c,,103d,q,1044,,1045,q,104w,,104x,q,105o,,105p,q,106g,,106h,q,1078,,1079,q,1080,,1081,q,108s,,108t,q,109k,,109l,q,10ac,,10ad,q,10b4,,10b5,q,10bw,,10bx,q,10co,,10cp,q,10dg,,10dh,q,10e8,,10e9,q,10f0,,10f1,q,10fs,,10ft,q,10gk,,10gl,q,10hc,,10hd,q,10i4,,10i5,q,10iw,,10ix,q,10jo,,10jp,q,10kg,,10kh,q,10l8,,10l9,q,10m0,,10m1,q,10ms,,10mt,q,10nk,,10nl,q,10oc,,10od,q,10p4,,10p5,q,10pw,,10px,q,10qo,,10qp,q,10rg,,10rh,q,10s8,,10s9,q,10t0,,10t1,q,10ts,,10tt,q,10uk,,10ul,q,10vc,,10vd,q,10w4,,10w5,q,10ww,,10wx,q,10xo,,10xp,q,10yg,,10yh,q,10z8,,10z9,q,1100,,1101,q,110s,,110t,q,111k,,111l,q,112c,,112d,q,1134,,1135,
q,113w,,113x,q,114o,,114p,q,115g,,115h,q,1168,,1169,q,1170,,1171,q,117s,,117t,q,118k,,118l,q,119c,,119d,q,11a4,,11a5,q,11aw,,11ax,q,11bo,,11bp,q,11cg,,11ch,q,11d8,,11d9,q,11e0,,11e1,q,11es,,11et,q,11fk,,11fl,q,11gc,,11gd,q,11h4,,11h5,q,11hw,,11hx,q,11io,,11ip,q,11jg,,11jh,q,11k8,,11k9,q,11l0,,11l1,q,11ls,,11lt,q,11mk,,11ml,q,11nc,,11nd,q,11o4,,11o5,q,11ow,,11ox,q,11po,,11pp,q,11qg,,11qh,q,11r8,,11r9,q,11s0,,11s1,q,11ss,,11st,q,11tk,,11tl,q,11uc,,11ud,q,11v4,,11v5,q,11vw,,11vx,q,11wo,,11wp,q,11xg,,11xh,q,11y8,,11y9,q,11z0,,11z1,q,11zs,,11zt,q,120k,,120l,q,121c,,121d,q,1224,,1225,q,122w,,122x,q,123o,,123p,q,124g,,124h,q,1258,,1259,q,1260,,1261,q,126s,,126t,q,127k,,127l,q,128c,,128d,q,1294,,1295,q,129w,,129x,q,12ao,,12ap,q,12bg,,12bh,q,12c8,,12c9,q,12d0,,12d1,q,12ds,,12dt,q,12ek,,12el,q,12fc,,12fd,q,12g4,,12g5,q,12gw,,12gx,q,12ho,,12hp,q,12ig,,12ih,q,12j8,,12j9,q,12k0,,12k1,q,12ks,,12kt,q,12lk,,12ll,q,12mc,,12md,q,12n4,,12n5,q,12nw,,12nx,q,12oo,,12op,q,12pg,,12ph,q,12q8,,12q9,q,12r0,,12r1,q,12rs,,12rt,q,12sk,,12sl,q,12tc,,12td,q,12u4,,12u5,q,12uw,,12ux,q,12vo,,12vp,q,12wg,,12wh,q,12x8,,12x9,q,12y0,,12y1,q,12ys,,12yt,q,12zk,,12zl,q,130c,,130d,q,1314,,1315,q,131w,,131x,q,132o,,132p,q,133g,,133h,q,1348,,1349,q,1350,,1351,q,135s,,135t,q,136k,,136l,q,137c,,137d,q,1384,,1385,q,138w,,138x,q,139o,,139p,q,13ag,,13ah,q,13b8,,13b9,q,13c0,,13c1,q,13cs,,13ct,q,13dk,,13dl,q,13ec,,13ed,q,13f4,,13f5,q,13fw,,13fx,q,13go,,13gp,q,13hg,,13hh,q,13i8,,13i9,q,13j0,,13j1,q,13js,,13jt,q,13kk,,13kl,q,13lc,,13ld,q,13m4,,13m5,q,13mw,,13mx,q,13no,,13np,q,13og,,13oh,q,13p8,,13p9,q,13q0,,13q1,q,13qs,,13qt,q,13rk,,13rl,q,13sc,,13sd,q,13t4,,13t5,q,13tw,,13tx,q,13uo,,13up,q,13vg,,13vh,q,13w8,,13w9,q,13x0,,13x1,q,13xs,,13xt,q,13yk,,13yl,q,13zc,,13zd,q,1404,,1405,q,140w,,140x,q,141o,,141p,q,142g,,142h,q,1438,,1439,q,1440,,1441,q,144s,,144t,q,145k,,145l,q,146c,,146d,q,1474,,1475,q,147w,,147x,q,148o,,148p,q,149g,,149h,q,14a8,,14a9,q,14b0,,14b1,q,14bs,,14bt,q,14ck,,14cl,q,14dc,,14dd,q,14e4,,14e5,q,14ew,,14e
x,q,14fo,,14fp,q,14gg,,14gh,q,14h8,,14h9,q,14i0,,14i1,q,14is,,14it,q,14jk,,14jl,q,14kc,,14kd,q,14l4,,14l5,q,14lw,,14lx,q,14mo,,14mp,q,14ng,,14nh,q,14o8,,14o9,q,14p0,,14p1,q,14ps,,14pt,q,14qk,,14ql,q,14rc,,14rd,q,14s4,,14s5,q,14sw,,14sx,q,14to,,14tp,q,14ug,,14uh,q,14v8,,14v9,q,14w0,,14w1,q,14ws,,14wt,q,14xk,,14xl,q,14yc,,14yd,q,14z4,,14z5,q,14zw,,14zx,q,150o,,150p,q,151g,,151h,q,1528,,1529,q,1530,,1531,q,153s,,153t,q,154k,,154l,q,155c,,155d,q,1564,,1565,q,156w,,156x,q,157o,,157p,q,158g,,158h,q,1598,,1599,q,15a0,,15a1,q,15as,,15at,q,15bk,,15bl,q,15cc,,15cd,q,15d4,,15d5,q,15dw,,15dx,q,15eo,,15ep,q,15fg,,15fh,q,15g8,,15g9,q,15h0,,15h1,q,15hs,,15ht,q,15ik,,15il,q,15jc,,15jd,q,15k4,,15k5,q,15kw,,15kx,q,15lo,,15lp,q,15mg,,15mh,q,15n8,,15n9,q,15o0,,15o1,q,15os,,15ot,q,15pk,,15pl,q,15qc,,15qd,q,15r4,,15r5,q,15rw,,15rx,q,15so,,15sp,q,15tg,,15th,q,15u8,,15u9,q,15v0,,15v1,q,15vs,,15vt,q,15wk,,15wl,q,15xc,,15xd,q,15y4,,15y5,q,15yw,,15yx,q,15zo,,15zp,q,160g,,160h,q,1618,,1619,q,1620,,1621,q,162s,,162t,q,163k,,163l,q,164c,,164d,q,1654,,1655,q,165w,,165x,q,166o,,166p,q,167g,,167h,q,1688,,1689,q,1690,,1691,q,169s,,169t,q,16ak,,16al,q,16bc,,16bd,q,16c4,,16c5,q,16cw,,16cx,q,16do,,16dp,q,16eg,,16eh,q,16f8,,16f9,q,16g0,,16g1,q,16gs,,16gt,q,16hk,,16hl,q,16ic,,16id,q,16j4,,16j5,q,16jw,,16jx,q,16ko,,16kp,q,16ls,m,16mj,1c,1dlq,,1e68,f,1e74,f,1edb,,1ehq,1,1ek0,b,1eyl,,1f4w,,1f92,4,1gjl,2,1gjp,1,1gjw,3,1gl4,2,1glb,,1gpx,1,1h5w,3,1h7t,4,1hgr,1,1hj0,3,1hl2,a,1hmq,3,1hq8,,1hq9,,1hqa,,1hrs,e,1htc,,1htf,1,1htr,2,1htu,,1hv4,2,1hv7,3,1hvb,1,1hvd,1,1hvh,,1hvm,,1hvx,,1hxc,2,1hyf,4,1hyk,,1hyl,7,1hz9,1,1i0j,,1i0w,1,1i0y,,1i2b,2,1i2e,8,1i2n,,1i2o,,1i2q,1,1i2x,3,1i32,,1i33,,1i5o,2,1i5r,2,1i5u,1,1i5w,3,1i66,,1i69,,1ian,,1iao,2,1iar,7,1ibk,1,1ibm,1,1id7,1,1ida,,1idb,,1idc,,1idd,3,1idj,1,1idn,1,1idp,,1idz,,1iea,1,1iee,6,1ieo,4,1igo,,1igp,1,1igr,5,1igy,,1ih1,,1ih3,2,1ih6,,1ih8,1,1iha,2,1ihd,,1ihe,,1iht,1,1ik5,2,1ik8,7,1ikg,1,1iki,2,1ikl,,1ikm,,1ila,,1ink,,1inl,1,1inn,5,1int,,1inu,,1inv,1,1inx,,1iny,,1inz,1,1i
o1,,1io2,1,1iun,,1iuo,1,1iuq,3,1iuw,3,1iv0,1,1iv2,,1iv3,1,1ivw,1,1iy8,2,1iyb,7,1iyj,1,1iyl,,1iym,,1iyn,1,1j1n,,1j1o,,1j1p,,1j1q,1,1j1s,7,1j4t,,1j4u,,1j4v,,1j4y,3,1j52,,1j53,4,1jcc,2,1jcf,8,1jco,,1jcp,1,1jjk,,1jjl,4,1jjr,1,1jjv,3,1jjz,,1jk0,,1jk1,,1jk2,,1jk3,,1jo1,2,1jo4,3,1joa,1,1joc,3,1jog,,1jok,,1jpd,9,1jqr,5,1jqx,,1jqy,,1jqz,3,1jrb,,1jrl,5,1jrr,1,1jrt,2,1jt0,5,1jt6,c,1jtj,,1jtk,1,1k4v,,1k4w,6,1k54,5,1k5a,,1k5b,,1k7m,l,1k89,,1k8a,6,1k8h,,1k8i,1,1k8k,,1k8l,1,1kc1,5,1kca,,1kcc,1,1kcf,6,1kcm,,1kcn,,1kei,4,1keo,1,1ker,1,1ket,,1keu,,1kev,,1koj,1,1kol,1,1kow,1,1koy,,1koz,,1kqc,1,1kqe,4,1kqm,1,1kqo,2,1kre,,1ovk,f,1ow0,,1ow7,e,1xr2,b,1xre,2,1xrh,2,1zow,4,1zqo,6,206b,,206f,3,20jz,,20k1,1i,20lr,3,20o4,,20og,1,2ftp,1,2fts,3,2jgg,19,2jhs,m,2jxh,4,2jxp,5,2jxv,7,2jy3,7,2jyd,6,2jze,3,2k3m,2,2lmo,1i,2lob,1d,2lpx,,2lqc,,2lqz,4,2lr5,e,2mtc,6,2mtk,g,2mu3,6,2mub,1,2mue,4,2mxb,,2n1s,6,2nce,,2ne4,3,2nsc,3,2nzi,1,2ok0,6,2on8,6,2pz4,73,2q6l,2,2q7j,,2q98,5,2q9q,1,2qa6,,2qa9,9,2qb1,1k,2qcm,p,2qdd,e,2qe2,,2qen,,2qeq,8,2qf0,3,2qfd,c1,2qrf,4,2qrk,8t,2r0m,7d,2r9c,3j,2rg4,b,2rit,16,2rkc,3,2rm0,7,2rmi,5,2rns,7,2rou,29,2rrg,1a,2rss,9,2rt3,c8,2scg,sd,jny8,v,jnz4,2n,jo1s,3j,jo5c,6n,joc0,2rz') -); - -export const grapheme_cats = initLookupTableBuffer( - Array(1454), - /** @type {LookupTableEncoding} */ - ('262122424333333393233393339333333333393393b3b3b3b3b333b33b3bb33333b3b3333333b3b33bb3333b33b3bb33333b3bbb333b333b33333b3b3b3b3333b3b33b3bb39333b33b33b3b3b333b333333b3b333333b33b3b3333b3335dc333333b3b3b33323333b3bb3b33b3b3b3333b3333b3b333bb3b33b3b3b3b3b333b333b3323e2244234444444444444444444444444444444444444444443333443443333333b3b3bb33333b353b3b3b3b333b3b333b333333b3bb3b3b3bbdc333232333333333333333b3b3333bb3b393933b3b33bb3b393b3b3b3333b33b33b3bbb33b333b3333bb3933b3b3b333b3b3b3b3b33b3b3b33b3b3b33b3b33b33b3b3b33bb39b9b3b33b3b33b9333b393b3b33b33b3b3b3333393b3b3b33b39bb3b332333b333dd3b33332333323333333333333333333333344444444a44444434444444444444423232') -); - -const grapheme_lookup = 
initLookupTableBuffer( - Array(128), - /** @type {LookupTableEncoding} */ - (',a,w,2y,4r,5a,5m,6w,79,7s,8j,8o,8r,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,8x,91,ai,cj,el,gl,in,kn,mp,oq,qr,st,ut,wq,wq,wq,wq,wq,wq,wq,wq,wq,wr,ww,wz,wz,x5,xb,z5,10c,118,126,126,126,126,126,126,129,129,129,129,129,129,129,129,129,129,129,12c,12c,12e,12l,12l,12l,12l,12l,12l,12l,12l,12l,12l,12l,12l,12l,12l,12l,12l,12l,12l,12l,12l,12n,12n,12n,12n,12p,12w,12w,132,132,13b,13d,13f,13f,13v,140,148'), - ',' -); - /** - * @param {number} cp - * @return Index of {@link grapheme_ranges} if found, or negation of last visited low cursor. + * @type {GraphemeCategoryRange[]} */ -export function findGraphemeIndex(cp) { - // Perform a quick O(1) lookup in a precomputed table to determine - // the slice of the range table to search in. - let lookup_table = grapheme_lookup; - let lookup_interval = 1024; - - let idx = cp / lookup_interval | 0; - // If the `idx` is outside of the precomputed table - use the slice - // starting from the last covered index in the precomputed table and - // ending with the length of the range table. 
- let sliceFrom = 1448, sliceTo = 1454; - if (idx + 1 < lookup_table.length) { - sliceFrom = lookup_table[idx]; - sliceTo = lookup_table[idx + 1] + 1; - } - - return searchUnicodeRange(cp, grapheme_buffer, sliceFrom * 2, sliceTo * 2); -} +export const grapheme_ranges = decodeUnicodeData( + /** @type {UnicodeDataEncoding} */ + (',9,a,,b,1,d,,e,h,3j,w,4p,,4t,,4u,,lc,33,w3,6,13l,18,14v,,14x,1,150,1,153,,16o,5,174,a,17g,,18r,k,19s,,1cm,6,1ct,,1cv,5,1d3,1,1d6,3,1e7,,1e9,,1f4,q,1ie,a,1kb,8,1kt,,1li,3,1ln,8,1lx,2,1m1,4,1nd,2,1ow,1,1p3,8,1qi,n,1r6,,1r7,v,1s3,,1tm,,1tn,,1to,,1tq,2,1tt,7,1u1,3,1u5,,1u6,1,1u9,6,1uq,1,1vl,,1vm,1,1x8,,1xa,,1xb,1,1xd,3,1xj,1,1xn,1,1xp,,1xz,,1ya,1,1z2,,1z5,1,1z7,,20s,,20u,2,20x,1,213,1,217,2,21d,,228,1,22d,,22p,1,22r,,24c,,24e,2,24h,4,24n,1,24p,,24r,1,24t,,25e,1,262,5,269,,26a,1,27w,,27y,1,280,,281,3,287,1,28b,1,28d,,28l,2,28y,1,29u,,2bi,,2bj,,2bk,,2bl,1,2bq,2,2bu,2,2bx,,2c7,,2dc,,2dd,2,2dg,,2f0,,2f2,2,2f5,3,2fa,2,2fe,3,2fp,1,2g2,1,2gx,,2gy,1,2ik,,2im,,2in,1,2ip,,2iq,,2ir,1,2iu,2,2iy,3,2j9,1,2jm,1,2k3,,2kg,1,2ki,1,2m3,1,2m6,,2m7,1,2m9,3,2me,2,2mi,2,2ml,,2mm,,2mv,,2n6,1,2o1,,2o2,1,2q2,,2q7,,2q8,1,2qa,2,2qe,,2qg,6,2qn,,2r6,1,2sx,,2sz,,2t0,6,2tj,7,2wh,,2wj,,2wk,8,2x4,6,2zc,1,305,,307,,309,,30e,1,31t,d,327,,328,4,32e,1,32l,a,32x,z,346,,371,3,375,,376,5,37d,1,37f,1,37h,1,386,1,388,1,38e,2,38x,3,39e,,39g,,39h,1,39p,,3a5,,3cw,2n,3fk,1z,3hk,2f,3tp,2,4k2,3,4ky,2,4lu,1,4mq,1,4ok,1,4om,,4on,6,4ou,7,4p2,,4p3,1,4p5,a,4pp,,4qz,2,4r2,,4r3,,4ud,1,4vd,,4yo,2,4yr,3,4yv,1,4yx,2,4z4,1,4z6,,4z7,5,4zd,2,55j,1,55l,1,55n,,579,,57a,,57b,,57c,6,57k,,57m,,57p,7,57x,5,583,9,58f,,59s,u,5c0,3,5c4,,5dg,9,5dq,3,5du,2,5ez,8,5fk,1,5fm,,5gh,,5gi,3,5gm,1,5go,5,5ie,,5if,,5ig,1,5ii,2,5il,,5im,,5in,4,5k4,7,5kc,7,5kk,1,5km,1,5ow,2,5p0,c,5pd,,5pe,6,5pp,,5pw,,5pz,,5q0,1,5vk,1r,6bv,,6bw,,6bx,,6by,1,6co,6,6d8,,6dl,,6e8,f,6hc,w,6jm,,6k9,,6ms,5,6nd,1,6xm,1,6y0,,70o,,72n,,73d,a,73s,2,79e,,7fu,1,7g6,,7gg,,7i3,3,7i8,5,7if,b,7is,35,7m8,39,7pk,a,7pw,,7py,,7q5,,7q9,,7qg,,7qr,1,7r8,,7rb,,7rg,,7ri,,7
rn,2,7rr,,7s3,4,7th,2,7tt,,7u8,,7un,,850,1,8hx,2,8ij,1,8k0,,8k5,,8vj,2,8zj,,928,v,9ii,5,9io,,9j1,,9ll,1,9zr,,9zt,,wvj,3,wvo,9,wwu,1,wz4,1,x6q,,x6u,,x6z,,x7n,1,x7p,1,x7r,,x7w,,xa8,1,xbo,f,xc4,1,xcw,h,xdr,,xeu,7,xfr,a,xg2,,xg3,,xgg,s,xhc,2,xhf,,xir,,xis,1,xiu,3,xiy,1,xj0,1,xj2,1,xj4,,xk5,,xm1,5,xm7,1,xm9,1,xmb,1,xmd,1,xmr,,xn0,,xn1,,xoc,,xps,,xpu,2,xpz,1,xq6,1,xq9,,xrf,,xrg,1,xri,1,xrp,,xrq,,xyb,1,xyd,,xye,1,xyg,,xyh,1,xyk,,xyl,,xz4,,xz5,q,xzw,,xzx,q,y0o,,y0p,q,y1g,,y1h,q,y28,,y29,q,y30,,y31,q,y3s,,y3t,q,y4k,,y4l,q,y5c,,y5d,q,y64,,y65,q,y6w,,y6x,q,y7o,,y7p,q,y8g,,y8h,q,y98,,y99,q,ya0,,ya1,q,yas,,yat,q,ybk,,ybl,q,ycc,,ycd,q,yd4,,yd5,q,ydw,,ydx,q,yeo,,yep,q,yfg,,yfh,q,yg8,,yg9,q,yh0,,yh1,q,yhs,,yht,q,yik,,yil,q,yjc,,yjd,q,yk4,,yk5,q,ykw,,ykx,q,ylo,,ylp,q,ymg,,ymh,q,yn8,,yn9,q,yo0,,yo1,q,yos,,yot,q,ypk,,ypl,q,yqc,,yqd,q,yr4,,yr5,q,yrw,,yrx,q,yso,,ysp,q,ytg,,yth,q,yu8,,yu9,q,yv0,,yv1,q,yvs,,yvt,q,ywk,,ywl,q,yxc,,yxd,q,yy4,,yy5,q,yyw,,yyx,q,yzo,,yzp,q,z0g,,z0h,q,z18,,z19,q,z20,,z21,q,z2s,,z2t,q,z3k,,z3l,q,z4c,,z4d,q,z54,,z55,q,z5w,,z5x,q,z6o,,z6p,q,z7g,,z7h,q,z88,,z89,q,z90,,z91,q,z9s,,z9t,q,zak,,zal,q,zbc,,zbd,q,zc4,,zc5,q,zcw,,zcx,q,zdo,,zdp,q,zeg,,zeh,q,zf8,,zf9,q,zg0,,zg1,q,zgs,,zgt,q,zhk,,zhl,q,zic,,zid,q,zj4,,zj5,q,zjw,,zjx,q,zko,,zkp,q,zlg,,zlh,q,zm8,,zm9,q,zn0,,zn1,q,zns,,znt,q,zok,,zol,q,zpc,,zpd,q,zq4,,zq5,q,zqw,,zqx,q,zro,,zrp,q,zsg,,zsh,q,zt8,,zt9,q,zu0,,zu1,q,zus,,zut,q,zvk,,zvl,q,zwc,,zwd,q,zx4,,zx5,q,zxw,,zxx,q,zyo,,zyp,q,zzg,,zzh,q,1008,,1009,q,1010,,1011,q,101s,,101t,q,102k,,102l,q,103c,,103d,q,1044,,1045,q,104w,,104x,q,105o,,105p,q,106g,,106h,q,1078,,1079,q,1080,,1081,q,108s,,108t,q,109k,,109l,q,10ac,,10ad,q,10b4,,10b5,q,10bw,,10bx,q,10co,,10cp,q,10dg,,10dh,q,10e8,,10e9,q,10f0,,10f1,q,10fs,,10ft,q,10gk,,10gl,q,10hc,,10hd,q,10i4,,10i5,q,10iw,,10ix,q,10jo,,10jp,q,10kg,,10kh,q,10l8,,10l9,q,10m0,,10m1,q,10ms,,10mt,q,10nk,,10nl,q,10oc,,10od,q,10p4,,10p5,q,10pw,,10px,q,10qo,,10qp,q,10rg,,10rh,q,10s8,,10s9,q,10t0,,10t1,q,10ts,,10tt,q,10uk,,10ul,q,10vc,,10vd,q,10
w4,,10w5,q,10ww,,10wx,q,10xo,,10xp,q,10yg,,10yh,q,10z8,,10z9,q,1100,,1101,q,110s,,110t,q,111k,,111l,q,112c,,112d,q,1134,,1135,q,113w,,113x,q,114o,,114p,q,115g,,115h,q,1168,,1169,q,1170,,1171,q,117s,,117t,q,118k,,118l,q,119c,,119d,q,11a4,,11a5,q,11aw,,11ax,q,11bo,,11bp,q,11cg,,11ch,q,11d8,,11d9,q,11e0,,11e1,q,11es,,11et,q,11fk,,11fl,q,11gc,,11gd,q,11h4,,11h5,q,11hw,,11hx,q,11io,,11ip,q,11jg,,11jh,q,11k8,,11k9,q,11l0,,11l1,q,11ls,,11lt,q,11mk,,11ml,q,11nc,,11nd,q,11o4,,11o5,q,11ow,,11ox,q,11po,,11pp,q,11qg,,11qh,q,11r8,,11r9,q,11s0,,11s1,q,11ss,,11st,q,11tk,,11tl,q,11uc,,11ud,q,11v4,,11v5,q,11vw,,11vx,q,11wo,,11wp,q,11xg,,11xh,q,11y8,,11y9,q,11z0,,11z1,q,11zs,,11zt,q,120k,,120l,q,121c,,121d,q,1224,,1225,q,122w,,122x,q,123o,,123p,q,124g,,124h,q,1258,,1259,q,1260,,1261,q,126s,,126t,q,127k,,127l,q,128c,,128d,q,1294,,1295,q,129w,,129x,q,12ao,,12ap,q,12bg,,12bh,q,12c8,,12c9,q,12d0,,12d1,q,12ds,,12dt,q,12ek,,12el,q,12fc,,12fd,q,12g4,,12g5,q,12gw,,12gx,q,12ho,,12hp,q,12ig,,12ih,q,12j8,,12j9,q,12k0,,12k1,q,12ks,,12kt,q,12lk,,12ll,q,12mc,,12md,q,12n4,,12n5,q,12nw,,12nx,q,12oo,,12op,q,12pg,,12ph,q,12q8,,12q9,q,12r0,,12r1,q,12rs,,12rt,q,12sk,,12sl,q,12tc,,12td,q,12u4,,12u5,q,12uw,,12ux,q,12vo,,12vp,q,12wg,,12wh,q,12x8,,12x9,q,12y0,,12y1,q,12ys,,12yt,q,12zk,,12zl,q,130c,,130d,q,1314,,1315,q,131w,,131x,q,132o,,132p,q,133g,,133h,q,1348,,1349,q,1350,,1351,q,135s,,135t,q,136k,,136l,q,137c,,137d,q,1384,,1385,q,138w,,138x,q,139o,,139p,q,13ag,,13ah,q,13b8,,13b9,q,13c0,,13c1,q,13cs,,13ct,q,13dk,,13dl,q,13ec,,13ed,q,13f4,,13f5,q,13fw,,13fx,q,13go,,13gp,q,13hg,,13hh,q,13i8,,13i9,q,13j0,,13j1,q,13js,,13jt,q,13kk,,13kl,q,13lc,,13ld,q,13m4,,13m5,q,13mw,,13mx,q,13no,,13np,q,13og,,13oh,q,13p8,,13p9,q,13q0,,13q1,q,13qs,,13qt,q,13rk,,13rl,q,13sc,,13sd,q,13t4,,13t5,q,13tw,,13tx,q,13uo,,13up,q,13vg,,13vh,q,13w8,,13w9,q,13x0,,13x1,q,13xs,,13xt,q,13yk,,13yl,q,13zc,,13zd,q,1404,,1405,q,140w,,140x,q,141o,,141p,q,142g,,142h,q,1438,,1439,q,1440,,1441,q,144s,,144t,q,145k,,145l,q,146c,,146d,q,1474,,1475,q,
147w,,147x,q,148o,,148p,q,149g,,149h,q,14a8,,14a9,q,14b0,,14b1,q,14bs,,14bt,q,14ck,,14cl,q,14dc,,14dd,q,14e4,,14e5,q,14ew,,14ex,q,14fo,,14fp,q,14gg,,14gh,q,14h8,,14h9,q,14i0,,14i1,q,14is,,14it,q,14jk,,14jl,q,14kc,,14kd,q,14l4,,14l5,q,14lw,,14lx,q,14mo,,14mp,q,14ng,,14nh,q,14o8,,14o9,q,14p0,,14p1,q,14ps,,14pt,q,14qk,,14ql,q,14rc,,14rd,q,14s4,,14s5,q,14sw,,14sx,q,14to,,14tp,q,14ug,,14uh,q,14v8,,14v9,q,14w0,,14w1,q,14ws,,14wt,q,14xk,,14xl,q,14yc,,14yd,q,14z4,,14z5,q,14zw,,14zx,q,150o,,150p,q,151g,,151h,q,1528,,1529,q,1530,,1531,q,153s,,153t,q,154k,,154l,q,155c,,155d,q,1564,,1565,q,156w,,156x,q,157o,,157p,q,158g,,158h,q,1598,,1599,q,15a0,,15a1,q,15as,,15at,q,15bk,,15bl,q,15cc,,15cd,q,15d4,,15d5,q,15dw,,15dx,q,15eo,,15ep,q,15fg,,15fh,q,15g8,,15g9,q,15h0,,15h1,q,15hs,,15ht,q,15ik,,15il,q,15jc,,15jd,q,15k4,,15k5,q,15kw,,15kx,q,15lo,,15lp,q,15mg,,15mh,q,15n8,,15n9,q,15o0,,15o1,q,15os,,15ot,q,15pk,,15pl,q,15qc,,15qd,q,15r4,,15r5,q,15rw,,15rx,q,15so,,15sp,q,15tg,,15th,q,15u8,,15u9,q,15v0,,15v1,q,15vs,,15vt,q,15wk,,15wl,q,15xc,,15xd,q,15y4,,15y5,q,15yw,,15yx,q,15zo,,15zp,q,160g,,160h,q,1618,,1619,q,1620,,1621,q,162s,,162t,q,163k,,163l,q,164c,,164d,q,1654,,1655,q,165w,,165x,q,166o,,166p,q,167g,,167h,q,1688,,1689,q,1690,,1691,q,169s,,169t,q,16ak,,16al,q,16bc,,16bd,q,16c4,,16c5,q,16cw,,16cx,q,16do,,16dp,q,16eg,,16eh,q,16f8,,16f9,q,16g0,,16g1,q,16gs,,16gt,q,16hk,,16hl,q,16ic,,16id,q,16j4,,16j5,q,16jw,,16jx,q,16ko,,16kp,q,16ls,m,16mj,1c,1dlq,,1e68,f,1e74,f,1edb,,1ehq,1,1ek0,b,1eyl,,1f4w,,1f92,4,1gjl,2,1gjp,1,1gjw,3,1gl4,2,1glb,,1gpx,1,1h5w,3,1h7t,4,1hgr,1,1hj0,3,1hl2,a,1hmq,3,1hq8,,1hq9,,1hqa,,1hrs,e,1htc,,1htf,1,1htr,2,1htu,,1hv4,2,1hv7,3,1hvb,1,1hvd,1,1hvh,,1hvm,,1hvx,,1hxc,2,1hyf,4,1hyk,,1hyl,7,1hz9,1,1i0j,,1i0w,1,1i0y,,1i2b,2,1i2e,8,1i2n,,1i2o,,1i2q,1,1i2x,3,1i32,,1i33,,1i5o,2,1i5r,2,1i5u,1,1i5w,3,1i66,,1i69,,1ian,,1iao,2,1iar,7,1ibk,1,1ibm,1,1id7,1,1ida,,1idb,,1idc,,1idd,3,1idj,1,1idn,1,1idp,,1idz,,1iea,1,1iee,6,1ieo,4,1igo,,1igp,1,1igr,5,1igy,,1ih1,,1ih3,2,1ih6,,1ih8,1,1iha,2
,1ihd,,1ihe,,1iht,1,1ik5,2,1ik8,7,1ikg,1,1iki,2,1ikl,,1ikm,,1ila,,1ink,,1inl,1,1inn,5,1int,,1inu,,1inv,1,1inx,,1iny,,1inz,1,1io1,,1io2,1,1iun,,1iuo,1,1iuq,3,1iuw,3,1iv0,1,1iv2,,1iv3,1,1ivw,1,1iy8,2,1iyb,7,1iyj,1,1iyl,,1iym,,1iyn,1,1j1n,,1j1o,,1j1p,,1j1q,1,1j1s,7,1j4t,,1j4u,,1j4v,,1j4y,3,1j52,,1j53,4,1jcc,2,1jcf,8,1jco,,1jcp,1,1jjk,,1jjl,4,1jjr,1,1jjv,3,1jjz,,1jk0,,1jk1,,1jk2,,1jk3,,1jo1,2,1jo4,3,1joa,1,1joc,3,1jog,,1jok,,1jpd,9,1jqr,5,1jqx,,1jqy,,1jqz,3,1jrb,,1jrl,5,1jrr,1,1jrt,2,1jt0,5,1jt6,c,1jtj,,1jtk,1,1k4v,,1k4w,6,1k54,5,1k5a,,1k5b,,1k7m,l,1k89,,1k8a,6,1k8h,,1k8i,1,1k8k,,1k8l,1,1kc1,5,1kca,,1kcc,1,1kcf,6,1kcm,,1kcn,,1kei,4,1keo,1,1ker,1,1ket,,1keu,,1kev,,1koj,1,1kol,1,1kow,1,1koy,,1koz,,1kqc,1,1kqe,4,1kqm,1,1kqo,2,1kre,,1ovk,f,1ow0,,1ow7,e,1xr2,b,1xre,2,1xrh,2,1zow,4,1zqo,6,206b,,206f,3,20jz,,20k1,1i,20lr,3,20o4,,20og,1,2ftp,1,2fts,3,2jgg,19,2jhs,m,2jxh,4,2jxp,5,2jxv,7,2jy3,7,2jyd,6,2jze,3,2k3m,2,2lmo,1i,2lob,1d,2lpx,,2lqc,,2lqz,4,2lr5,e,2mtc,6,2mtk,g,2mu3,6,2mub,1,2mue,4,2mxb,,2n1s,6,2nce,,2ne4,3,2nsc,3,2nzi,1,2ok0,6,2on8,6,2pz4,73,2q6l,2,2q7j,,2q98,5,2q9q,1,2qa6,,2qa9,9,2qb1,1k,2qcm,p,2qdd,e,2qe2,,2qen,,2qeq,8,2qf0,3,2qfd,c1,2qrf,4,2qrk,8t,2r0m,7d,2r9c,3j,2rg4,b,2rit,16,2rkc,3,2rm0,7,2rmi,5,2rns,7,2rou,29,2rrg,1a,2rss,9,2rt3,c8,2scg,sd,jny8,v,jnz4,2n,jo1s,3j,jo5c,6n,joc0,2rz'), + '262122424333333393233393339333333333393393b3b3b3b3b333b33b3bb33333b3b3333333b3b33bb3333b33b3bb33333b3bbb333b333b33333b3b3b3b3333b3b33b3bb39333b33b33b3b3b333b333333b3b333333b33b3b3333b3335dc333333b3b3b33323333b3bb3b33b3b3b3333b3333b3b333bb3b33b3b3b3b3b333b333b3323e2244234444444444444444444444444444444444444444443333443443333333b3b3bb33333b353b3b3b3b333b3b333b333333b3bb3b3b3bbdc333232333333333333333b3b3333bb3b393933b3b33bb3b393b3b3b3333b33b33b3bbb33b333b3333bb3933b3b3b333b3b3b3b3b33b3b3b33b3b3b33b3b33b33b3b3b33bb39b9b3b33b3b33b9333b393b3b33b33b3b3b3333393b3b3b33b39bb3b332333b333dd3b33332333323333333333333333333333344444444a44444434444444444444423232', +); diff --git 
a/src/_incb_data.js b/src/_incb_data.js index ad9a8b1..3b6c063 100644 --- a/src/_incb_data.js +++ b/src/_incb_data.js @@ -3,17 +3,19 @@ // // @ts-check -import { initUnicodeRangeBuffer } from './core.js'; +import { decodeUnicodeData } from './core.js'; /** - * @typedef {import('./core.js').UnicodeRangeEncoding} UnicodeRangeEncoding + * @typedef {import('./core.js').UnicodeRange} UnicodeRange + * @typedef {import('./core.js').UnicodeDataEncoding} UnicodeDataEncoding */ /** * The Unicode `Indic_Conjunct_Break=Consonant` derived property table + * + * @type {UnicodeRange[]} */ -export const consonant_buffer = initUnicodeRangeBuffer( - Array(52), - /** @type {UnicodeRangeEncoding} */ +export const consonant_ranges = decodeUnicodeData( + /** @type {UnicodeDataEncoding} */ ('1sl,10,1ug,7,1vc,7,1w5,j,1wq,6,1wy,,1x2,3,1y4,1,1y7,,1yo,1,239,j,23u,6,242,1,245,4,261,,26t,j,27e,6,27m,1,27p,4,28s,1,28v,,29d,,2dx,j,2ei,f,2fs,2,2l1,11') ); diff --git a/src/core.js b/src/core.js index ae8410a..005527b 100644 --- a/src/core.js +++ b/src/core.js @@ -1,48 +1,47 @@ // @ts-check /** - * @typedef {[from: number, to: number]} UnicodeRange - * - * [from..to] code points included - */ - -/** - * @template {number} T + * @template {number} [T=number] * @typedef {[from: number, to: number, category: T]} CategorizedUnicodeRange */ /** - * @typedef {string & { __tag: 'LookupTableEncoding' }} LookupTableEncoding - * - * Base36 encoded {@link LookupTableBuffer} data. It's a sequence of `base36(code point)` with separators. 
- * - * Separator can be omitted if each value is small (=< 36) + * @typedef {CategorizedUnicodeRange<0>} UnicodeRange */ /** - * @typedef {Array & { __tag: 'LookupTableBuffer' }} LookupTableBuffer + * @typedef {string & { __tag: 'UnicodeDataEncoding' }} UnicodeDataEncoding * - * Value lookup table serialized into a TypedArray - */ - -/** - * @typedef {Array & { __tag: 'UnicodeRangeBuffer' }} UnicodeRangeBuffer + * Encoding for array of {@link UnicodeRange}, items separated by comma. * - * {@link UnicodeRange} data serialized into a TypedArray + * Each {@link UnicodeRange} is packed as a pair of base36 integers: * - * It's a dense array like `[from,to,from,to,...]` - * So always has an even length and is quantized into 2-items chunks. + * padding = to - from + * encoding = base36(from) + ',' + base36(padding) * - * The pairs must be sorted in ascending order to allow binary search. + * Notes: + * - base36 can hold surprisingly large numbers in a few characters. + * - The biggest codepoint is 0xE01F0 (918,000) at this point + * - The max value of a category is 23; https://www.unicode.org/reports/tr29/tr29-45.html#Table_Word_Break_Property_Values + * - The longest range is 42,720; CJK UNIFIED IDEOGRAPH-20000..CJK UNIFIED IDEOGRAPH-2A6DF */ /** - * @typedef {string & { __tag: 'UnicodeRangeEncoding' }} UnicodeRangeEncoding - * - * Base36 encoded {@link UnicodeRangeBuffer} data. It's a sequence of `base36(code point),base36(padding)` - * - * Value `0` is represented as empty strings + * @template {number} [T=number] + * @param {UnicodeDataEncoding} data + * @param {string} [cats=''] + * @returns {Array<CategorizedUnicodeRange<T>>} */ +export function decodeUnicodeData(data, cats = '') { + let buf = /** @type {Array<CategorizedUnicodeRange<T>>} */([]) + , nums = data.split(',').map(s => s ? parseInt(s, 36) : 0) + , n = 0; + for (let i = 0; i < nums.length; i++) + i % 2 + ? buf.push([n, n + nums[i], /** @type {T} */ (cats ? 
parseInt(cats[i >> 1], 36) : 0)]) + : n = nums[i]; + return buf; +} /** * @template {object} Ext @@ -59,53 +58,22 @@ */ /** - * @template {number} T - * @param {T} x - * @param {UnicodeRangeBuffer} buffer - * @param {number} [sliceFrom] - * @param {number} [sliceTo] - * @return {number} index of including range, or -(low+1) if there isn't + * @template {number} [T=number] + * @param {number} cp + * @param {CategorizedUnicodeRange<T>[]} ranges + * @return {number} index of matched unicode range, or -1 if no match */ -export function searchUnicodeRange(x, buffer, sliceFrom = 0, sliceTo = buffer.length) { - let lo = sliceFrom; - let hi = sliceTo - 2; - +export function findUnicodeRangeIndex(cp, ranges) { + let lo = 0 + , hi = ranges.length - 1; while (lo <= hi) { - let mid = lo + hi >> 1 & ~1; - let l = buffer[mid], h = buffer[mid + 1]; - if (l <= x && x <= h) { - return mid; - } else if (h < x) { - lo = mid + 2; - } else { - hi = mid - 2; - } + let mid = lo + hi >> 1 + , range = ranges[mid] + , l = range[0] + , h = range[1]; + if (l <= cp && cp <= h) return mid; + else if (cp > h) lo = mid + 1; + else hi = mid - 1; } - - return -lo - 1; + return -1; } - -/** - * @param {Array} buffer - * @param {LookupTableEncoding} value - * @param {'' | ','} [sep = ''] - * @return {LookupTableBuffer} - */ -export function initLookupTableBuffer(buffer, value, sep = '') { - let nums = value.split(sep).map(s => s ? parseInt(s, 36) : 0); - for (let i = 0; i < nums.length; i++) - buffer[i] = nums[i]; - return /** @type {LookupTableBuffer} */ (buffer); -}; - -/** - * @param {Array} buffer - * @param {UnicodeRangeEncoding} value - * @return {UnicodeRangeBuffer} - */ -export function initUnicodeRangeBuffer(buffer, value) { - let nums = value.split(',').map(s => s ? parseInt(s, 36) : 0); - for (let i = 0, n = 0; i < nums.length; i++) - buffer[i] = i % 2 ? 
n + nums[i] : (n = nums[i]); - return /** @type {UnicodeRangeBuffer} */ (buffer); -}; diff --git a/src/emoji.js b/src/emoji.js index 490ce9d..8c6feec 100644 --- a/src/emoji.js +++ b/src/emoji.js @@ -1,9 +1,9 @@ // @ts-check -import { searchUnicodeRange } from './core.js'; +import { findUnicodeRangeIndex } from './core.js'; import { - emoji_presentation_buffer, - extended_pictographic_buffer, + emoji_presentation_ranges, + extended_pictographic_ranges, } from './_emoji_data.js'; /** @@ -25,7 +25,7 @@ export function isEmoji(cp) { * @return boolean */ export function isEmojiPresentation(cp) { - return searchUnicodeRange(cp, emoji_presentation_buffer) >= 0; + return findUnicodeRangeIndex(cp, emoji_presentation_ranges) >= 0; } /** @@ -35,5 +35,5 @@ export function isEmojiPresentation(cp) { * @return boolean */ export function isExtendedPictographic(cp) { - return searchUnicodeRange(cp, extended_pictographic_buffer) >= 0; + return findUnicodeRangeIndex(cp, extended_pictographic_ranges) >= 0; } diff --git a/src/general.js b/src/general.js index eeeafce..1e995a1 100644 --- a/src/general.js +++ b/src/general.js @@ -1,10 +1,10 @@ // @ts-check -import { searchUnicodeRange } from './core.js'; +import { findUnicodeRangeIndex } from './core.js'; import { - letter_buffer, - alphabetic_buffer, - numeric_buffer, + letter_ranges, + alphabetic_ranges, + numeric_ranges, } from './_general_data.js'; /** @@ -14,7 +14,7 @@ import { * @return boolean */ export function isLetter(cp) { - return searchUnicodeRange(cp, letter_buffer) >= 0; + return findUnicodeRangeIndex(cp, letter_ranges) >= 0; } /** @@ -24,7 +24,7 @@ export function isLetter(cp) { * @return boolean */ export function isAlphabetic(cp) { - return searchUnicodeRange(cp, alphabetic_buffer) >= 0; + return findUnicodeRangeIndex(cp, alphabetic_ranges) >= 0; } /** @@ -34,7 +34,7 @@ export function isAlphabetic(cp) { * @return boolean true if */ export function isNumeric(cp) { - return searchUnicodeRange(cp, numeric_buffer) >= 0; + 
return findUnicodeRangeIndex(cp, numeric_ranges) >= 0; } /** diff --git a/src/grapheme.js b/src/grapheme.js index 6c38f0d..375e55a 100644 --- a/src/grapheme.js +++ b/src/grapheme.js @@ -13,17 +13,10 @@ // @ts-check -import { searchUnicodeRange } from './core.js'; +import { findUnicodeRangeIndex } from './core.js'; import { isBMP } from './utils.js'; -import { - GraphemeCategory, - findGraphemeIndex, - grapheme_buffer, - grapheme_cats, -} from './_grapheme_data.js'; -import { - consonant_buffer, -} from './_incb_data.js'; +import { GraphemeCategory, grapheme_ranges } from './_grapheme_data.js'; +import { consonant_ranges } from './_incb_data.js'; /** * @typedef {import('./_grapheme_data.js').GC_Any} GC_Any @@ -52,15 +45,11 @@ export { * @return A {@link GraphemeCategoryRange} value if found, or garbage value with {@link GC_Any} category. */ export function searchGraphemeCategory(cp) { - let index = findGraphemeIndex(cp); + let index = findUnicodeRangeIndex(cp, grapheme_ranges); if (index < 0) { return [0, 0, 0 /* GC_Any */]; } - return [ - grapheme_buffer[index], - grapheme_buffer[index + 1], - grapheme_cats[index >> 1], - ]; + return grapheme_ranges[index]; } /** @@ -259,15 +248,14 @@ function cat(cp, cache) { // If this char isn't within the cached range, update the cache to the // range that includes it. 
if (cp < cache[0] || cp > cache[1]) { - let index = findGraphemeIndex(cp); - + let index = findUnicodeRangeIndex(cp, grapheme_ranges); if (index < 0) { return 0; } - - cache[0] = grapheme_buffer[index]; - cache[1] = grapheme_buffer[index + 1]; - cache[2] = /** @type {GraphemeCategoryNum} */ (grapheme_cats[index >> 1]); + let range = grapheme_ranges[index]; + cache[0] = range[0]; + cache[1] = range[1]; + cache[2] = range[2]; } return cache[2]; } @@ -278,7 +266,7 @@ function cat(cp, cache) { * @return {boolean} */ function isIndicConjunctCosonant(cp) { - return searchUnicodeRange(cp, consonant_buffer) >= 0; + return findUnicodeRangeIndex(cp, consonant_ranges) >= 0; } /**