Skip to content

Commit aef0db1

Browse files
authored
fix: Improve speed of dictionary lookup (#8193)
1 parent 7743132 commit aef0db1

24 files changed

+504
-219
lines changed

packages/cspell-trie-lib/api/api.d.ts

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/cspell-trie-lib/package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@
3636
"test:watch": "vitest",
3737
"test:perf": "pnpm test:perf:ts --all",
3838
"test:perf:ts": "insight --file \"**/*.perf.{mts,ts}\" -t 500",
39-
"test:perf:prof": "NODE_ENV=production node --cpu-prof ../../node_modules/perf-insight/bin.mjs -t 1000",
39+
"test:perf:prof": "NODE_ENV=production node --cpu-prof --cpu-prof-interval=100 ../../node_modules/perf-insight/bin.mjs --file \"**/*.perf.{mts,ts}\" -t 5000",
4040
"perf": "pnpm test:perf",
4141
"test": "vitest run",
4242
"test:update-snapshot": "vitest run -u",

packages/cspell-trie-lib/perf/Utf8.perf.ts

Lines changed: 94 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,12 +8,14 @@ import {
88
decodeUtf8N_LE,
99
encodeCodePointsToUtf8Into,
1010
encodeTextToUtf8,
11+
encodeTextToUtf8_32,
12+
encodeTextToUtf8_32Into,
1113
encodeTextToUtf8Into,
1214
encodeUtf8N_BE,
1315
encodeUtf8N_LE,
1416
textToCodePoints,
1517
} from '../src/lib/TrieBlob/Utf8.ts';
16-
import { Utf8Encoder } from '../src/lib/TrieBlob/Utf8Encoder.ts';
18+
import { Utf8Encoder, Utf8Encoder2 } from '../src/lib/TrieBlob/Utf8Encoder.ts';
1719

1820
const iterations = 1000;
1921
const text = sampleText();
@@ -23,6 +25,7 @@ suite('Utf8 encode', async (test) => {
2325
const encoder = new TextEncoder();
2426
const scratchBuffer = new Uint8Array(1024);
2527
const utf8Encoder = new Utf8Encoder();
28+
const utf8Encoder2 = new Utf8Encoder2(1024);
2629

2730
test(`TextEncoder.encodeInto words (${words.length})`, () => {
2831
const buffer = scratchBuffer;
@@ -124,6 +127,57 @@ suite('Utf8 encode', async (test) => {
124127
}
125128
});
126129

130+
test(`utf8Encoder2(word) to array words (${words.length})`, () => {
131+
const _words = words;
132+
for (let i = iterations; i > 0; --i) {
133+
for (const word of _words) {
134+
utf8Encoder2.encode(word);
135+
}
136+
}
137+
});
138+
139+
test(`toUtf8Array(word) to array words (${words.length})`, () => {
140+
const _words = words;
141+
for (let i = iterations; i > 0; --i) {
142+
for (const word of _words) {
143+
toUtf8Array(word);
144+
}
145+
}
146+
});
147+
148+
test(`toCodePoints(word) to array words (${words.length})`, () => {
149+
const _words = words;
150+
for (let i = iterations; i > 0; --i) {
151+
for (const word of _words) {
152+
toCodePoints(word);
153+
}
154+
}
155+
});
156+
157+
test(`encodeTextToUtf8PointsInto(word) to array words (${words.length})`, () => {
158+
const _words = words;
159+
const buffer: number[] = new Array(100);
160+
for (let i = iterations; i > 0; --i) {
161+
for (const word of _words) {
162+
encodeTextToUtf8_32Into(word, buffer);
163+
}
164+
}
165+
});
166+
167+
test(`encodeTextToUtf8_32(word) to array words (${words.length})`, () => {
168+
const _words = words;
169+
const buffer: number[] = new Array(100);
170+
for (let i = iterations; i > 0; --i) {
171+
for (const word of _words) {
172+
const len = word.length;
173+
let j = 0;
174+
for (let p = { text: word, offset: 0 }; p.offset < len; ) {
175+
buffer[j++] = encodeTextToUtf8_32(p);
176+
}
177+
}
178+
}
179+
});
180+
127181
test(`encoder.encode(word) to array words (${words.length})`, () => {
128182
const _words = words;
129183
for (let i = iterations; i > 0; --i) {
@@ -361,3 +415,42 @@ function sampleText() {
361415
`;
362416
// cspell:enable
363417
}
418+
419+
const textEncoder = new TextEncoder();
420+
const charMap: Record<string, number> = Object.create(null);
421+
422+
function encodeChar(char: string): number {
423+
const bytes = textEncoder.encode(char);
424+
let code = 0;
425+
for (let i = bytes.length - 1; i >= 0; i--) {
426+
code = (code << 8) | bytes[i];
427+
}
428+
return code;
429+
}
430+
431+
function toUtf8Array(text: string): number[] {
432+
const src: string[] = [...text];
433+
const dst: number[] = src as unknown as number[];
434+
435+
for (let i = 0; i < src.length; i++) {
436+
const char = src[i];
437+
let code = charMap[char];
438+
if (code === undefined) {
439+
code = encodeChar(char);
440+
charMap[char] = code;
441+
}
442+
dst[i] = code;
443+
}
444+
return dst;
445+
}
446+
447+
function toCodePoints(text: string): number[] {
448+
const src: string[] = [...text];
449+
const dst: number[] = src as unknown as number[];
450+
451+
for (let i = 0; i < src.length; i++) {
452+
const char = src[i];
453+
dst[i] = char.codePointAt(0) || 0;
454+
}
455+
return dst;
456+
}

packages/cspell-trie-lib/perf/charIndex.perf.ts

Lines changed: 2 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import { suite } from 'perf-insight';
22

3+
import { CharIndex } from '../src/lib/TrieBlob/CharIndex.ts';
34
import { encodeTextToUtf8 } from '../src/lib/TrieBlob/Utf8.ts';
45
import { readFastTrieBlobFromConfig, readTrieFromConfig } from '../src/test/dictionaries.test.helper.ts';
56

@@ -13,8 +14,7 @@ suite('encode to sequence', async (test) => {
1314
const words = await getWords();
1415
const msgSuffix = ' - ' + words.length + ' words';
1516
const fastTrieBlob = await getFastTrieBlob();
16-
const trieBlob = fastTrieBlob.toTrieBlob();
17-
const charIndex = trieBlob.charIndex;
17+
const charIndex = CharIndex.fromIterable(words);
1818
const encoder = new TextEncoder();
1919

2020
test('fastTrieBlob.wordToNodeCharIndexSequence' + msgSuffix, () => {
@@ -23,20 +23,6 @@ suite('encode to sequence', async (test) => {
2323
}
2424
});
2525

26-
test('trieBlob.wordToNodeCharIndexSequence' + msgSuffix, () => {
27-
for (const word of words) {
28-
trieBlob.wordToUtf8Seq(word);
29-
}
30-
});
31-
32-
test('trieBlob.wordToNodeCharIndexSequence x4' + msgSuffix, () => {
33-
for (const word of words) {
34-
for (let i = 0; i < 4; ++i) {
35-
trieBlob.wordToUtf8Seq(word);
36-
}
37-
}
38-
});
39-
4026
test('charIndex.wordToCharIndexSequence' + msgSuffix, () => {
4127
for (const word of words) {
4228
charIndex.wordToUtf8Seq(word);

packages/cspell-trie-lib/perf/has.perf.ts

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@ suite('trie has', async (test) => {
1919
const iTrieFast = new ITrieImpl(fastTrieBlob);
2020
const iTrieBlob = new ITrieImpl(trieBlob);
2121
const setOfWords = new Set(words);
22+
console.log(`Number of words: ${words.length}`);
2223

2324
test('set has words', () => {
2425
trieHasWords(setOfWords, words);
@@ -54,7 +55,7 @@ function _getFastTrieBlob() {
5455
}
5556

5657
function trieHasWords(trie: { has: (word: string) => boolean }, words: string[]): boolean {
57-
const has = (word: string) => trie.has(word);
58+
const has = trie.has.bind(trie);
5859
const len = words.length;
5960
let success = true;
6061
for (let i = 0; i < len; ++i) {

0 commit comments

Comments
 (0)