Skip to content

Commit 90ae3e2

Browse files
Jason3S and Copilot authored
fix: Support string prefixes when walking nodes (#8259)
Signed-off-by: Jason Dent <[email protected]>
Co-authored-by: Copilot <[email protected]>
1 parent 40ec03f commit 90ae3e2

24 files changed

+684
-245
lines changed
10 Bytes
Binary file not shown.

packages/cspell-trie-lib/perf/Utf8.perf.ts

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,6 @@ import {
88
decodeUtf8ByteStream,
99
encodeCodePointsToUtf8Into,
1010
encodeTextToUtf8,
11-
encodeTextToUtf8_32,
12-
encodeTextToUtf8_32Into,
1311
encodeTextToUtf8Into,
1412
encodeToUtf8_32,
1513
encodeToUtf8_32Rev,
@@ -154,30 +152,6 @@ suite('Utf8 encode', async (test) => {
154152
}
155153
});
156154

157-
test(`encodeTextToUtf8PointsInto(word) to array words (${words.length})`, () => {
158-
const _words = words;
159-
const buffer: number[] = new Array(100);
160-
for (let i = iterations; i > 0; --i) {
161-
for (const word of _words) {
162-
encodeTextToUtf8_32Into(word, buffer);
163-
}
164-
}
165-
});
166-
167-
test(`encodeTextToUtf8_32(word) to array words (${words.length})`, () => {
168-
const _words = words;
169-
const buffer: number[] = new Array(100);
170-
for (let i = iterations; i > 0; --i) {
171-
for (const word of _words) {
172-
const len = word.length;
173-
let j = 0;
174-
for (let p = { text: word, offset: 0 }; p.offset < len; ) {
175-
buffer[j++] = encodeTextToUtf8_32(p);
176-
}
177-
}
178-
}
179-
});
180-
181155
test(`encoder.encode(word) to array words (${words.length})`, () => {
182156
const _words = words;
183157
for (let i = iterations; i > 0; --i) {

packages/cspell-trie-lib/src/lib/StringTable/StringTable.ts

Lines changed: 45 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -57,12 +57,13 @@ export class StringTable {
5757
return this.#strLenBits;
5858
}
5959

60-
getStringBytes(idx: number): Uint8Array | undefined {
60+
get length(): number {
61+
return this.#index.length;
62+
}
63+
64+
getStringBytes(idx: number): U8Array | undefined {
6165
if (idx < 0 || idx >= this.#index.length) return undefined;
62-
const value = this.#index[idx];
63-
const offset = value >>> this.#strLenBits;
64-
const length = value & this.#strLenMask;
65-
return this.#data.subarray(offset, offset + length);
66+
return this.#getBytesByIndexValue(this.#index[idx]);
6667
}
6768

6869
getString(idx: number): string | undefined {
@@ -71,6 +72,16 @@ export class StringTable {
7172
return this.#decoder.decode(bytes);
7273
}
7374

75+
#getBytesByIndexValue(value: number): U8Array {
76+
const offset = value >>> this.#strLenBits;
77+
const length = value & this.#strLenMask;
78+
return this.#data.subarray(offset, offset + length);
79+
}
80+
81+
values(): U8Array[] {
82+
return [...this.#index].map((v) => this.#getBytesByIndexValue(v));
83+
}
84+
7485
toString(): string {
7586
return [...this.#index].map((_, i) => this.getString(i) || '').join(', ');
7687
}
@@ -85,7 +96,7 @@ export class StringTable {
8596
}
8697

8798
export class StringTableBuilder {
88-
#buffers: (number[] | Uint8Array)[] = [];
99+
#data: (number[] | Uint8Array)[] = [];
89100
#encoder = new TextEncoder();
90101
#lookupTrie = new GTrie<number, number>();
91102
#locked = false;
@@ -97,7 +108,7 @@ export class StringTableBuilder {
97108
if (found !== undefined) {
98109
return found;
99110
}
100-
const idx = this.#buffers.push(bytes) - 1;
111+
const idx = this.#data.push(bytes) - 1;
101112
this.#lookupTrie.insert(bytes, idx);
102113
this.#maxStrLen = Math.max(this.#maxStrLen, bytes.length);
103114
return idx;
@@ -108,20 +119,28 @@ export class StringTableBuilder {
108119
return this.addStringBytes(bytes);
109120
}
110121

122+
getEntry(idx: number): number[] | Uint8Array | undefined {
123+
return this.#data[idx];
124+
}
125+
126+
get length(): number {
127+
return this.#data.length;
128+
}
129+
111130
build(): StringTable {
112131
this.#locked = true;
113132

114-
if (!this.#buffers.length) {
133+
if (!this.#data.length) {
115134
return new StringTable([], new Uint8Array(0), 8);
116135
}
117136

118137
// sorted by size descending
119-
const sortedBySize = this.#buffers.map((b, i) => ({ b, i })).sort((a, b) => b.b.length - a.b.length);
138+
const sortedBySize = this.#data.map((b, i) => ({ b, i })).sort((a, b) => b.b.length - a.b.length);
120139
const byteValues: number[] = [];
121140

122141
const strLenBits = Math.ceil(Math.log2(this.#maxStrLen + 1));
123142
const strLenMask = (1 << strLenBits) - 1;
124-
const index: number[] = new Array(this.#buffers.length);
143+
const index: number[] = new Array(this.#data.length);
125144

126145
for (const { b, i } of sortedBySize) {
127146
let offset = findValues(b);
@@ -162,6 +181,17 @@ export class StringTableBuilder {
162181
return offset;
163182
}
164183
}
184+
185+
static fromStringTable(table: StringTable): StringTableBuilder {
186+
const builder = new StringTableBuilder();
187+
const values = table.values();
188+
const len = values.length;
189+
for (let i = 0; i < len; ++i) {
190+
builder.addStringBytes(values[i]);
191+
}
192+
193+
return builder;
194+
}
165195
}
166196

167197
function getStringTableBinaryFormat(): BinaryFormat {
@@ -176,7 +206,7 @@ function getStringTableBinaryFormat(): BinaryFormat {
176206
.build();
177207
}
178208

179-
export function encodeStringTableToBinary(table: StringTable, endian: 'LE' | 'BE'): U8Array {
209+
export function encodeStringTableToBinary(table: StringTable, endian?: 'LE' | 'BE'): U8Array {
180210
const strLenBits = table.strLenBits;
181211
const offsetBits = Math.ceil(Math.log2(table.charData.length + 1));
182212
const minIndexBits = strLenBits + offsetBits;
@@ -198,7 +228,10 @@ export function encodeStringTableToBinary(table: StringTable, endian: 'LE' | 'BE
198228
return builder.build();
199229
}
200230

201-
export function decodeStringTableFromBinary(data: U8Array, endian: 'LE' | 'BE'): StringTable {
231+
export function decodeStringTableFromBinary(data: U8Array, endian?: 'LE' | 'BE'): StringTable {
232+
if (!data?.length) {
233+
return new StringTable([], new Uint8Array(0), 8);
234+
}
202235
const reader = new BinaryDataReader(data, getStringTableBinaryFormat(), endian);
203236
const indexBits = reader.getUint8('indexBits');
204237
const strLenBits = reader.getUint8('strLenBits');

packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.test.ts

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,9 @@ describe('optimization', async () => {
111111
test('English Dict', () => {
112112
const trie = trieEn;
113113
const ft = FastTrieBlobBuilder.fromTrieRoot(trie.root, false);
114-
const ft2 = FastTrieBlobBuilder.fromTrieRoot(trie.root, true);
115-
expect(ft2.size).toBeLessThanOrEqual(ft.size);
116-
expect([...ft.words()]).toEqual([...ft2.words()]);
114+
const tb = FastTrieBlobBuilder.fromTrieRoot(trie.root, true);
115+
expect(tb.size).toBeLessThanOrEqual(ft.size);
116+
expect([...tb.words()]).toEqual([...ft.words()]);
117117
});
118118
});
119119

@@ -129,15 +129,10 @@ describe('Using String Tables', async () => {
129129
const trie = trieEn;
130130
const ft = FastTrieBlobBuilder.fromTrieRoot(trie.root, false);
131131
const ft2 = FastTrieBlobBuilder.fromTrieRoot(trie.root, true);
132-
console.log(`English Dict: Original Size: ${ft.size}, Optimized Size: ${ft2.size}`);
133-
134-
const stringTable = ft2.testExtractStringTable();
135-
// console.log(`String Table Size: ${stringTable.charData.length} bytes for ${stringTable.index.length} strings.`);
136-
// console.log('%s', hexDump(stringTable.charData));
137-
expect(stringTable.getString(0)).toBeDefined();
132+
// console.log(`English Dict: Original Size: ${ft.size}, Optimized Size: ${ft2.size}`);
138133

139134
expect(ft2.size).toBeLessThan(ft.size);
140-
expect([...ft.words()]).toEqual([...ft2.words()]);
135+
expect([...ft2.words()]).toEqual([...ft.words()]);
141136
});
142137
});
143138

packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.ts

Lines changed: 39 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,15 @@ import { CharIndex } from './CharIndex.ts';
88
import type { FastTrieBlobInternals } from './FastTrieBlobInternals.ts';
99
import { assertSorted, FastTrieBlobInternalsAndMethods, sortNodes } from './FastTrieBlobInternals.ts';
1010
import { FastTrieBlobIRoot } from './FastTrieBlobIRoot.ts';
11-
import { extractStringTable } from './optimizeNodes.ts';
1211
import { TrieBlob } from './TrieBlob.ts';
13-
import { NodeChildIndexRefShift, NodeHeaderEOWMask, NodeMaskCharByte, type TrieBlobNode32 } from './TrieBlobFormat.ts';
12+
import {
13+
NodeChildIndexRefShift,
14+
NodeHeaderEOWMask,
15+
NodeHeaderNumChildrenMask,
16+
NodeHeaderPrefixShift,
17+
NodeMaskCharByte,
18+
type TrieBlobNode32,
19+
} from './TrieBlobFormat.ts';
1420
import { Utf8Accumulator } from './Utf8.ts';
1521

1622
type FastTrieBlobNode = TrieBlobNode32;
@@ -30,11 +36,13 @@ export class FastTrieBlob implements TrieData {
3036
readonly hasNonStrictWords: boolean;
3137
readonly hasPreferredSuggestions: boolean;
3238
#nodes: FastTrieBlobNode[];
39+
#stringTable: StringTable;
3340
#charIndex: CharIndex;
3441
readonly info: Readonly<TrieInfo>;
3542

36-
private constructor(nodes: FastTrieBlobNode[], info: Readonly<TrieInfo>) {
43+
private constructor(nodes: FastTrieBlobNode[], stringTable: StringTable, info: Readonly<TrieInfo>) {
3744
this.#nodes = nodes;
45+
this.#stringTable = stringTable;
3846
this.#charIndex = new CharIndex();
3947
this.info = info;
4048
this.wordToCharacters = (word: string) => [...word];
@@ -141,18 +149,23 @@ export class FastTrieBlob implements TrieData {
141149
nodeIdx: number;
142150
pos: number;
143151
word: string;
144-
accumulator: Utf8Accumulator;
152+
acc: Utf8Accumulator;
145153
}
146154
const nodeMaskChildCharIndex = NodeMaskCharByte;
147155
const nodeChildRefShift = NodeChildIndexRefShift;
148156
const NodeMaskEOW = NodeHeaderEOWMask;
157+
const pfxShift = NodeHeaderPrefixShift;
149158
const nodes = this.#nodes;
150-
const accumulator = Utf8Accumulator.create();
151-
const stack: StackItem[] = [{ nodeIdx: rootIdx, pos: 0, word: '', accumulator }];
159+
const st = this.#stringTable;
160+
const stack: StackItem[] = [{ nodeIdx: rootIdx, pos: 0, word: '', acc: Utf8Accumulator.create() }];
152161
let depth = 0;
153162

154163
while (depth >= 0) {
155-
const { nodeIdx, pos, word, accumulator } = stack[depth];
164+
const s = stack[depth];
165+
if (!s.pos) {
166+
applyPrefixString(s);
167+
}
168+
const { nodeIdx, pos, word, acc } = s;
156169
const node = nodes[nodeIdx];
157170

158171
if (!pos && node[0] & NodeMaskEOW) {
@@ -165,17 +178,28 @@ export class FastTrieBlob implements TrieData {
165178
const nextPos = ++stack[depth].pos;
166179
const entry = node[nextPos];
167180
const charIdx = entry & nodeMaskChildCharIndex;
168-
const acc = accumulator.clone();
169-
const codePoint = acc.decode(charIdx);
181+
const nAcc = acc.clone();
182+
const codePoint = nAcc.decode(charIdx);
170183
const letter = (codePoint && String.fromCodePoint(codePoint)) || '';
171184
++depth;
172185
stack[depth] = {
173186
nodeIdx: entry >>> nodeChildRefShift,
174187
pos: 0,
175188
word: word + letter,
176-
accumulator: acc,
189+
acc: nAcc,
177190
};
178191
}
192+
193+
function applyPrefixString(s: StackItem): void {
194+
const prefixIdx = nodes[s.nodeIdx][0] >>> pfxShift;
195+
const pfx = prefixIdx ? st.getStringBytes(prefixIdx) : undefined;
196+
if (!pfx) return;
197+
s.word += s.acc.decodeBytesToString(pfx);
198+
}
199+
}
200+
201+
get stringTable(): StringTable {
202+
return this.#stringTable;
179203
}
180204

181205
toTrieBlob(): TrieBlob {
@@ -203,7 +227,7 @@ export class FastTrieBlob implements TrieData {
203227
for (let i = 0; i < nodes.length; ++i) {
204228
const node = nodes[i];
205229
// assert(offset === nodeToIndex[i]);
206-
binNodes[offset++] = ((node.length - 1) << lenShift) | node[0];
230+
binNodes[offset++] = ((node.length - 1) << lenShift) | (node[0] & ~NodeHeaderNumChildrenMask);
207231
for (let j = 1; j < node.length; ++j) {
208232
const v = node[j];
209233
const nodeRef = v >>> nodeChildRefShift;
@@ -212,7 +236,7 @@ export class FastTrieBlob implements TrieData {
212236
}
213237
}
214238

215-
return new TrieBlob(binNodes, this.info);
239+
return new TrieBlob(binNodes, this.#stringTable, this.info);
216240
}
217241

218242
isReadonly(): boolean {
@@ -240,12 +264,12 @@ export class FastTrieBlob implements TrieData {
240264
}
241265

242266
static create(data: FastTrieBlobInternals): FastTrieBlob {
243-
return new FastTrieBlob(data.nodes, data.info);
267+
return new FastTrieBlob(data.nodes, data.stringTable, data.info);
244268
}
245269

246270
static toITrieNodeRoot(trie: FastTrieBlob): ITrieNodeRoot {
247271
return new FastTrieBlobIRoot(
248-
new FastTrieBlobInternalsAndMethods(trie.#nodes, trie.info, {
272+
new FastTrieBlobInternalsAndMethods(trie.#nodes, trie.#stringTable, trie.info, {
249273
nodeFindNode: (idx: number, word: string) => trie.#lookupNode(idx, trie.wordToUtf8Seq(word)),
250274
nodeFindExact: (idx: number, word: string) => trie.#has(idx, word),
251275
nodeGetChild: (idx: number, letter: string) => trie.#searchNodeForChar(idx, letter),
@@ -345,16 +369,12 @@ export class FastTrieBlob implements TrieData {
345369
node[j] = (idx << TrieBlob.NodeChildRefShift) | charIndex;
346370
}
347371
}
348-
return new FastTrieBlob(sortNodes(nodes, TrieBlob.NodeMaskChildCharIndex), trie.info);
372+
return new FastTrieBlob(sortNodes(nodes, TrieBlob.NodeMaskChildCharIndex), trie.stringTable, trie.info);
349373
}
350374

351375
static isFastTrieBlob(obj: unknown): obj is FastTrieBlob {
352376
return obj instanceof FastTrieBlob;
353377
}
354-
355-
testExtractStringTable(): StringTable {
356-
return extractStringTable(this.#nodes);
357-
}
358378
}
359379

360380
interface TrieBlobNodeInfo {

packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobBuilder.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ describe('FastTrieBlobBuilder', () => {
159159
const words = sampleWords();
160160
const t = FastTrieBlobBuilder.fromTrieRoot(buildTrie(words, false), true);
161161
const sortedUnique = [...new Set(words)].sort();
162-
expect([...t.words()].sort()).toEqual(sortedUnique);
162+
expect([...t.words()]).toEqual(sortedUnique);
163163
});
164164

165165
test('fromTrieRoot optimized trie', () => {

packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlobBuilder.ts

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
import type { BuilderCursor, TrieBuilder } from '../Builder/index.ts';
22
import type { PartialTrieInfo, TrieCharacteristics, TrieInfo } from '../ITrieNode/TrieInfo.ts';
33
import { TrieInfoBuilder } from '../ITrieNode/TrieInfo.ts';
4+
import { StringTableBuilder } from '../StringTable/StringTable.ts';
45
import type { TrieNode, TrieRoot } from '../TrieNode/TrieNode.ts';
56
import { assert } from '../utils/assert.ts';
67
import { assertValidUtf16Character } from '../utils/text.ts';
78
import { CharIndexBuilder } from './CharIndex.ts';
89
import type { NodeToJSON } from './FastTrieBlob.ts';
910
import { FastTrieBlob, nodesToJSON } from './FastTrieBlob.ts';
1011
import { FastTrieBlobInternals, sortNodes } from './FastTrieBlobInternals.ts';
11-
import { optimizeNodes } from './optimizeNodes.ts';
12+
import { optimizeNodesWithStringTable } from './optimizeNodes.ts';
1213
import { resolveMap } from './resolveMap.ts';
1314
import { TrieBlob } from './TrieBlob.ts';
1415
import { NodeChildIndexRefShift, NodeHeaderEOWMask, NodeMaskCharByte } from './TrieBlobFormat.ts';
@@ -241,8 +242,8 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
241242
const wLen = word.length;
242243
const bytes: number[] = [];
243244

244-
for (const t = { text: word, offset: 0 }; t.offset < wLen; ) {
245-
const isLastChar = t.offset >= wLen - 1;
245+
for (const t = { text: word, i: 0 }; t.i < wLen; ) {
246+
const isLastChar = t.i >= wLen - 1;
246247
for (let utf8Code = encodeTextToUtf8_32Rev(t); utf8Code; utf8Code >>>= 8) {
247248
const seq = utf8Code & 0xff;
248249
bytes.push(seq);
@@ -336,9 +337,13 @@ export class FastTrieBlobBuilder implements TrieBuilder<FastTrieBlob> {
336337
NodeMaskCharByte,
337338
);
338339

339-
const nodes = optimize ? optimizeNodes(sortedNodes) : sortedNodes;
340+
const stringTable = new StringTableBuilder().build();
340341

341-
return FastTrieBlob.create(new FastTrieBlobInternals(nodes, info.info, info.characteristics));
342+
const r = optimize
343+
? optimizeNodesWithStringTable({ nodes: sortedNodes, stringTable })
344+
: { nodes: sortedNodes, stringTable };
345+
346+
return FastTrieBlob.create(new FastTrieBlobInternals(r.nodes, r.stringTable, info.info, info.characteristics));
342347
}
343348

344349
toJSON(): {

0 commit comments

Comments
 (0)