Skip to content

Commit 6a330ff

Browse files
Jason3S and Copilot authored
fix: Add StringTable and refactor BinaryFormat (#8243)
Signed-off-by: Jason Dent <[email protected]> Co-authored-by: Copilot <[email protected]>
1 parent f68a529 commit 6a330ff

File tree

21 files changed

+1199
-526
lines changed

21 files changed

+1199
-526
lines changed

packages/cspell-dictionary/src/SpellingDictionary/SpellingDictionaryFromTrie.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ type FindAnyFormResult = FindFullResult;
235235
* @returns SpellingDictionary
236236
*/
237237
export function createSpellingDictionaryFromTrieFile(
238-
data: string | Uint8Array,
238+
data: string | Uint8Array<ArrayBuffer>,
239239
name: string,
240240
source: string,
241241
options: SpellingDictionaryOptionsRO,

packages/cspell-lib/src/lib/SpellingDictionary/DictionaryController/DictionaryLoader.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ interface CacheEntry {
5757
}
5858

5959
interface Reader {
60-
read(filename: URL): Promise<Uint8Array>;
60+
read(filename: URL): Promise<Uint8Array<ArrayBuffer>>;
6161
readText(filename: URL): Promise<string>;
6262
readLines(filename: URL): Promise<string[]>;
6363
}

packages/cspell-trie-lib/api/api.d.ts

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import { describe, expect, test } from 'vitest';
2+
3+
import { hexDump } from '../binary/hexDump.ts';
4+
import { decodeStringTableFromBinary, encodeStringTableToBinary, StringTableBuilder } from './StringTable.ts';
5+
6+
describe('StringTableBuilder', () => {
    // Sample strings. Many are substrings of one another (e.g. 'he', 'hell', 'hello').
    const segments = [
        'hello',
        'world',
        'he',
        'hell',
        'o',
        'ing',
        're',
        'er',
        'run',
        'fall',
        'falling',
        'runs',
        'apple',
        'app',
        'rest',
        'restaurant',
        'take',
        'ake',
        'people',
        'peoples',
        'careful',
        'carefully',
        'caregiver',
        'care',
        'giver',
        'll',
        'el',
        'lo',
        'ref',
    ];

    /**
     * Adds every sample string to a fresh builder and builds the table.
     * @returns the built table together with the index returned for each sample string.
     */
    function buildSampleTable() {
        const tableBuilder = new StringTableBuilder();
        const ids = segments.map((segment) => tableBuilder.addString(segment));
        return { ids, stringTable: tableBuilder.build() };
    }

    test('should create a StringTableBuilder instance', () => {
        expect(new StringTableBuilder()).toBeDefined();
    });

    test('should build empty StringTable', () => {
        const emptyTable = new StringTableBuilder().build();
        expect(emptyTable).toBeDefined();
        expect(emptyTable.index.length).toBe(0);
        expect(emptyTable.charData.length).toBe(0);
    });

    test('should build StringTable with strings and be able to get them back', () => {
        const { ids, stringTable } = buildSampleTable();

        // Debugging aid: console.log(hexDump(stringTable.charData));

        const roundTripped = ids.map((id) => stringTable.getString(id));
        expect(roundTripped).toEqual(segments);
    });

    test('encode and decode StringTable', () => {
        const { ids, stringTable } = buildSampleTable();

        const binary = encodeStringTableToBinary(stringTable, 'LE');
        expect(hexDump(binary)).toMatchSnapshot();

        const restored = decodeStringTableFromBinary(binary, 'LE');
        const roundTripped = ids.map((id) => restored.getString(id));
        expect(roundTripped).toEqual(segments);
    });
});
Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
import type { BinaryFormat } from '../binary/index.ts';
2+
import { BinaryDataBuilder, BinaryDataReader, BinaryFormatBuilder } from '../binary/index.ts';
3+
import { GTrie } from '../GTrie/index.ts';
4+
import { assert } from '../utils/assert.ts';
5+
6+
type U32Array = Uint32Array<ArrayBuffer>;
7+
type U16Array = Uint16Array<ArrayBuffer>;
8+
type U8Array = Uint8Array<ArrayBuffer>;
9+
10+
type IndexArray = U32Array | U16Array | number[];
11+
12+
type IndexArrayRO = U32Array | U16Array | Readonly<number[]>;
13+
14+
/**
 * This is a set of strings stored in a compact form.
 *
 * Strings are stored as UTF-8 encoded bytes in a single contiguous buffer.
 * Each string is referenced by its starting offset and length within the buffer.
 *
 * This design minimizes memory overhead by avoiding individual string objects,
 * allowing efficient storage and retrieval of a large number of strings.
 *
 * Strings are retrieved based on their index.
 *
 * Each entry of the internal index table packs both values into one number:
 * the low `strLenBits` bits hold the byte length and the remaining high bits
 * hold the byte offset into the buffer.
 */
export class StringTable {
    #index: IndexArray;
    #data: U8Array;
    #strLenBits: number;
    #strLenMask: number;
    #decoder = new TextDecoder();

    /**
     *
     * @param index - the lookup index format: `offset|len` where the low bits are the length
     * @param utf8ByteData - the UTF-8 encoded byte data for all the strings
     * @param strLenBits - number of bits used to store the length of the string in the index entry
     */
    constructor(index: IndexArray, utf8ByteData: U8Array, strLenBits: number) {
        this.#index = index;
        this.#data = utf8ByteData;
        this.#strLenBits = strLenBits;
        // Mask that isolates the length stored in the low bits of an index entry.
        this.#strLenMask = (1 << strLenBits) - 1;
    }

    /** The packed `offset|len` entries, one per string. */
    get index(): Readonly<IndexArray> {
        return this.#index;
    }

    /** The shared byte buffer that all strings are sliced from. */
    get charData(): Readonly<U8Array> {
        return this.#data;
    }

    /** Number of low bits of each index entry that hold the string length. */
    get strLenBits(): number {
        return this.#strLenBits;
    }

    /**
     * Get the raw bytes of a string.
     * @param idx - position of the string in the index table
     * @returns a view (not a copy) onto the shared buffer, or `undefined` when `idx`
     *   does not address an entry.
     */
    getStringBytes(idx: number): Uint8Array | undefined {
        if (idx < 0 || idx >= this.#index.length) return undefined;
        const value = this.#index[idx];
        // `NaN` and fractional indexes pass the range check above but do not address an entry.
        // Without this guard the bit operations below would coerce `undefined` to 0 and
        // an empty slice would be returned instead of `undefined`.
        if (value === undefined) return undefined;
        // `>>>` treats the entry as unsigned, so entries using bit 31 still decode correctly.
        const offset = value >>> this.#strLenBits;
        const length = value & this.#strLenMask;
        return this.#data.subarray(offset, offset + length);
    }

    /**
     * Get a string decoded from its stored bytes.
     * @param idx - position of the string in the index table
     * @returns the decoded string, or `undefined` when `idx` does not address an entry.
     */
    getString(idx: number): string | undefined {
        const bytes = this.getStringBytes(idx);
        if (!bytes) return undefined;
        return this.#decoder.decode(bytes);
    }
}
74+
75+
/**
 * Collects byte strings and packs them into a {@link StringTable}.
 *
 * Each distinct byte sequence is stored once: adding the same bytes again returns the
 * index handed out the first time. When the table is built, the strings are laid out
 * longest first so that a shorter string already present somewhere in the emitted bytes
 * reuses them instead of being appended again.
 *
 * Calling {@link StringTableBuilder.build} locks the builder; adding afterwards fails an assertion.
 */
export class StringTableBuilder {
    #buffers: (number[] | Uint8Array)[] = [];
    #encoder = new TextEncoder();
    #lookupTrie = new GTrie<number, number>();
    #locked = false;
    #maxStrLen = 0;

    /**
     * Add a string given as bytes.
     * @param bytes - the bytes of the string. The array is stored by reference,
     *   so it must not be modified after it has been added.
     * @returns the index of the string in the table that will be built.
     */
    addStringBytes(bytes: Uint8Array | number[]): number {
        assert(!this.#locked, 'StringTableBuilder is locked and cannot be modified.');
        const found = this.#lookupTrie.get(bytes);
        if (found !== undefined) {
            return found;
        }
        const idx = this.#buffers.push(bytes) - 1;
        this.#lookupTrie.insert(bytes, idx);
        this.#maxStrLen = Math.max(this.#maxStrLen, bytes.length);
        return idx;
    }

    /**
     * Add a string; it is stored as its UTF-8 encoding.
     * @param str - the string to add
     * @returns the index of the string in the table that will be built.
     */
    addString(str: string): number {
        const bytes = this.#encoder.encode(str);
        return this.addStringBytes(bytes);
    }

    /**
     * Build the table and lock this builder against further additions.
     * @returns a table in which every index returned by `addString` / `addStringBytes`
     *   resolves to the bytes that were added.
     */
    build(): StringTable {
        this.#locked = true;

        if (!this.#buffers.length) {
            return new StringTable([], new Uint8Array(0), 8);
        }

        // sorted by size descending, so shorter strings can be found inside longer ones.
        const sortedBySize = this.#buffers.map((b, i) => ({ b, i })).sort((a, b) => b.b.length - a.b.length);
        const byteValues: number[] = [];

        // Exact bit length of the longest string (0 when only the empty string was added).
        const strLenBits = 32 - Math.clz32(this.#maxStrLen);
        const strLenMask = (1 << strLenBits) - 1;
        // An index entry is a 32 bit value; the offset gets whatever the length does not use.
        const maxOffset = 2 ** (32 - strLenBits) - 1;
        const index: number[] = new Array(this.#buffers.length);

        for (const { b, i } of sortedBySize) {
            let offset = findValues(b);
            if (offset < 0) {
                offset = appendValues(b);
            }
            const length = b.length;
            assert(length <= strLenMask, `String length ${length} exceeds maximum of ${strLenMask}`);
            // Without this check `<<` would silently drop the high bits of a large offset.
            assert(offset <= maxOffset, `String offset ${offset} exceeds maximum of ${maxOffset}`);
            // `>>> 0` keeps the entry unsigned when bit 31 is set.
            index[i] = ((offset << strLenBits) | length) >>> 0;
        }

        return new StringTable(index, new Uint8Array(byteValues), strLenBits);

        /**
         * Search the bytes emitted so far for `buf`.
         * @returns the offset of the first occurrence, or -1 if there is none.
         */
        function findValues(buf: number[] | Uint8Array): number {
            const bufLen = buf.length;
            const byteLen = byteValues.length;
            const lastStart = byteLen - bufLen;

            for (let i = 0; i <= lastStart; i++) {
                let match = true;
                for (let j = 0; j < bufLen; j++) {
                    if (byteValues[i + j] !== buf[j]) {
                        match = false;
                        break;
                    }
                }
                if (match) {
                    return i;
                }
            }

            return -1;
        }

        /**
         * Append `buf` to the emitted bytes.
         * @returns the offset at which `buf` was placed.
         */
        function appendValues(buf: number[] | Uint8Array): number {
            const offset = byteValues.length;
            // Push one value at a time: spreading a very long array into `push(...)`
            // can exceed the engine's argument limit and throw a RangeError.
            for (const value of buf) {
                byteValues.push(value);
            }
            return offset;
        }
    }
}
154+
155+
/**
 * Describe the binary layout used to store a StringTable.
 *
 * Fields, in declaration order: `indexBits`, `strLenBits`, `reserved`, the index array
 * pointers (`index32`, `index16`, `index`) and the `data` pointer.
 * NOTE(review): `index16` and `index` name `index32` as a third argument, presumably so
 * that all three share the same slot; confirm against BinaryFormatBuilder.
 *
 * @returns the format shared by the encoder and the decoder.
 */
function getStringTableBinaryFormat(): BinaryFormat {
    const stringTableFormat = new BinaryFormatBuilder()
        .addUint8('indexBits', 'The number of bits needed for each index entry', 32)
        .addUint8('strLenBits', 'The number of bits needed to store the max length of a string in the table.', 8)
        .addString('reserved', 'Reserved for future use', 6)
        .addUint32ArrayPtr('index32', 'String index array of 32 bit entries')
        .addUint16ArrayPtr('index16', 'String index array of 16 bit entries', 'index32')
        .addUint8ArrayPtr('index', 'String index array of 8 bit entries', 'index32')
        .addUint8ArrayPtr('data', 'String byte data')
        .build();
    return stringTableFormat;
}
166+
167+
/**
 * Serialize a StringTable into its binary form.
 *
 * The index is written with 16 bit entries when the length bits plus the bits needed
 * for an offset into the character data fit in 16 bits, otherwise with 32 bit entries.
 *
 * @param table - the table to serialize
 * @param endian - byte order passed to the binary data builder
 * @returns the encoded bytes
 */
export function encodeStringTableToBinary(table: StringTable, endian: 'LE' | 'BE'): U8Array {
    const { strLenBits, charData } = table;

    // Bits needed to address any offset within the character data.
    const bitsForOffset = Math.ceil(Math.log2(charData.length + 1));
    const requiredBits = strLenBits + bitsForOffset;
    const indexBits = requiredBits <= 16 ? 16 : 32;
    assert(requiredBits <= indexBits, `Index bits ${indexBits} is too small for required bits ${requiredBits}`);

    const output = new BinaryDataBuilder(getStringTableBinaryFormat(), endian);
    output.setUint8('indexBits', indexBits);
    output.setUint8('strLenBits', strLenBits);

    switch (indexBits) {
        case 16:
            output.setPtrUint16Array('index16', toU16Array(table.index));
            break;
        default:
            output.setPtrUint32Array('index32', toU32Array(table.index));
            break;
    }

    output.setPtrUint8Array('data', charData);
    return output.build();
}
188+
189+
/**
 * Rebuild a StringTable from bytes produced by `encodeStringTableToBinary`.
 *
 * The stored `indexBits` value selects which index array is read: 16 reads the
 * 16 bit array, any other value reads the 32 bit array.
 *
 * @param data - the encoded table
 * @param endian - byte order passed to the binary data reader
 * @returns the decoded table
 */
export function decodeStringTableFromBinary(data: U8Array, endian: 'LE' | 'BE'): StringTable {
    const format = getStringTableBinaryFormat();
    const input = new BinaryDataReader(data, format, endian);

    const bitsPerEntry = input.getUint8('indexBits');
    const lengthBits = input.getUint8('strLenBits');

    let entries: IndexArray;
    if (bitsPerEntry === 16) {
        entries = input.getPtrUint16Array('index16');
    } else {
        entries = input.getPtrUint32Array('index32');
    }

    const charData = input.getPtrUint8Array('data');
    return new StringTable(entries, charData, lengthBits);
}
197+
198+
/**
 * Get the index as a `Uint16Array`.
 * @param data - the index entries
 * @returns `data` itself when it already is a `Uint16Array`, otherwise a new array
 *   holding the same values converted to 16 bits.
 */
function toU16Array(data: IndexArrayRO): U16Array {
    return data instanceof Uint16Array ? data : new Uint16Array(data);
}
204+
205+
/**
 * Get the index as a `Uint32Array`.
 * @param data - the index entries
 * @returns `data` itself when it already is a `Uint32Array`, otherwise a new array
 *   holding the same values converted to 32 bits.
 */
function toU32Array(data: IndexArrayRO): U32Array {
    return data instanceof Uint32Array ? data : new Uint32Array(data);
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
2+
3+
exports[`StringTableBuilder > encode and decode StringTable 1`] = `
4+
"00000000 10 04 00 00 00 00 00 00 18 00 00 00 3a 00 00 00 ............:...
5+
00000010 52 00 00 00 41 00 00 00 a5 02 f5 02 a2 02 a4 02 R...A...........
6+
00000020 51 02 03 02 02 00 a2 01 93 03 c4 01 c7 01 94 03 Q...............
7+
00000030 45 03 43 03 04 00 0a 00 d4 03 e3 03 36 02 37 02 E.C.........6.7.
8+
00000040 a7 00 a9 00 39 01 a4 00 75 01 02 01 b2 02 d2 02 ....9...u.......
9+
00000050 c3 00 72 65 73 74 61 75 72 61 6e 74 63 61 72 65 ..restaurantcare
10+
00000060 66 75 6c 6c 79 63 61 72 65 67 69 76 65 72 66 61 fullycaregiverfa
11+
00000070 6c 6c 69 6e 67 70 65 6f 70 6c 65 73 68 65 6c 6c llingpeopleshell
12+
00000080 6f 77 6f 72 6c 64 61 70 70 6c 65 72 75 6e 73 74 oworldapplerunst
13+
00000090 61 6b 65 ake"
14+
`;

packages/cspell-trie-lib/src/lib/TrieBlob/FastTrieBlob.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ export class FastTrieBlob implements TrieData {
8888
* @param seq - the byte sequence of the character to look for
8989
* @returns
9090
*/
91-
#lookupNode(nodeIdx: number, seq: readonly number[] | Readonly<Uint8Array>): number | undefined {
91+
#lookupNode(nodeIdx: number, seq: readonly number[] | Readonly<Uint8Array<ArrayBuffer>>): number | undefined {
9292
const NodeMaskChildCharIndex = this.bitMasksInfo.NodeMaskChildCharIndex;
9393
const NodeChildRefShift = this.bitMasksInfo.NodeChildRefShift;
9494
const nodes = this.#nodes;
@@ -224,7 +224,7 @@ export class FastTrieBlob implements TrieData {
224224
return this;
225225
}
226226

227-
encodeToBTrie(): Uint8Array {
227+
encodeToBTrie(): Uint8Array<ArrayBuffer> {
228228
return this.toTrieBlob().encodeToBTrie();
229229
}
230230

packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlob.test.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import { describe, expect, test } from 'vitest';
22

33
import { readFastTrieBlobFromConfig } from '../../test/dictionaries.test.helper.ts';
4+
import { hexDump } from '../binary/index.ts';
45
import { validateTrie } from '../TrieNode/trie-util.ts';
56
import { buildTrieNodeTrieFromWords } from '../TrieNode/TrieNodeBuilder.ts';
67
import { createTrieBlob } from './createTrieBlob.ts';
78
import { FastTrieBlobBuilder } from './FastTrieBlobBuilder.ts';
8-
import { hexDump } from './hexDump.ts';
99
import { TrieBlob } from './TrieBlob.ts';
1010

1111
describe('TrieBlob', () => {

0 commit comments

Comments
 (0)