Skip to content

Commit 59a0cfd

Browse files
Jason3S and Copilot authored
fix: Make endian required when encoding a StringTable (#8265)
Signed-off-by: Jason Dent <[email protected]> Co-authored-by: Copilot <[email protected]>
1 parent e824661 commit 59a0cfd

File tree

7 files changed

+116
-7
lines changed

7 files changed

+116
-7
lines changed

cspell.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@
7878
"test-packages/yarn/yarn2/test-eslint-plugin/**"
7979
],
8080
"useGitignore": true,
81-
"flagWords": [],
81+
"flagWords": ["unit8:uint8", "Unit8:Uint8", "unit16:uint16", "Unit16:Uint16", "unit32:uint32", "Unit32:Uint32"],
8282
"ignoreWords": ["commitcomment"],
8383
"features": {
8484
"weighted-suggestions": true

packages/cspell-trie-lib/src/lib/StringTable/StringTable.ts

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -194,19 +194,36 @@ export class StringTableBuilder {
194194
}
195195
}
196196

197+
/**
198+
* The endian code used to identify endianness in the binary format.
199+
* We use the 16-bit value 0x5453 (corresponding to the characters 'S' (0x53) and 'T' (0x54)).
200+
* In little-endian representation, 0x5453 is stored as bytes 0x53 0x54 ('S', 'T').
201+
* In big-endian representation, 0x5453 is stored as bytes 0x54 0x53 ('T', 'S').
202+
*
203+
* The value stored should match the value retrieved, otherwise the endianness is incorrect.
204+
*/
205+
const bomCode = 0x5453; // 16-bit BOM value used for endianness check
206+
197207
function getStringTableBinaryFormat(): BinaryFormat {
198208
return new BinaryFormatBuilder()
199209
.addUint8('indexBits', 'The number of bits needed for each index entry', 32)
200210
.addUint8('strLenBits', 'The number of bits needed to store the max length of a string in the table.', 8)
201-
.addString('reserved', 'Reserved for future use', 6)
211+
.addUint16('bom', 'The Byte Order Mark.', bomCode)
212+
.addString('reserved', 'Reserved for future use', 4)
202213
.addUint32ArrayPtr('index32', 'String index array of 32 bit entries')
203214
.addUint16ArrayPtr('index16', 'String index array of 16 bit entries', 'index32')
204215
.addUint8ArrayPtr('index', 'String index array of 8 bit entries', 'index32')
205216
.addUint8ArrayPtr('data', 'String byte data')
206217
.build();
207218
}
208219

209-
export function encodeStringTableToBinary(table: StringTable, endian?: 'LE' | 'BE'): U8Array {
220+
/**
221+
* Encodes a StringTable into binary data so that it can be stored or transmitted.
222+
* @param table - the string table to encode
223+
* @param endian - the resulting endianness of the data.
224+
* @returns The encoded string table binary data.
225+
*/
226+
export function encodeStringTableToBinary(table: StringTable, endian: 'LE' | 'BE'): U8Array {
210227
const strLenBits = table.strLenBits;
211228
const offsetBits = Math.ceil(Math.log2(table.charData.length + 1));
212229
const minIndexBits = strLenBits + offsetBits;
@@ -218,6 +235,7 @@ export function encodeStringTableToBinary(table: StringTable, endian?: 'LE' | 'B
218235
const builder = new BinaryDataBuilder(format, endian);
219236
builder.setUint8('indexBits', indexBits);
220237
builder.setUint8('strLenBits', strLenBits);
238+
builder.setUint16('bom', bomCode); // written in the builder's endianness so the decoder can detect a mismatch
221239
if (indexBits === 16) {
222240
builder.setPtrUint16Array('index16', toU16Array(table.index));
223241
} else {
@@ -228,13 +246,21 @@ export function encodeStringTableToBinary(table: StringTable, endian?: 'LE' | 'B
228246
return builder.build();
229247
}
230248

231-
export function decodeStringTableFromBinary(data: U8Array, endian?: 'LE' | 'BE'): StringTable {
249+
/**
250+
* Decodes binary data into a StringTable.
251+
* @param data - the byte data of the string table.
252+
* @param endian - the endianness of the encoded data.
253+
* @returns The decoded StringTable.
254+
*/
255+
export function decodeStringTableFromBinary(data: U8Array, endian: 'LE' | 'BE'): StringTable {
232256
if (!data?.length) {
233257
return new StringTable([], new Uint8Array(0), 8);
234258
}
235259
const reader = new BinaryDataReader(data, getStringTableBinaryFormat(), endian);
236260
const indexBits = reader.getUint8('indexBits');
237261
const strLenBits = reader.getUint8('strLenBits');
262+
const bomStored = reader.getUint16('bom');
263+
assert(!bomStored || bomStored === bomCode, 'Endian mismatch');
238264
const index = indexBits === 16 ? reader.getPtrUint16Array('index16') : reader.getPtrUint32Array('index32');
239265
const buffer = reader.getPtrUint8Array('data');
240266
return new StringTable(index, buffer, strLenBits);

packages/cspell-trie-lib/src/lib/StringTable/__snapshots__/StringTable.test.ts.snap

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
22

33
exports[`StringTableBuilder > encode and decode StringTable 1`] = `
4-
"00000000 10 04 00 00 00 00 00 00 18 00 00 00 3a 00 00 00 ............:...
4+
"00000000 10 04 53 54 00 00 00 00 18 00 00 00 3a 00 00 00 ..ST........:...
55
00000010 52 00 00 00 41 00 00 00 a5 02 f5 02 a2 02 a4 02 R...A...........
66
00000020 51 02 03 02 02 00 a2 01 93 03 c4 01 c7 01 94 03 Q...............
77
00000030 45 03 43 03 04 00 0a 00 d4 03 e3 03 36 02 37 02 E.C.........6.7.

packages/cspell-trie-lib/src/lib/TrieBlob/TrieBlobEncoder.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ export function encodeTrieBlobToBTrie(blob: TrieBlobInfo): U8Array {
3939
builder.setString('characteristics', cvtTrieCharacteristicsToFlags(blob.characteristics));
4040

4141
if (blob.stringTable.length) {
42-
const stringTableData = encodeStringTableToBinary(blob.stringTable);
42+
const stringTableData = encodeStringTableToBinary(blob.stringTable, builder.endian);
4343
builder.setPtrUint8Array('stringTable', stringTableData);
4444
}
4545

packages/cspell-trie-lib/src/lib/TrieBlob/__snapshots__/TrieBlob.test.ts.snap

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ exports[`TrieBlob encode/decode > encode optimize hexDump 1`] = `
3939
00000080 00 00 00 00 05 00 00 00 61 07 00 00 62 08 00 00 ........a...b...
4040
00000090 67 09 00 00 6f 0a 00 00 73 0b 00 00 00 01 00 00 g...o...s.......
4141
000000a0 00 09 00 00 00 13 00 00 00 19 00 00 00 21 00 00 .............!..
42-
000000b0 00 33 00 00 10 04 00 00 00 00 00 00 18 00 00 00 .3..............
42+
000000b0 00 33 00 00 10 04 53 54 00 00 00 00 18 00 00 00 .3....ST........
4343
000000c0 34 00 00 00 4c 00 00 00 1b 00 00 00 00 00 51 00 4...L.........Q.
4444
000000d0 52 01 43 01 34 01 21 00 a2 00 93 00 a4 00 95 00 R.C.4.!.........
4545
000000e0 92 01 83 01 74 01 12 01 03 01 f4 00 e5 00 81 00 ....t...........

packages/cspell-trie-lib/src/lib/binary/binaryFormat.test.ts

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,46 @@ describe('BinaryDataBuilder', () => {
169169
const readArr = reader.getPtrUint16Array('arrayPtr');
170170
expect(readArr).toEqual(arr);
171171
});
172+
173+
test('reader.getField', () => {
174+
const format = new BinaryFormatBuilder()
175+
.addString('header', 'The file header', 'Test Header')
176+
.addUint16('value16', 'A uint16 value', 0x1234)
177+
.addUint32ArrayPtr('arrayPtr', 'Pointer to uint32 array')
178+
.build();
179+
const builder = new BinaryDataBuilder(format);
180+
const arr = new Uint32Array(numberRange(10, 20));
181+
builder.setPtrUint32Array('arrayPtr', arr);
182+
const data = builder.build();
183+
184+
const reader = new BinaryDataReader(data, format);
185+
186+
// make sure the field matches
187+
const field = reader.getField('arrayPtr');
188+
expect(field).toEqual(format.getField('arrayPtr'));
189+
});
190+
191+
test('reader.getAsUint16', () => {
192+
const format = new BinaryFormatBuilder()
193+
.addString('header', 'The file header', 'Test Header')
194+
.addUint16('data16', 'A 16-bit value', 0xabcd)
195+
.addUint8Array('data8', 'An array of uint8', [0x12, 0x34])
196+
.build();
197+
const builder = new BinaryDataBuilder(format);
198+
const data = builder.build();
199+
200+
const reader = new BinaryDataReader(data, format);
201+
202+
expect(reader.getUint16('data16')).toBe(0xabcd);
203+
204+
const expectedUint8Array = reader.endian === 'LE' ? new Uint8Array([0xcd, 0xab]) : new Uint8Array([0xab, 0xcd]);
205+
const val8 = reader.getUint8Array('data16');
206+
expect(val8).toEqual(expectedUint8Array);
207+
208+
const expectedUint16 = reader.endian === 'LE' ? 0x3412 : 0x1234;
209+
const val16 = reader.getAsUint16('data8');
210+
expect(val16).toBe(expectedUint16);
211+
});
172212
});
173213

174214
describe('field overrides', () => {

packages/cspell-trie-lib/src/lib/binary/binaryFormat.ts

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,13 @@ export class BinaryFormatBuilder {
172172
return this;
173173
}
174174

175+
addUint8Array(name: string, description: string, length: number | Uint8Array | number[]): BinaryFormatBuilder {
176+
// `as number` is needed because the `Uint8Array` constructor overloads do not accept a union of their argument types.
177+
const value = new Uint8Array(length as number);
178+
this.addData(name, description, 'value', value);
179+
return this;
180+
}
181+
175182
addData(name: string, description: string, formatType: FormatType, data: DataArrayView): BinaryFormatBuilder {
176183
const byteSize = data.byteLength / data.length;
177184
assert(isByteAlignment(byteSize), `Invalid byte size: ${byteSize} for field: ${name}`);
@@ -636,6 +643,18 @@ export class BinaryDataReader {
636643
return view.getUint16(0, this.#useLE);
637644
}
638645

646+
/**
647+
* Read a field as Uint16 starting at the given byte offset.
648+
* @param name - name of field
649+
* @param byteOffset - offset in bytes from the beginning of the field
650+
* @returns the value read.
651+
*/
652+
getAsUint16(name: string, byteOffset: number = 0): number {
653+
const element = this.getDataElement(name);
654+
const view = new DataView(element.data.buffer, element.data.byteOffset, element.data.byteLength);
655+
return view.getUint16(byteOffset, this.#useLE);
656+
}
657+
639658
/**
640659
* Get a Uint8 from the data.
641660
* @param name - name of the Uint8 field
@@ -754,9 +773,33 @@ export class BinaryDataReader {
754773
this.#useLE = endian === 'LE';
755774
}
756775

776+
get endian(): 'LE' | 'BE' {
777+
return this.#useLE ? 'LE' : 'BE';
778+
}
779+
757780
reverseEndian(): void {
758781
this.#useLE = !this.#useLE;
759782
}
783+
784+
/**
785+
* Get the raw bytes for a field.
786+
* @param name - name of the field
787+
* @returns the bytes or undefined
788+
*/
789+
getUint8Array(name: string): U8Array | undefined {
790+
const element = this.getDataElement(name);
791+
if (!element) return undefined;
792+
return element.data;
793+
}
794+
795+
/**
796+
* Get the FormatElement for a field.
797+
* @param name - name of the field
798+
* @returns the element or undefined
799+
*/
800+
getField(name: string): FormatElement | undefined {
801+
return this.format.getField(name);
802+
}
760803
}
761804

762805
function formatElementToJSON(fe: FormatElement): unknown {

0 commit comments

Comments
 (0)