|
| 1 | +import type { BinaryFormat } from '../binary/index.ts'; |
| 2 | +import { BinaryDataBuilder, BinaryDataReader, BinaryFormatBuilder } from '../binary/index.ts'; |
| 3 | +import { GTrie } from '../GTrie/index.ts'; |
| 4 | +import { assert } from '../utils/assert.ts'; |
| 5 | + |
/** 32-bit unsigned typed array backed by a plain `ArrayBuffer`. */
type U32Array = Uint32Array<ArrayBuffer>;
/** 16-bit unsigned typed array backed by a plain `ArrayBuffer`. */
type U16Array = Uint16Array<ArrayBuffer>;
/** Byte array backed by a plain `ArrayBuffer`. */
type U8Array = Uint8Array<ArrayBuffer>;

/** Storage accepted for a string table's lookup index. */
type IndexArray = number[] | U16Array | U32Array;

/** Read-only view of the storage accepted for a string table's lookup index. */
type IndexArrayRO = readonly number[] | U16Array | U32Array;
| 13 | + |
/**
 * A set of strings stored in a compact form.
 *
 * Strings are stored as UTF-8 encoded bytes in a single contiguous buffer.
 * Each string is referenced by its starting offset and length within the buffer.
 *
 * This design minimizes memory overhead by avoiding individual string objects,
 * allowing efficient storage and retrieval of a large number of strings.
 *
 * Strings are retrieved by their position in the index table. Each index entry
 * packs `offset` and `length` into one number: the low `strLenBits` bits hold the
 * length and the remaining high bits hold the offset into the byte buffer.
 */
export class StringTable {
  readonly #index: IndexArray;
  readonly #data: U8Array;
  readonly #strLenBits: number;
  readonly #strLenMask: number;
  readonly #decoder = new TextDecoder();

  /**
   * @param index - the lookup index; each entry is `offset|len` where the low bits are the length
   * @param utf8ByteData - the UTF-8 encoded byte data for all the strings
   * @param strLenBits - number of bits used to store the length of the string in the index entry
   */
  constructor(index: IndexArray, utf8ByteData: U8Array, strLenBits: number) {
    this.#index = index;
    this.#data = utf8ByteData;
    this.#strLenBits = strLenBits;
    // Mask that keeps only the low `strLenBits` bits of an index entry.
    this.#strLenMask = (1 << strLenBits) - 1;
  }

  /** The packed `offset|len` lookup entries, one per string. */
  get index(): Readonly<IndexArray> {
    return this.#index;
  }

  /** The shared byte buffer that every index entry points into. */
  get charData(): Readonly<U8Array> {
    return this.#data;
  }

  /** Number of low bits of each index entry that hold the string length. */
  get strLenBits(): number {
    return this.#strLenBits;
  }

  /**
   * Look up the raw bytes of a string.
   *
   * @param idx - position of the string in the index table
   * @returns a view onto the shared byte buffer (not a copy), or `undefined` when
   *   `idx` is not an integer within the bounds of the index.
   */
  getStringBytes(idx: number): Uint8Array | undefined {
    // Reject NaN / fractional indices as well as out-of-range ones; otherwise the
    // lookup below would read `undefined` and silently yield an empty byte view.
    if (!Number.isInteger(idx) || idx < 0 || idx >= this.#index.length) return undefined;
    const value = this.#index[idx];
    // `>>>` treats the entry as unsigned, so entries with the top bit set still decode.
    const offset = value >>> this.#strLenBits;
    const length = value & this.#strLenMask;
    return this.#data.subarray(offset, offset + length);
  }

  /**
   * Look up a string and decode it.
   *
   * @param idx - position of the string in the index table
   * @returns the decoded string, or `undefined` when `idx` is not a valid index.
   */
  getString(idx: number): string | undefined {
    const bytes = this.getStringBytes(idx);
    if (!bytes) return undefined;
    return this.#decoder.decode(bytes);
  }
}
| 74 | + |
/**
 * Collects byte strings and packs them into a {@link StringTable}.
 *
 * Each distinct byte sequence is assigned a stable index in insertion order.
 * Once {@link StringTableBuilder.build} has been called the builder is locked
 * and further additions are rejected.
 */
export class StringTableBuilder {
  #buffers: (number[] | Uint8Array)[] = [];
  #encoder = new TextEncoder();
  #lookupTrie = new GTrie<number, number>();
  #locked = false;
  #maxStrLen = 0;

  /**
   * Add a byte sequence to the table.
   *
   * The sequence is stored by reference (it is not copied), so it should not be
   * mutated after being added.
   *
   * @param bytes - the bytes of the string to add
   * @returns the index assigned to the sequence; an already known sequence returns its existing index.
   */
  addStringBytes(bytes: Uint8Array | number[]): number {
    assert(!this.#locked, 'StringTableBuilder is locked and cannot be modified.');
    const found = this.#lookupTrie.get(bytes);
    if (found !== undefined) {
      return found;
    }
    const idx = this.#buffers.push(bytes) - 1;
    this.#lookupTrie.insert(bytes, idx);
    this.#maxStrLen = Math.max(this.#maxStrLen, bytes.length);
    return idx;
  }

  /**
   * Encode a string and add it to the table.
   *
   * @param str - the string to add
   * @returns the index assigned to the string.
   */
  addString(str: string): number {
    const bytes = this.#encoder.encode(str);
    return this.addStringBytes(bytes);
  }

  /**
   * Lock the builder and pack everything added so far into a {@link StringTable}.
   *
   * Index entries are packed as `(offset << strLenBits) | length` and must fit in
   * 32 bits, because both packing and unpacking use 32-bit bitwise operators.
   *
   * @returns the packed table; an empty builder yields an empty table with `strLenBits` of 8.
   */
  build(): StringTable {
    this.#locked = true;

    if (!this.#buffers.length) {
      return new StringTable([], new Uint8Array(0), 8);
    }

    // Longest first, so that shorter strings can reuse bytes already written
    // for a longer string that contains them.
    const sortedBySize = this.#buffers.map((b, i) => ({ b, i })).sort((a, b) => b.b.length - a.b.length);
    const byteValues: number[] = [];

    const strLenBits = Math.ceil(Math.log2(this.#maxStrLen + 1));
    const strLenMask = (1 << strLenBits) - 1;
    // Largest offset that still fits in the bits left over in a 32-bit entry.
    const maxOffset = 0xffffffff >>> strLenBits;
    const index: number[] = new Array(this.#buffers.length);

    for (const { b, i } of sortedBySize) {
      let offset = findValues(b);
      if (offset < 0) {
        offset = appendValues(b);
      }
      const length = b.length;
      assert(length <= strLenMask, `String length ${length} exceeds maximum of ${strLenMask}`);
      // Without this check `<<` would silently drop the high bits of the offset.
      assert(offset <= maxOffset, `String offset ${offset} exceeds maximum of ${maxOffset}`);
      // `>>> 0` keeps the entry unsigned even when bit 31 is set.
      index[i] = ((offset << strLenBits) | length) >>> 0;
    }

    return new StringTable(index, new Uint8Array(byteValues), strLenBits);

    /** Find the first position in `byteValues` where `buf` already occurs, or -1. */
    function findValues(buf: number[] | Uint8Array): number {
      const bufLen = buf.length;
      const byteLen = byteValues.length;
      const lastStart = byteLen - bufLen;

      for (let i = 0; i <= lastStart; i++) {
        let match = true;
        for (let j = 0; j < bufLen; j++) {
          if (byteValues[i + j] !== buf[j]) {
            match = false;
            break;
          }
        }
        if (match) {
          return i;
        }
      }

      return -1;
    }

    /** Append `buf` to `byteValues` and return the offset at which it was written. */
    function appendValues(buf: number[] | Uint8Array): number {
      const offset = byteValues.length;
      // Push one value at a time: spreading a very long buffer into `push(...)`
      // can exceed the engine's argument limit and throw a RangeError.
      for (const value of buf) {
        byteValues.push(value);
      }
      return offset;
    }
  }
}
| 154 | + |
/**
 * Describe the binary layout used to serialize a string table.
 *
 * The layout has two one-byte fields (`indexBits`, `strLenBits`), a `reserved`
 * string field, and pointer fields for the index array and the string byte data.
 * NOTE(review): the third argument to the `addUint8`/`addString` calls and the
 * trailing `'index32'` argument are passed through to `BinaryFormatBuilder`;
 * they look like a default value / size and an alias target respectively —
 * confirm against `BinaryFormatBuilder`.
 */
function getStringTableBinaryFormat(): BinaryFormat {
  const header = new BinaryFormatBuilder()
    .addUint8('indexBits', 'The number of bits needed for each index entry', 32)
    .addUint8('strLenBits', 'The number of bits needed to store the max length of a string in the table.', 8)
    .addString('reserved', 'Reserved for future use', 6);

  const withIndex = header
    .addUint32ArrayPtr('index32', 'String index array of 32 bit entries')
    .addUint16ArrayPtr('index16', 'String index array of 16 bit entries', 'index32')
    .addUint8ArrayPtr('index', 'String index array of 8 bit entries', 'index32');

  return withIndex.addUint8ArrayPtr('data', 'String byte data').build();
}
| 166 | + |
/**
 * Serialize a {@link StringTable} into its binary representation.
 *
 * The index is written with 16-bit entries when the length bits plus the bits
 * needed to address the byte data fit in 16 bits, and with 32-bit entries otherwise.
 *
 * @param table - the table to serialize
 * @param endian - byte order passed to the binary data builder
 * @returns the bytes produced by the binary data builder.
 */
export function encodeStringTableToBinary(table: StringTable, endian: 'LE' | 'BE'): U8Array {
  const lengthBits = table.strLenBits;
  const bitsForOffset = Math.ceil(Math.log2(table.charData.length + 1));
  const requiredBits = lengthBits + bitsForOffset;

  let indexBits = 32;
  if (requiredBits <= 16) {
    indexBits = 16;
  }
  assert(requiredBits <= indexBits, `Index bits ${indexBits} is too small for required bits ${requiredBits}`);

  const out = new BinaryDataBuilder(getStringTableBinaryFormat(), endian);
  out.setUint8('indexBits', indexBits);
  out.setUint8('strLenBits', lengthBits);

  // Only one of the two index pointer fields is populated, matching `indexBits`.
  switch (indexBits) {
    case 16:
      out.setPtrUint16Array('index16', toU16Array(table.index));
      break;
    default:
      out.setPtrUint32Array('index32', toU32Array(table.index));
      break;
  }

  out.setPtrUint8Array('data', table.charData);
  return out.build();
}
| 188 | + |
/**
 * Rebuild a {@link StringTable} from its binary representation.
 *
 * The stored `indexBits` value selects which index field is read: 16 reads the
 * 16-bit index, any other value reads the 32-bit index.
 *
 * @param data - the serialized table
 * @param endian - byte order passed to the binary data reader
 * @returns a table built from the arrays returned by the reader.
 */
export function decodeStringTableFromBinary(data: U8Array, endian: 'LE' | 'BE'): StringTable {
  const format = getStringTableBinaryFormat();
  const source = new BinaryDataReader(data, format, endian);
  const entryBits = source.getUint8('indexBits');
  const lengthBits = source.getUint8('strLenBits');

  if (entryBits === 16) {
    const index16 = source.getPtrUint16Array('index16');
    return new StringTable(index16, source.getPtrUint8Array('data'), lengthBits);
  }

  const index32 = source.getPtrUint32Array('index32');
  return new StringTable(index32, source.getPtrUint8Array('data'), lengthBits);
}
| 197 | + |
/**
 * Return `data` as a `Uint16Array`.
 *
 * An existing `Uint16Array` is returned as-is (same instance); anything else is
 * copied into a new `Uint16Array`.
 */
function toU16Array(data: IndexArrayRO): U16Array {
  return data instanceof Uint16Array ? data : new Uint16Array(data);
}
| 204 | + |
/**
 * Return `data` as a `Uint32Array`.
 *
 * An existing `Uint32Array` is returned as-is (same instance); anything else is
 * copied into a new `Uint32Array`.
 */
function toU32Array(data: IndexArrayRO): U32Array {
  return data instanceof Uint32Array ? data : new Uint32Array(data);
}
0 commit comments