/**
 * Todo: add dedup: we need to remember chunks already written, avoid adding them to the xorb again,
 * and take that into account for file reconstruction
 * Todo: byte grouping?
 */

import { XET_CHUNK_HEADER_BYTES, XetChunkCompressionScheme } from "./XetBlob";
import { compress as lz4_compress } from "../vendor/lz4js";

const TARGET_CHUNK_SIZE = 64 * 1024; // target chunk size for content-defined chunking
const XORB_SIZE = 64 * 1024 * 1024; // maximum xorb size, in bytes
const MAX_XORB_CHUNKS = 8 * 1024; // maximum number of chunks per xorb

export async function* createXorbs(
	fileSource: Blob
): AsyncGenerator<{ xorb: Uint8Array; hash: string }, void, undefined> {
	const chunkModule = await import("../vendor/xet-chunk/chunker_wasm");
	await chunkModule.init();
	const chunker = new chunkModule.Chunker(TARGET_CHUNK_SIZE);

	let xorb = new Uint8Array(XORB_SIZE);
	// Buffers read from the source stream, not yet consumed by emitted chunks
	const sourceChunks: Array<Uint8Array> = [];

	try {
		const reader = fileSource.stream().getReader();
		let xorbOffset = 0;
		let xorbChunks: Array<{ hash: string; length: number }> = [];

		const addChunks = function* (chunks: Array<{ hash: string; length: number }>) {
			for (const chunk of chunks) {
				let chunkToCopy: Uint8Array;
				if (chunk.length === sourceChunks[0].length) {
					// The chunk maps 1:1 to the first source buffer
					chunkToCopy = sourceChunks[0];
					sourceChunks.shift();
				} else if (chunk.length < sourceChunks[0].length) {
					// The chunk is a prefix of the first source buffer
					chunkToCopy = sourceChunks[0].subarray(0, chunk.length);
					sourceChunks[0] = sourceChunks[0].subarray(chunk.length);
				} else {
					// The chunk spans multiple source buffers
					chunkToCopy = new Uint8Array(chunk.length);
					let copyOffset = 0;
					let index = 0;
					while (copyOffset < chunk.length) {
						const toCopy = Math.min(sourceChunks[index].length, chunk.length - copyOffset);
						chunkToCopy.set(sourceChunks[index].subarray(0, toCopy), copyOffset);
						copyOffset += toCopy;
						if (toCopy === sourceChunks[index].length) {
							index++;
						} else {
							// Keep the unconsumed tail of the last buffer for the next chunk
							sourceChunks[index] = sourceChunks[index].subarray(toCopy);
						}
					}
					sourceChunks.splice(0, index);
				}
				const newOffset = writeChunk(xorb, xorbOffset, chunkToCopy);
				if (newOffset === 0) {
					// Failed to write the chunk because it went over the xorb size limit:
					// flush the current xorb and write the chunk into a fresh one
					yield { xorb: xorb.subarray(0, xorbOffset), hash: chunkModule.compute_xorb_hash(xorbChunks) };
					xorb = new Uint8Array(XORB_SIZE);
					xorbChunks = [];
					xorbOffset = writeChunk(xorb, 0, chunkToCopy);

					if (xorbOffset === 0) {
						throw new Error("Failed to write chunk into xorb");
					}
				} else {
					xorbOffset = newOffset;
				}
				xorbChunks.push(chunk);
				if (xorbChunks.length >= MAX_XORB_CHUNKS) {
					yield { xorb: xorb.subarray(0, xorbOffset), hash: chunkModule.compute_xorb_hash(xorbChunks) };
					xorbOffset = 0;
					xorbChunks = [];
					xorb = new Uint8Array(XORB_SIZE);
				}
			}
		};

		while (true) {
			const { done, value } = await reader.read();
			if (done) {
				yield* addChunks(chunker.finish());
				if (xorbOffset > 0) {
					// Flush the last, partially filled xorb
					yield { xorb: xorb.subarray(0, xorbOffset), hash: chunkModule.compute_xorb_hash(xorbChunks) };
				}
				break;
			}
			sourceChunks.push(value);
			yield* addChunks(chunker.add_data(value));
		}
	} finally {
		chunker.free();
		// ^ is this really needed?
	}
}

// Chunk header layout (XET_CHUNK_HEADER_BYTES = 8, imported from ./XetBlob):
//
// interface ChunkHeader {
// 	version: number; // u8, 1 byte
// 	compressed_length: number; // 3 * u8, 3 bytes
// 	compression_scheme: XetChunkCompressionScheme; // u8, 1 byte
// 	uncompressed_length: number; // 3 * u8, 3 bytes
// }
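
// For illustration only: a sketch of reading a header back, mirroring the byte
// layout written by writeChunk below (little-endian 24-bit lengths). The name
// `parseChunkHeader` is hypothetical; this module never parses headers itself.
//
// function parseChunkHeader(xorb: Uint8Array, offset: number) {
// 	return {
// 		version: xorb[offset],
// 		compressedLength: xorb[offset + 1] | (xorb[offset + 2] << 8) | (xorb[offset + 3] << 16),
// 		compressionScheme: xorb[offset + 4] as XetChunkCompressionScheme,
// 		uncompressedLength: xorb[offset + 5] | (xorb[offset + 6] << 8) | (xorb[offset + 7] << 16),
// 	};
// }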

/**
 * Write a chunk header followed by the (possibly LZ4-compressed) chunk data to the xorb,
 * and return the offset at which to write the next chunk.
 *
 * A return value of 0 means there wasn't enough space left in the xorb.
 *
 * Todo: add bg4 compression maybe?
 */
function writeChunk(xorb: Uint8Array, offset: number, chunk: Uint8Array): number {
	const compressedChunk = lz4_compress(chunk);
	// Only use the compressed data when it actually saves space
	const chunkToWrite = compressedChunk.length < chunk.length ? compressedChunk : chunk;

	if (offset + XET_CHUNK_HEADER_BYTES + chunkToWrite.length > XORB_SIZE) {
		return 0;
	}

	// Header: version, compressed length (24-bit LE), compression scheme, uncompressed length (24-bit LE)
	xorb[offset] = 0; // version
	xorb[offset + 1] = chunkToWrite.length & 0xff;
	xorb[offset + 2] = (chunkToWrite.length >> 8) & 0xff;
	xorb[offset + 3] = (chunkToWrite.length >> 16) & 0xff;
	xorb[offset + 4] =
		chunkToWrite.length < chunk.length ? XetChunkCompressionScheme.LZ4 : XetChunkCompressionScheme.None;
	xorb[offset + 5] = chunk.length & 0xff;
	xorb[offset + 6] = (chunk.length >> 8) & 0xff;
	xorb[offset + 7] = (chunk.length >> 16) & 0xff;

	xorb.set(chunkToWrite, offset + XET_CHUNK_HEADER_BYTES);
	return offset + XET_CHUNK_HEADER_BYTES + chunkToWrite.length;
}
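
// Example usage (sketch): stream a Blob into xorbs and hand each finished xorb
// off for upload. `uploadXorb` is a hypothetical helper, not part of this module.
//
// for await (const { xorb, hash } of createXorbs(file)) {
// 	await uploadXorb(hash, xorb);
// }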