
Commit 352d386

Also handle byte-grouping + LZ4 when compressing data for xorbs
1 parent ffaf2d2 commit 352d386
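
The change makes the xorb write path try BG4 byte grouping before LZ4: a chunk's bytes are rearranged so that every 4th byte ends up in the same contiguous group (four "byte planes"), the grouped buffer is LZ4-compressed, and the chunk header records which scheme won so the read path knows whether to undo the grouping. The usual motivation is that fixed-width numeric data (e.g. float32 tensors) tends to compress better when bytes of the same significance sit next to each other. A minimal sketch of the round trip, reusing the exported helpers and the same values as the spec below (illustration only, not part of the commit):

import { bg4_split_bytes, bg4_regroup_bytes } from "./XetBlob";

// [1, 2, 3, 4, 5, 6, 7, 8] is rearranged into four byte planes: [1, 5 | 2, 6 | 3, 7 | 4, 8]
const grouped = bg4_split_bytes(new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8]));
// bg4_regroup_bytes inverts the transform, restoring [1, 2, 3, 4, 5, 6, 7, 8]
const restored = bg4_regroup_bytes(grouped);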

File tree

3 files changed: +93 −10 lines changed


packages/hub/src/utils/XetBlob.spec.ts

Lines changed: 47 additions & 5 deletions
@@ -1,6 +1,6 @@
 import { describe, expect, it } from "vitest";
 import type { ReconstructionInfo } from "./XetBlob";
-import { bg4_regoup_bytes, XetBlob } from "./XetBlob";
+import { bg4_regroup_bytes, bg4_split_bytes, XetBlob } from "./XetBlob";
 import { sum } from "./sum";
 
 describe("XetBlob", () => {
@@ -173,30 +173,72 @@ describe("XetBlob", () => {
 
 	describe("bg4_regoup_bytes", () => {
 		it("should regroup bytes when the array is %4 length", () => {
-			expect(bg4_regoup_bytes(new Uint8Array([1, 5, 2, 6, 3, 7, 4, 8]))).toEqual(
+			expect(bg4_regroup_bytes(new Uint8Array([1, 5, 2, 6, 3, 7, 4, 8]))).toEqual(
 				new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8])
 			);
 		});
 
 		it("should regroup bytes when the array is %4 + 1 length", () => {
-			expect(bg4_regoup_bytes(new Uint8Array([1, 5, 9, 2, 6, 3, 7, 4, 8]))).toEqual(
+			expect(bg4_regroup_bytes(new Uint8Array([1, 5, 9, 2, 6, 3, 7, 4, 8]))).toEqual(
 				new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9])
 			);
 		});
 
 		it("should regroup bytes when the array is %4 + 2 length", () => {
-			expect(bg4_regoup_bytes(new Uint8Array([1, 5, 9, 2, 6, 10, 3, 7, 4, 8]))).toEqual(
+			expect(bg4_regroup_bytes(new Uint8Array([1, 5, 9, 2, 6, 10, 3, 7, 4, 8]))).toEqual(
 				new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
 			);
 		});
 
 		it("should regroup bytes when the array is %4 + 3 length", () => {
-			expect(bg4_regoup_bytes(new Uint8Array([1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8]))).toEqual(
+			expect(bg4_regroup_bytes(new Uint8Array([1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8]))).toEqual(
 				new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
 			);
 		});
 	});
 
+	describe("bg4_split_bytes", () => {
+		it("should split bytes when the array is %4 length", () => {
+			expect(bg4_split_bytes(new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8]))).toEqual(
+				new Uint8Array([1, 5, 2, 6, 3, 7, 4, 8])
+			);
+		});
+
+		it("should split bytes when the array is %4 + 1 length", () => {
+			expect(bg4_split_bytes(new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9]))).toEqual(
+				new Uint8Array([1, 5, 9, 2, 6, 3, 7, 4, 8])
+			);
+		});
+
+		it("should split bytes when the array is %4 + 2 length", () => {
+			expect(bg4_split_bytes(new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))).toEqual(
+				new Uint8Array([1, 5, 9, 2, 6, 10, 3, 7, 4, 8])
+			);
+		});
+
+		it("should split bytes when the array is %4 + 3 length", () => {
+			expect(bg4_split_bytes(new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]))).toEqual(
+				new Uint8Array([1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8])
+			);
+		});
+
+		it("should be the inverse of bg4_regroup_bytes", () => {
+			const testArrays = [
+				new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8]),
+				new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
+				new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
+				new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
+				new Uint8Array([42]),
+				new Uint8Array([1, 2]),
+				new Uint8Array([1, 2, 3]),
+			];
+
+			testArrays.forEach((arr) => {
+				expect(bg4_regroup_bytes(bg4_split_bytes(arr))).toEqual(arr);
+			});
+		});
+	});
+
 	describe("when mocked", () => {
 		describe("loading many chunks every read", () => {
 			it("should load different slices", async () => {

packages/hub/src/utils/XetBlob.ts

Lines changed: 36 additions & 2 deletions
@@ -376,7 +376,7 @@ export class XetBlob extends Blob {
 	chunkHeader.compression_scheme === XetChunkCompressionScheme.LZ4
 		? lz4_decompress(result.value.slice(0, chunkHeader.compressed_length), chunkHeader.uncompressed_length)
 		: chunkHeader.compression_scheme === XetChunkCompressionScheme.ByteGroupingLZ4
-			? bg4_regoup_bytes(
+			? bg4_regroup_bytes(
 				lz4_decompress(
 					result.value.slice(0, chunkHeader.compressed_length),
 					chunkHeader.uncompressed_length
@@ -529,7 +529,7 @@ function cacheKey(params: { refreshUrl: string; initialAccessToken: string | und
 }
 
 // exported for testing purposes
-export function bg4_regoup_bytes(bytes: Uint8Array): Uint8Array {
+export function bg4_regroup_bytes(bytes: Uint8Array): Uint8Array {
 	// python code
 
 	// split = len(x) // 4
@@ -590,6 +590,40 @@ export function bg4_regoup_bytes(bytes: Uint8Array): Uint8Array {
 	// }
 }
 
+export function bg4_split_bytes(bytes: Uint8Array): Uint8Array {
+	// This function does the opposite of bg4_regroup_bytes
+	// It takes interleaved bytes and groups them by 4
+
+	const ret = new Uint8Array(bytes.byteLength);
+	const split = Math.floor(bytes.byteLength / 4);
+	const rem = bytes.byteLength % 4;
+
+	// Calculate group positions in the output array
+	const g1_pos = split + (rem >= 1 ? 1 : 0);
+	const g2_pos = g1_pos + split + (rem >= 2 ? 1 : 0);
+	const g3_pos = g2_pos + split + (rem == 3 ? 1 : 0);
+
+	// Extract every 4th byte starting from position 0, 1, 2, 3
+	// and place them in their respective groups
+	for (let i = 0, j = 0; i < bytes.byteLength; i += 4, j++) {
+		ret[j] = bytes[i];
+	}
+
+	for (let i = 1, j = g1_pos; i < bytes.byteLength; i += 4, j++) {
+		ret[j] = bytes[i];
+	}
+
+	for (let i = 2, j = g2_pos; i < bytes.byteLength; i += 4, j++) {
+		ret[j] = bytes[i];
+	}
+
+	for (let i = 3, j = g3_pos; i < bytes.byteLength; i += 4, j++) {
+		ret[j] = bytes[i];
+	}
+
+	return ret;
+}
+
 async function getAccessToken(
 	initialAccessToken: string | undefined,
 	customFetch: typeof fetch,
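
The group boundaries computed in bg4_split_bytes mirror what bg4_regroup_bytes already expects: each group holds split bytes, and the first rem groups get one extra byte. A worked example for the "%4 + 3 length" spec case above (illustration only, not part of the commit):

import { bg4_split_bytes } from "./XetBlob";

// 11 bytes: split = Math.floor(11 / 4) = 2, rem = 3
// g1_pos = 2 + 1 = 3, g2_pos = 3 + 2 + 1 = 6, g3_pos = 6 + 2 + 1 = 9
// so the four groups are [1, 5, 9 | 2, 6, 10 | 3, 7, 11 | 4, 8]
console.log(bg4_split_bytes(new Uint8Array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])));
// Uint8Array [1, 5, 9, 2, 6, 10, 3, 7, 11, 4, 8]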

packages/hub/src/utils/createXorbs.ts

Lines changed: 10 additions & 3 deletions
@@ -4,7 +4,7 @@
  * Todo: byte grouping?
  */
 
-import { XET_CHUNK_HEADER_BYTES, XetChunkCompressionScheme } from "./XetBlob";
+import { bg4_split_bytes, XET_CHUNK_HEADER_BYTES, XetChunkCompressionScheme } from "./XetBlob";
 import { compress as lz4_compress } from "../vendor/lz4js";
 
 const TARGET_CHUNK_SIZE = 64 * 1024;
@@ -99,7 +99,10 @@ export async function* createXorbs(
  * Todo: add bg4 compression maybe?
  */
 function writeChunk(xorb: Uint8Array, offset: number, chunk: Uint8Array): number {
-	const compressedChunk = lz4_compress(chunk);
+	const regularCompressedChunk = lz4_compress(chunk);
+	const bgCompressedChunk = lz4_compress(bg4_split_bytes(chunk));
+	const compressedChunk =
+		regularCompressedChunk.length < bgCompressedChunk.length ? regularCompressedChunk : bgCompressedChunk;
 	const chunkToWrite = compressedChunk.length < chunk.length ? compressedChunk : chunk;
 
 	if (offset + XET_CHUNK_HEADER_BYTES + chunkToWrite.length > XORB_SIZE) {
@@ -111,7 +114,11 @@ function writeChunk(xorb: Uint8Array, offset: number, chunk: Uint8Array): number
 	xorb[offset + 2] = (chunkToWrite.length >> 8) & 0xff;
 	xorb[offset + 3] = (chunkToWrite.length >> 16) & 0xff;
 	xorb[offset + 4] =
-		chunkToWrite.length < chunk.length ? XetChunkCompressionScheme.LZ4 : XetChunkCompressionScheme.None;
+		chunkToWrite.length < chunk.length
+			? bgCompressedChunk.length < chunk.length
+				? XetChunkCompressionScheme.ByteGroupingLZ4
+				: XetChunkCompressionScheme.LZ4
+			: XetChunkCompressionScheme.None;
 	xorb[offset + 5] = chunk.length & 0xff;
 	xorb[offset + 6] = (chunk.length >> 8) & 0xff;
 	xorb[offset + 7] = (chunk.length >> 16) & 0xff;
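
With this change, writeChunk produces two candidates for every chunk, plain LZ4 and BG4 + LZ4, keeps the smaller of the two, and still falls back to storing the raw bytes when neither compressed form saves space. The scheme written at xorb[offset + 4] is what tells the XetBlob read path whether to call bg4_regroup_bytes after lz4_decompress, so it has to match the buffer that was actually written. A standalone sketch of that selection, using a hypothetical pickChunkEncoding helper rather than the repo's writeChunk:

import { compress as lz4_compress } from "../vendor/lz4js";
import { bg4_split_bytes, XetChunkCompressionScheme } from "./XetBlob";

// Hypothetical helper: pick the smallest encoding for a chunk and the scheme
// byte that goes with it, so the decoder knows whether to regroup after LZ4.
function pickChunkEncoding(chunk: Uint8Array): { data: Uint8Array; scheme: XetChunkCompressionScheme } {
	const lz4 = lz4_compress(chunk);
	const bg4Lz4 = lz4_compress(bg4_split_bytes(chunk));

	if (bg4Lz4.length < lz4.length && bg4Lz4.length < chunk.length) {
		return { data: bg4Lz4, scheme: XetChunkCompressionScheme.ByteGroupingLZ4 };
	}
	if (lz4.length < chunk.length) {
		return { data: lz4, scheme: XetChunkCompressionScheme.LZ4 };
	}
	// Neither compressed form is smaller: store the chunk uncompressed.
	return { data: chunk, scheme: XetChunkCompressionScheme.None };
}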
