Skip to content

Commit 876c717

Browse files
authored
Xet upload: backtrack when dedup info is received (#1708)
Fix #1703

cc @assafvayner for viz, cc @Kakulukian too

## Note

We only backtrack to the end of the last file, and only within the current xorb. This means we may lose ~2MB on average at the end of a xorb, but only if the first 60MB of the xorb were filled with new data.

## Improvement

Running `pnpm --filter hub bench`:

```console
=== BENCHMARK RESULTS ===
File Statistics:
================
📄 64-8bits.tflite:
  Size: 119.36 MB
  Deduplication: 99.90%
📄 64-fp16.tflite:
  Size: 236.77 MB
  Deduplication: 100.00%

=== SUMMARY ===
Total files: 2
Total size: 356.13 MB
Total xorbs: 1
Total shards: 1
Total xorb bytes: 119 926 bytes
Total shard bytes: 1 400 bytes
Average deduplication: 99.95%
```

This bumps the second file from 83% to 100% dedup.
1 parent 57d7cf7 commit 876c717

File tree

4 files changed

+366
-129
lines changed

4 files changed

+366
-129
lines changed

packages/hub/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
11
xet-core-wasm-build
2+
shard.bin
3+
xorb.bin

packages/hub/scripts/bench.ts

Lines changed: 47 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import type { RepoId } from "../src/types/public.js";
88
import { toRepoId } from "../src/utils/toRepoId.js";
99
import { commitIter } from "../src/index.js";
1010
import { pathToFileURL } from "node:url";
11+
import { WebBlob } from "../src/utils/WebBlob.js";
1112

1213
/**
1314
* This script downloads the files from openai-community/gpt2 and simulates an upload to a xet repo.
@@ -23,10 +24,17 @@ const FILES_TO_DOWNLOAD = [
2324
{
2425
url: "https://huggingface.co/openai-community/gpt2/resolve/main/64-8bits.tflite?download=true",
2526
filename: "64-8bits.tflite",
27+
sha256: "c966da3b74697803352ca7c6f2f220e7090a557b619de9da0c6b34d89f7825c1",
2628
},
2729
{
2830
url: "https://huggingface.co/openai-community/gpt2/resolve/main/64-fp16.tflite?download=true",
2931
filename: "64-fp16.tflite",
32+
sha256: "1ceafd82e733dd4b21570b2a86cf27556a983041806c033a55d086e0ed782cd3",
33+
},
34+
{
35+
url: "https://huggingface.co/openai-community/gpt2/resolve/main/64.tflite?download=true",
36+
filename: "64.tflite",
37+
sha256: "cfcd510b239d90b71ee87d4e57a5a8c2d55b2a941e5d9fe5852298268ddbe61b",
3038
},
3139
];
3240

@@ -68,6 +76,15 @@ async function* createFileSource(
6876
const sha256Hash = res.value;
6977

7078
console.log(`SHA256 for ${file.filename}: ${sha256Hash}`);
79+
80+
if (sha256Hash !== FILES_TO_DOWNLOAD.find((f) => f.filename === file.filename)?.sha256) {
81+
throw new Error(
82+
`SHA256 mismatch for ${file.filename}: ${sha256Hash} !== ${FILES_TO_DOWNLOAD.find(
83+
(f) => f.filename === file.filename
84+
)?.sha256}`
85+
);
86+
}
87+
7188
yield {
7289
content: blob,
7390
path: file.filename,
@@ -92,7 +109,7 @@ function getBodySize(body: RequestInit["body"]): string {
92109
return "unknown size";
93110
}
94111

95-
function createMockFetch(): {
112+
function createMockFetch(args: { write: boolean }): {
96113
fetch: typeof fetch;
97114
getStats: () => { xorbCount: number; shardCount: number; xorbBytes: number; shardBytes: number };
98115
} {
@@ -111,6 +128,11 @@ function createMockFetch(): {
111128
xorbBytes += parseInt(bodySize);
112129
console.log(`[MOCK] Xorb upload ${xorbCount}: ${init?.method || "GET"} ${url} (${bodySize})`);
113130

131+
if (args.write) {
132+
// Write the body to a file
133+
await writeFile("xorb.bin", init?.body as Uint8Array);
134+
}
135+
114136
return new Response(null, {
115137
status: 200,
116138
statusText: "OK",
@@ -123,6 +145,11 @@ function createMockFetch(): {
123145
shardBytes += parseInt(bodySize);
124146
console.log(`[MOCK] Shard upload ${shardCount}: ${init?.method || "GET"} ${url} (${bodySize})`);
125147

148+
if (args.write) {
149+
// Write the body to a file
150+
await writeFile("shard.bin", init?.body as Uint8Array);
151+
}
152+
126153
return new Response(null, {
127154
status: 200,
128155
statusText: "OK",
@@ -158,6 +185,11 @@ async function main() {
158185
short: "c",
159186
default: false,
160187
},
188+
write: {
189+
type: "boolean",
190+
short: "w",
191+
default: false,
192+
},
161193
},
162194
});
163195

@@ -189,7 +221,7 @@ async function main() {
189221
const repo: RepoId = toRepoId(repoName);
190222

191223
// Create mock fetch
192-
const mockFetchObj = createMockFetch();
224+
const mockFetchObj = createMockFetch({ write: args.write });
193225

194226
// Setup upload parameters
195227
const uploadParams = {
@@ -290,6 +322,19 @@ async function main() {
290322
}
291323

292324
console.log("Done committing");
325+
326+
console.log("Redownloading files and verifying SHA256 integrity");
327+
for (const file of FILES_TO_DOWNLOAD) {
328+
const fileBlob = await WebBlob.create(new URL(file.url));
329+
const sha256Hash = sha256(fileBlob, { useWebWorker: false });
330+
let res: IteratorResult<number, string>;
331+
do {
332+
res = await sha256Hash.next();
333+
} while (!res.done);
334+
const finalHash = res.value;
335+
336+
console.log(`${file.filename}: ${finalHash} === ${file.sha256} ${finalHash === file.sha256 ? "✅" : "❌"}`);
337+
}
293338
}
294339
}
295340

packages/hub/src/utils/ChunkCache.ts

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,15 +43,18 @@ export class ChunkCache {
4343

4444
getChunk(
4545
hash: string,
46-
hmacFunction: (hash: string, key: string) => string
46+
/**
47+
* Set to null if you only want to check against locally created chunks, or if the hash is already an HMAC
48+
*/
49+
hmacFunction: ((hash: string, key: string) => string) | null
4750
):
4851
| {
4952
xorbIndex: number;
5053
chunkIndex: number;
5154
}
5255
| undefined {
5356
let index = this.map.get(hash);
54-
if (index === undefined) {
57+
if (index === undefined && hmacFunction !== null) {
5558
for (const hmac of this.hmacs) {
5659
index = this.map.get(hmacFunction(hash, hmac));
5760
if (index !== undefined) {
@@ -67,4 +70,8 @@ export class ChunkCache {
6770
chunkIndex: this.chunkIndices[index],
6871
};
6972
}
73+
74+
removeChunkFromCache(hash: string): void {
75+
this.map.delete(hash);
76+
}
7077
}

0 commit comments

Comments
 (0)