Skip to content

Commit bd3c428

Browse files
authored
xet: Commit operation to edit part of file (optimized for handling edits at beginning of file) (#1718)
Fix #1705 ```ts // Edit a file by adding prefix & suffix await commit({ repo, accessToken: "hf_...", operations: [{ type: "edit", originalContent: originalFile, edits: [{ start: 0, end: 0, content: new Blob(["prefix"]) }, { start: originalFile.length, end: originalFile.length, content: new Blob(["suffix"]) }] }] }) ``` ```ts // Edit first kB of file await commit({ repo, accessToken: "hf_...", operations: [{ type: "edit", originalContent: originalFile, edits: [{ start: 0, end: 1000, content: new Blob(["blablabla"]) }] }] }) ``` cc @mishig25 @assafvayner @jsulz also - fallback to LFS for non-xet repos (even if useXet is true) - remove invalid Accepts header ## How it works under the hood - we load dedup info for first chunk of original file content if it's changed - we upload the blob as normal ## Todo currently blob is being processed twice, once for sha256 and once for hashing. The file should be processed only once (maybe after #1704 - using workers for different processes)
1 parent e9a55c9 commit bd3c428

File tree

12 files changed

+1069
-76
lines changed

12 files changed

+1069
-76
lines changed

packages/hub/README.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,25 @@ for await (const progressEvent of await hub.uploadFilesWithProgress({
8282
console.log(progressEvent);
8383
}
8484

85+
// Edit a file by adding prefix & suffix
86+
await commit({
87+
repo,
88+
accessToken: "hf_...",
89+
operations: [{
90+
type: "edit",
91+
originalContent: originalFile,
92+
edits: [{
93+
start: 0,
94+
end: 0,
95+
content: new Blob(["prefix"])
96+
}, {
97+
start: originalFile.length,
98+
end: originalFile.length,
99+
content: new Blob(["suffix"])
100+
}]
101+
}]
102+
})
103+
85104
await hub.deleteFile({repo, accessToken: "hf_...", path: "myfile.bin"});
86105

87106
await (await hub.downloadFile({ repo, path: "README.md" })).text();

packages/hub/scripts/bench.ts

Lines changed: 94 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@ import { join } from "node:path";
66
import { writeFile, readFile, stat, mkdir } from "node:fs/promises";
77
import type { RepoId } from "../src/types/public.js";
88
import { toRepoId } from "../src/utils/toRepoId.js";
9-
import { commitIter } from "../src/index.js";
9+
import type { CommitOperation } from "../src/index.js";
10+
import { commitIter, downloadFile } from "../src/index.js";
11+
import { SplicedBlob } from "../src/utils/SplicedBlob.js";
1012
import { pathToFileURL } from "node:url";
11-
import { WebBlob } from "../src/utils/WebBlob.js";
1213

1314
/**
1415
* This script downloads the files from openai-community/gpt2 and simulates an upload to a xet repo.
@@ -38,6 +39,23 @@ const FILES_TO_DOWNLOAD = [
3839
},
3940
];
4041

42+
const FILES_TO_EDIT = [
43+
{
44+
url: "https://huggingface.co/openai-community/gpt2/resolve/main/64-8bits.tflite?download=true",
45+
filename: "64-8bits.tflite.edited",
46+
sha256: "c2b116ccc9e5362d55dd60b344a4b93156594feeef312b5b8833151f0732aa0a",
47+
edits: [
48+
{
49+
start: 0,
50+
end: 1000,
51+
content: new Blob([
52+
"Adding a new prefix to this TFLite file. Will xet still be efficient in deduplicating the file?",
53+
]),
54+
},
55+
],
56+
},
57+
];
58+
4159
async function downloadFileIfNotExists(url: string, filepath: string): Promise<void> {
4260
try {
4361
await stat(filepath);
@@ -58,13 +76,25 @@ async function downloadFileIfNotExists(url: string, filepath: string): Promise<v
5876
console.log(`Downloaded ${filepath} (${buffer.byteLength} bytes)`);
5977
}
6078

61-
async function* createFileSource(
62-
files: Array<{ filepath: string; filename: string }>
63-
): AsyncGenerator<{ content: Blob; path: string; sha256: string }> {
79+
async function* createFileSource(files: Array<{ filepath: string; filename: string }>): AsyncGenerator<{
80+
content: Blob;
81+
path: string;
82+
sha256: string;
83+
edits?: Array<{ start: number; end: number; content: Blob }>;
84+
}> {
6485
for (const file of files) {
6586
console.log(`Processing ${file.filename}...`);
6687
const buffer = await readFile(file.filepath);
67-
const blob = new Blob([buffer]);
88+
let blob = new Blob([buffer]);
89+
90+
if (file.filename.endsWith(".edited")) {
91+
const edits = FILES_TO_EDIT.find((f) => f.filename === file.filename)?.edits;
92+
if (edits !== undefined) {
93+
for (const edit of edits) {
94+
blob = SplicedBlob.create(blob, [{ insert: edit.content, start: edit.start, end: edit.end }]);
95+
}
96+
}
97+
}
6898

6999
// Calculate sha256
70100
console.log(`Calculating SHA256 for ${file.filename}...`);
@@ -77,12 +107,11 @@ async function* createFileSource(
77107

78108
console.log(`SHA256 for ${file.filename}: ${sha256Hash}`);
79109

80-
if (sha256Hash !== FILES_TO_DOWNLOAD.find((f) => f.filename === file.filename)?.sha256) {
81-
throw new Error(
82-
`SHA256 mismatch for ${file.filename}: ${sha256Hash} !== ${FILES_TO_DOWNLOAD.find(
83-
(f) => f.filename === file.filename
84-
)?.sha256}`
85-
);
110+
const sha256ToCheck =
111+
FILES_TO_DOWNLOAD.find((f) => f.filename === file.filename)?.sha256 ||
112+
FILES_TO_EDIT.find((f) => f.filename === file.filename)?.sha256;
113+
if (sha256Hash !== sha256ToCheck) {
114+
throw new Error(`SHA256 mismatch for ${file.filename}: ${sha256Hash} !== ${sha256ToCheck}`);
86115
}
87116

88117
yield {
@@ -215,6 +244,12 @@ async function main() {
215244
files.push({ filepath, filename: fileInfo.filename });
216245
}
217246

247+
for (const fileInfo of FILES_TO_EDIT) {
248+
const filepath = join(downloadDir, fileInfo.filename);
249+
await downloadFileIfNotExists(fileInfo.url, filepath);
250+
files.push({ filepath, filename: fileInfo.filename });
251+
}
252+
218253
// Parse repo
219254
const repoName = args.repo;
220255

@@ -302,13 +337,25 @@ async function main() {
302337

303338
if (args.commit) {
304339
console.log("\n=== Committing files ===");
340+
const operations: CommitOperation[] = [];
341+
for (const fileInfo of FILES_TO_DOWNLOAD) {
342+
operations.push({
343+
operation: "addOrUpdate",
344+
content: pathToFileURL(join(downloadDir, fileInfo.filename)),
345+
path: fileInfo.filename,
346+
});
347+
}
348+
for (const fileInfo of FILES_TO_EDIT) {
349+
operations.push({
350+
operation: "edit",
351+
originalContent: new Blob([await readFile(join(downloadDir, fileInfo.filename))]),
352+
edits: fileInfo.edits,
353+
path: fileInfo.filename,
354+
});
355+
}
305356
const iterator = commitIter({
306357
repo,
307-
operations: files.map((file) => ({
308-
operation: "addOrUpdate",
309-
content: pathToFileURL(file.filepath),
310-
path: file.filename,
311-
})),
358+
operations,
312359
accessToken: args.token,
313360
title: "Upload xet files with JS lib",
314361
useXet: true,
@@ -325,7 +372,16 @@ async function main() {
325372

326373
console.log("Redownloading files and verifying SHA256 integrity");
327374
for (const file of FILES_TO_DOWNLOAD) {
328-
const fileBlob = await WebBlob.create(new URL(file.url));
375+
const fileBlob = await downloadFile({
376+
repo,
377+
path: file.filename,
378+
accessToken: args.token,
379+
});
380+
381+
if (!fileBlob) {
382+
throw new Error(`Failed to download ${file.filename}`);
383+
}
384+
329385
const sha256Hash = sha256(fileBlob, { useWebWorker: false });
330386
let res: IteratorResult<number, string>;
331387
do {
@@ -335,6 +391,26 @@ async function main() {
335391

336392
console.log(`${file.filename}: ${finalHash} === ${file.sha256} ${finalHash === file.sha256 ? "✅" : "❌"}`);
337393
}
394+
395+
for (const file of FILES_TO_EDIT) {
396+
const fileBlob = await downloadFile({
397+
repo,
398+
path: file.filename,
399+
accessToken: args.token,
400+
});
401+
402+
if (!fileBlob) {
403+
throw new Error(`Failed to download ${file.filename}`);
404+
}
405+
406+
const sha256Hash = sha256(fileBlob, { useWebWorker: false });
407+
let res: IteratorResult<number, string>;
408+
do {
409+
res = await sha256Hash.next();
410+
} while (!res.done);
411+
const finalHash = res.value;
412+
console.log(`${file.filename}: ${finalHash} === ${file.sha256} ${finalHash === file.sha256 ? "✅" : "❌"}`);
413+
}
338414
}
339415
}
340416

packages/hub/src/lib/commit.ts

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import { createBlobs } from "../utils/createBlobs";
2525
import { uploadShards } from "../utils/uploadShards";
2626
import { splitAsyncGenerator } from "../utils/splitAsyncGenerator";
2727
import { mergeAsyncGenerators } from "../utils/mergeAsyncGenerators";
28+
import { SplicedBlob } from "../utils/SplicedBlob";
2829

2930
const CONCURRENT_SHAS = 5;
3031
const CONCURRENT_LFS_UPLOADS = 5;
@@ -44,6 +45,36 @@ export interface CommitFile {
4445
// forceLfs?: boolean
4546
}
4647

48+
/**
49+
* Optimized when only the beginning or the end of the file is replaced
50+
*
51+
* todo: handle other cases
52+
*/
53+
export interface CommitEditFile {
54+
operation: "edit";
55+
path: string;
56+
/** Later, will be ContentSource. For now simpler to just handle blobs */
57+
originalContent: Blob;
58+
edits: Array<{
59+
/**
60+
* Later, will be ContentSource. For now simpler to just handle blobs
61+
*
62+
* originalContent from [start, end) will be replaced by this
63+
*/
64+
content: Blob;
65+
/**
66+
* The start position of the edit in the original content
67+
*/
68+
start: number;
69+
/**
70+
* The end position of the edit in the original content
71+
*
72+
* originalContent from [start, end) will be replaced by the edit
73+
*/
74+
end: number;
75+
}>;
76+
}
77+
4778
type CommitBlob = Omit<CommitFile, "content"> & { content: Blob };
4879

4980
// TODO: find a nice way to handle LFS & non-LFS files in an uniform manner, see https://github.com/huggingface/moon-landing/issues/4370
@@ -54,7 +85,7 @@ type CommitBlob = Omit<CommitFile, "content"> & { content: Blob };
5485
// content?: ContentSource;
5586
// };
5687

57-
export type CommitOperation = CommitDeletedEntry | CommitFile /* | CommitRenameFile */;
88+
export type CommitOperation = CommitDeletedEntry | CommitFile | CommitEditFile /* | CommitRenameFile */;
5889
type CommitBlobOperation = Exclude<CommitOperation, CommitFile> | CommitBlob;
5990

6091
export type CommitParams = {
@@ -91,9 +122,6 @@ export type CommitParams = {
91122
fetch?: typeof fetch;
92123
abortSignal?: AbortSignal;
93124
// Credentials are optional due to custom fetch functions or cookie auth
94-
/**
95-
* @deprecated Not yet ready for production use
96-
*/
97125
useXet?: boolean;
98126
} & Partial<CredentialsParams>;
99127

@@ -138,6 +166,25 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr
138166
const repoId = toRepoId(params.repo);
139167
yield { event: "phase", phase: "preuploading" };
140168

169+
let useXet = params.useXet;
170+
if (useXet) {
171+
const info = await (params.fetch ?? fetch)(
172+
`${params.hubUrl ?? HUB_URL}/api/${repoId.type}s/${repoId.name}?expand[]=xetEnabled`,
173+
{
174+
headers: {
175+
...(accessToken && { Authorization: `Bearer ${accessToken}` }),
176+
},
177+
}
178+
);
179+
180+
if (!info.ok) {
181+
throw await createApiError(info);
182+
}
183+
184+
const data = await info.json();
185+
useXet = !!data.xetEnabled;
186+
}
187+
141188
const lfsShas = new Map<string, string | null>();
142189

143190
const abortController = new AbortController();
@@ -160,6 +207,23 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr
160207
const allOperations = (
161208
await Promise.all(
162209
params.operations.map(async (operation) => {
210+
if (operation.operation === "edit" && !useXet) {
211+
throw new Error("Edit operation is not supported when Xet is disabled");
212+
}
213+
214+
if (operation.operation === "edit") {
215+
// Convert EditFile operation to a file operation with SplicedBlob
216+
const splicedBlob = SplicedBlob.create(
217+
operation.originalContent,
218+
operation.edits.map((splice) => ({ insert: splice.content, start: splice.start, end: splice.end }))
219+
);
220+
return {
221+
operation: "addOrUpdate" as const,
222+
path: operation.path,
223+
content: splicedBlob,
224+
};
225+
}
226+
163227
if (operation.operation !== "addOrUpdate") {
164228
return operation;
165229
}
@@ -678,6 +742,13 @@ async function convertOperationToNdJson(operation: CommitBlobOperation): Promise
678742
},
679743
};
680744
}
745+
case "edit": {
746+
// Note: By the time we get here, edit operations should have been converted to addOrUpdate operations with SplicedBlob
747+
// But we handle this case for completeness
748+
throw new Error(
749+
"Edit operations should be converted to addOrUpdate operations before reaching convertOperationToNdJson"
750+
);
751+
}
681752
default:
682753
throw new TypeError("Unknown operation: " + (operation as { operation: string }).operation);
683754
}

packages/hub/src/lib/dataset-info.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ export async function datasetInfo<
3737
{
3838
headers: {
3939
...(accessToken ? { Authorization: `Bearer ${accessToken}` } : {}),
40-
Accepts: "application/json",
4140
},
4241
}
4342
);

packages/hub/src/lib/model-info.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ export async function modelInfo<
3838
{
3939
headers: {
4040
...(accessToken ? { Authorization: `Bearer ${accessToken}` } : {}),
41-
Accepts: "application/json",
4241
},
4342
}
4443
);

packages/hub/src/lib/space-info.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ export async function spaceInfo<
3838
{
3939
headers: {
4040
...(accessToken ? { Authorization: `Bearer ${accessToken}` } : {}),
41-
Accepts: "application/json",
4241
},
4342
}
4443
);

packages/hub/src/lib/upload-file.ts

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,6 @@ export function uploadFile(
1515
fetch?: CommitParams["fetch"];
1616
useWebWorkers?: CommitParams["useWebWorkers"];
1717
abortSignal?: CommitParams["abortSignal"];
18-
/**
19-
* @deprecated Not yet ready for production use
20-
*/
2118
useXet?: CommitParams["useXet"];
2219
} & Partial<CredentialsParams>
2320
): Promise<CommitOutput> {

packages/hub/src/lib/upload-files-with-progress.ts

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,6 @@ export async function* uploadFilesWithProgress(
2929
parentCommit?: CommitParams["parentCommit"];
3030
abortSignal?: CommitParams["abortSignal"];
3131
maxFolderDepth?: CommitParams["maxFolderDepth"];
32-
/**
33-
* @deprecated Not yet ready for production use
34-
*/
3532
useXet?: CommitParams["useXet"];
3633
/**
3734
* Set this to true in order to have progress events for hashing

packages/hub/src/lib/upload-files.ts

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,6 @@ export function uploadFiles(
1616
useWebWorkers?: CommitParams["useWebWorkers"];
1717
maxFolderDepth?: CommitParams["maxFolderDepth"];
1818
abortSignal?: CommitParams["abortSignal"];
19-
/**
20-
* @deprecated Not yet ready for production use
21-
*/
2219
useXet?: CommitParams["useXet"];
2320
} & Partial<CredentialsParams>
2421
): Promise<CommitOutput> {

0 commit comments

Comments (0)