Skip to content
Merged
19 changes: 19 additions & 0 deletions packages/hub/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,25 @@ for await (const progressEvent of await hub.uploadFilesWithProgress({
console.log(progressEvent);
}

// Edit a file by adding prefix & suffix
await hub.commit({
  repo,
  accessToken: "hf_...",
  operations: [{
    operation: "edit",
    path: "myfile.bin",
    originalContent: originalFile,
    edits: [{
      start: 0,
      end: 0,
      content: new Blob(["prefix"])
    }, {
      start: originalFile.size,
      end: originalFile.size,
      content: new Blob(["suffix"])
    }]
  }]
})

await hub.deleteFile({repo, accessToken: "hf_...", path: "myfile.bin"});

await (await hub.downloadFile({ repo, path: "README.md" })).text();
Expand Down
112 changes: 94 additions & 18 deletions packages/hub/scripts/bench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@ import { join } from "node:path";
import { writeFile, readFile, stat, mkdir } from "node:fs/promises";
import type { RepoId } from "../src/types/public.js";
import { toRepoId } from "../src/utils/toRepoId.js";
import { commitIter } from "../src/index.js";
import type { CommitOperation } from "../src/index.js";
import { commitIter, downloadFile } from "../src/index.js";
import { SplicedBlob } from "../src/utils/SplicedBlob.js";
import { pathToFileURL } from "node:url";
import { WebBlob } from "../src/utils/WebBlob.js";

/**
* This script downloads the files from openai-community/gpt2 and simulates an upload to a xet repo.
Expand Down Expand Up @@ -38,6 +39,23 @@ const FILES_TO_DOWNLOAD = [
},
];

// Text spliced over the start of the edited benchmark file.
const EDITED_FILE_PREFIX =
	"Adding a new prefix to this TFLite file. Will xet still be efficient in deduplicating the file?";

/**
 * Files that get spliced edits applied before upload, to exercise the "edit" commit path.
 * The sha256 is the expected hash of the file *after* the edit is applied.
 */
const FILES_TO_EDIT = [
	{
		url: "https://huggingface.co/openai-community/gpt2/resolve/main/64-8bits.tflite?download=true",
		filename: "64-8bits.tflite.edited",
		sha256: "c2b116ccc9e5362d55dd60b344a4b93156594feeef312b5b8833151f0732aa0a",
		// Replace the first 1000 bytes of the original with the text prefix above.
		edits: [{ start: 0, end: 1000, content: new Blob([EDITED_FILE_PREFIX]) }],
	},
];

async function downloadFileIfNotExists(url: string, filepath: string): Promise<void> {
try {
await stat(filepath);
Expand All @@ -58,13 +76,25 @@ async function downloadFileIfNotExists(url: string, filepath: string): Promise<v
console.log(`Downloaded ${filepath} (${buffer.byteLength} bytes)`);
}

async function* createFileSource(
files: Array<{ filepath: string; filename: string }>
): AsyncGenerator<{ content: Blob; path: string; sha256: string }> {
async function* createFileSource(files: Array<{ filepath: string; filename: string }>): AsyncGenerator<{
content: Blob;
path: string;
sha256: string;
edits?: Array<{ start: number; end: number; content: Blob }>;
}> {
for (const file of files) {
console.log(`Processing ${file.filename}...`);
const buffer = await readFile(file.filepath);
const blob = new Blob([buffer]);
let blob = new Blob([buffer]);

if (file.filename.endsWith(".edited")) {
const edits = FILES_TO_EDIT.find((f) => f.filename === file.filename)?.edits;
if (edits !== undefined) {
for (const edit of edits) {
blob = SplicedBlob.create(blob, [{ insert: edit.content, start: edit.start, end: edit.end }]);
}
}
}

// Calculate sha256
console.log(`Calculating SHA256 for ${file.filename}...`);
Expand All @@ -77,12 +107,11 @@ async function* createFileSource(

console.log(`SHA256 for ${file.filename}: ${sha256Hash}`);

if (sha256Hash !== FILES_TO_DOWNLOAD.find((f) => f.filename === file.filename)?.sha256) {
throw new Error(
`SHA256 mismatch for ${file.filename}: ${sha256Hash} !== ${FILES_TO_DOWNLOAD.find(
(f) => f.filename === file.filename
)?.sha256}`
);
const sha256ToCheck =
FILES_TO_DOWNLOAD.find((f) => f.filename === file.filename)?.sha256 ||
FILES_TO_EDIT.find((f) => f.filename === file.filename)?.sha256;
if (sha256Hash !== sha256ToCheck) {
throw new Error(`SHA256 mismatch for ${file.filename}: ${sha256Hash} !== ${sha256ToCheck}`);
}

yield {
Expand Down Expand Up @@ -215,6 +244,12 @@ async function main() {
files.push({ filepath, filename: fileInfo.filename });
}

for (const fileInfo of FILES_TO_EDIT) {
const filepath = join(downloadDir, fileInfo.filename);
await downloadFileIfNotExists(fileInfo.url, filepath);
files.push({ filepath, filename: fileInfo.filename });
}

// Parse repo
const repoName = args.repo;

Expand Down Expand Up @@ -302,13 +337,25 @@ async function main() {

if (args.commit) {
console.log("\n=== Committing files ===");
const operations: CommitOperation[] = [];
for (const fileInfo of FILES_TO_DOWNLOAD) {
operations.push({
operation: "addOrUpdate",
content: pathToFileURL(join(downloadDir, fileInfo.filename)),
path: fileInfo.filename,
});
}
for (const fileInfo of FILES_TO_EDIT) {
operations.push({
operation: "edit",
originalContent: new Blob([await readFile(join(downloadDir, fileInfo.filename))]),
edits: fileInfo.edits,
path: fileInfo.filename,
});
}
const iterator = commitIter({
repo,
operations: files.map((file) => ({
operation: "addOrUpdate",
content: pathToFileURL(file.filepath),
path: file.filename,
})),
operations,
accessToken: args.token,
title: "Upload xet files with JS lib",
useXet: true,
Expand All @@ -325,7 +372,16 @@ async function main() {

console.log("Redownloading files and verifying SHA256 integrity");
for (const file of FILES_TO_DOWNLOAD) {
const fileBlob = await WebBlob.create(new URL(file.url));
const fileBlob = await downloadFile({
repo,
path: file.filename,
accessToken: args.token,
});

if (!fileBlob) {
throw new Error(`Failed to download ${file.filename}`);
}

const sha256Hash = sha256(fileBlob, { useWebWorker: false });
let res: IteratorResult<number, string>;
do {
Expand All @@ -335,6 +391,26 @@ async function main() {

console.log(`${file.filename}: ${finalHash} === ${file.sha256} ${finalHash === file.sha256 ? "✅" : "❌"}`);
}

for (const file of FILES_TO_EDIT) {
const fileBlob = await downloadFile({
repo,
path: file.filename,
accessToken: args.token,
});

if (!fileBlob) {
throw new Error(`Failed to download ${file.filename}`);
}

const sha256Hash = sha256(fileBlob, { useWebWorker: false });
let res: IteratorResult<number, string>;
do {
res = await sha256Hash.next();
} while (!res.done);
const finalHash = res.value;
console.log(`${file.filename}: ${finalHash} === ${file.sha256} ${finalHash === file.sha256 ? "✅" : "❌"}`);
}
}
}

Expand Down
79 changes: 75 additions & 4 deletions packages/hub/src/lib/commit.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import { createBlobs } from "../utils/createBlobs";
import { uploadShards } from "../utils/uploadShards";
import { splitAsyncGenerator } from "../utils/splitAsyncGenerator";
import { mergeAsyncGenerators } from "../utils/mergeAsyncGenerators";
import { SplicedBlob } from "../utils/SplicedBlob";

const CONCURRENT_SHAS = 5;
const CONCURRENT_LFS_UPLOADS = 5;
Expand All @@ -44,6 +45,36 @@ export interface CommitFile {
// forceLfs?: boolean
}

/**
 * Commit operation that edits an existing file by splicing new content into it.
 *
 * Optimized when only the beginning or the end of the file is replaced
 *
 * todo: handle other cases
 */
export interface CommitEditFile {
	operation: "edit";
	/** Path of the file to edit inside the repo */
	path: string;
	/** Later, will be ContentSource. For now simpler to just handle blobs */
	originalContent: Blob;
	/**
	 * Edits to apply to originalContent.
	 *
	 * NOTE(review): presumably edits must be sorted by position and non-overlapping —
	 * confirm against SplicedBlob.create, which consumes them directly.
	 */
	edits: Array<{
		/**
		 * Later, will be ContentSource. For now simpler to just handle blobs
		 *
		 * originalContent from [start, end) will be replaced by this
		 */
		content: Blob;
		/**
		 * The start position of the edit in the original content
		 */
		start: number;
		/**
		 * The end position of the edit in the original content
		 *
		 * originalContent from [start, end) will be replaced by the edit
		 */
		end: number;
	}>;
}

type CommitBlob = Omit<CommitFile, "content"> & { content: Blob };

// TODO: find a nice way to handle LFS & non-LFS files in an uniform manner, see https://github.com/huggingface/moon-landing/issues/4370
Expand All @@ -54,7 +85,7 @@ type CommitBlob = Omit<CommitFile, "content"> & { content: Blob };
// content?: ContentSource;
// };

export type CommitOperation = CommitDeletedEntry | CommitFile /* | CommitRenameFile */;
export type CommitOperation = CommitDeletedEntry | CommitFile | CommitEditFile /* | CommitRenameFile */;
type CommitBlobOperation = Exclude<CommitOperation, CommitFile> | CommitBlob;

export type CommitParams = {
Expand Down Expand Up @@ -91,9 +122,6 @@ export type CommitParams = {
fetch?: typeof fetch;
abortSignal?: AbortSignal;
// Credentials are optional due to custom fetch functions or cookie auth
/**
* @deprecated Not yet ready for production use
*/
useXet?: boolean;
} & Partial<CredentialsParams>;

Expand Down Expand Up @@ -138,6 +166,25 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr
const repoId = toRepoId(params.repo);
yield { event: "phase", phase: "preuploading" };

let useXet = params.useXet;
if (useXet) {
const info = await (params.fetch ?? fetch)(
`${params.hubUrl ?? HUB_URL}/api/${repoId.type}s/${repoId.name}?expand[]=xetEnabled`,
{
headers: {
...(accessToken && { Authorization: `Bearer ${accessToken}` }),
},
}
);

if (!info.ok) {
throw await createApiError(info);
}

const data = await info.json();
useXet = !!data.xetEnabled;
}

const lfsShas = new Map<string, string | null>();

const abortController = new AbortController();
Expand All @@ -160,6 +207,23 @@ export async function* commitIter(params: CommitParams): AsyncGenerator<CommitPr
const allOperations = (
await Promise.all(
params.operations.map(async (operation) => {
if (operation.operation === "edit" && !useXet) {
throw new Error("Edit operation is not supported when Xet is disabled");
}

if (operation.operation === "edit") {
// Convert EditFile operation to a file operation with SplicedBlob
const splicedBlob = SplicedBlob.create(
operation.originalContent,
operation.edits.map((splice) => ({ insert: splice.content, start: splice.start, end: splice.end }))
);
return {
operation: "addOrUpdate" as const,
path: operation.path,
content: splicedBlob,
};
}

if (operation.operation !== "addOrUpdate") {
return operation;
}
Expand Down Expand Up @@ -678,6 +742,13 @@ async function convertOperationToNdJson(operation: CommitBlobOperation): Promise
},
};
}
case "edit": {
// Note: By the time we get here, splice operations should have been converted to addOrUpdate operations with SplicedBlob
// But we handle this case for completeness
throw new Error(
"Edit operations should be converted to addOrUpdate operations before reaching convertOperationToNdJson"
);
}
default:
throw new TypeError("Unknown operation: " + (operation as { operation: string }).operation);
}
Expand Down
1 change: 0 additions & 1 deletion packages/hub/src/lib/dataset-info.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ export async function datasetInfo<
{
headers: {
...(accessToken ? { Authorization: `Bearer ${accessToken}` } : {}),
Accepts: "application/json",
},
}
);
Expand Down
1 change: 0 additions & 1 deletion packages/hub/src/lib/model-info.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ export async function modelInfo<
{
headers: {
...(accessToken ? { Authorization: `Bearer ${accessToken}` } : {}),
Accepts: "application/json",
},
}
);
Expand Down
1 change: 0 additions & 1 deletion packages/hub/src/lib/space-info.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ export async function spaceInfo<
{
headers: {
...(accessToken ? { Authorization: `Bearer ${accessToken}` } : {}),
Accepts: "application/json",
},
}
);
Expand Down
3 changes: 0 additions & 3 deletions packages/hub/src/lib/upload-file.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@ export function uploadFile(
fetch?: CommitParams["fetch"];
useWebWorkers?: CommitParams["useWebWorkers"];
abortSignal?: CommitParams["abortSignal"];
/**
* @deprecated Not yet ready for production use
*/
useXet?: CommitParams["useXet"];
} & Partial<CredentialsParams>
): Promise<CommitOutput> {
Expand Down
3 changes: 0 additions & 3 deletions packages/hub/src/lib/upload-files-with-progress.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,6 @@ export async function* uploadFilesWithProgress(
parentCommit?: CommitParams["parentCommit"];
abortSignal?: CommitParams["abortSignal"];
maxFolderDepth?: CommitParams["maxFolderDepth"];
/**
* @deprecated Not yet ready for production use
*/
useXet?: CommitParams["useXet"];
/**
* Set this to true in order to have progress events for hashing
Expand Down
3 changes: 0 additions & 3 deletions packages/hub/src/lib/upload-files.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,6 @@ export function uploadFiles(
useWebWorkers?: CommitParams["useWebWorkers"];
maxFolderDepth?: CommitParams["maxFolderDepth"];
abortSignal?: CommitParams["abortSignal"];
/**
* @deprecated Not yet ready for production use
*/
useXet?: CommitParams["useXet"];
} & Partial<CredentialsParams>
): Promise<CommitOutput> {
Expand Down
Loading
Loading