| 
 | 1 | +import { uploadShards } from "../src/utils/uploadShards.js";  | 
 | 2 | +import { sha256 } from "../src/utils/sha256.js";  | 
 | 3 | +import { parseArgs } from "node:util";  | 
 | 4 | +import { tmpdir } from "node:os";  | 
 | 5 | +import { join } from "node:path";  | 
 | 6 | +import { writeFile, readFile, stat, mkdir } from "node:fs/promises";  | 
 | 7 | +import type { RepoId } from "../src/types/public.js";  | 
 | 8 | +import { toRepoId } from "../src/utils/toRepoId.js";  | 
 | 9 | +import { commitIter } from "../src/index.js";  | 
 | 10 | +import { pathToFileURL } from "node:url";  | 
 | 11 | + | 
 | 12 | +/**  | 
 | 13 | + * This script downloads the files from openai-community/gpt2 and simulates an upload to a xet repo.  | 
 | 14 | + * It prints the dedup % and the statistics  | 
 | 15 | + *  | 
 | 16 | + * Usage:  | 
 | 17 | + *  | 
 | 18 | + * pnpm --filter hub bench -t <write token> -r <xet repo>  | 
 | 19 | + * pnpm --filter hub bench -t <write token> -r <xet repo> --commit # Actually upload files  | 
 | 20 | + */  | 
 | 21 | + | 
 | 22 | +const FILES_TO_DOWNLOAD = [  | 
 | 23 | +	{  | 
 | 24 | +		url: "https://huggingface.co/openai-community/gpt2/resolve/main/64-8bits.tflite?download=true",  | 
 | 25 | +		filename: "64-8bits.tflite",  | 
 | 26 | +	},  | 
 | 27 | +	{  | 
 | 28 | +		url: "https://huggingface.co/openai-community/gpt2/resolve/main/64-fp16.tflite?download=true",  | 
 | 29 | +		filename: "64-fp16.tflite",  | 
 | 30 | +	},  | 
 | 31 | +];  | 
 | 32 | + | 
 | 33 | +async function downloadFileIfNotExists(url: string, filepath: string): Promise<void> {  | 
 | 34 | +	try {  | 
 | 35 | +		await stat(filepath);  | 
 | 36 | +		console.log(`File ${filepath} already exists, skipping download`);  | 
 | 37 | +		return;  | 
 | 38 | +	} catch {  | 
 | 39 | +		// File doesn't exist, proceed with download  | 
 | 40 | +	}  | 
 | 41 | + | 
 | 42 | +	console.log(`Downloading ${url} to ${filepath}...`);  | 
 | 43 | +	const response = await fetch(url);  | 
 | 44 | +	if (!response.ok) {  | 
 | 45 | +		throw new Error(`Failed to download ${url}: ${response.status} ${response.statusText}`);  | 
 | 46 | +	}  | 
 | 47 | + | 
 | 48 | +	const buffer = await response.arrayBuffer();  | 
 | 49 | +	await writeFile(filepath, new Uint8Array(buffer));  | 
 | 50 | +	console.log(`Downloaded ${filepath} (${buffer.byteLength} bytes)`);  | 
 | 51 | +}  | 
 | 52 | + | 
 | 53 | +async function* createFileSource(  | 
 | 54 | +	files: Array<{ filepath: string; filename: string }>  | 
 | 55 | +): AsyncGenerator<{ content: Blob; path: string; sha256: string }> {  | 
 | 56 | +	for (const file of files) {  | 
 | 57 | +		console.log(`Processing ${file.filename}...`);  | 
 | 58 | +		const buffer = await readFile(file.filepath);  | 
 | 59 | +		const blob = new Blob([buffer]);  | 
 | 60 | + | 
 | 61 | +		// Calculate sha256  | 
 | 62 | +		console.log(`Calculating SHA256 for ${file.filename}...`);  | 
 | 63 | +		const sha256Iterator = sha256(blob, { useWebWorker: false });  | 
 | 64 | +		let res: IteratorResult<number, string>;  | 
 | 65 | +		do {  | 
 | 66 | +			res = await sha256Iterator.next();  | 
 | 67 | +		} while (!res.done);  | 
 | 68 | +		const sha256Hash = res.value;  | 
 | 69 | + | 
 | 70 | +		console.log(`SHA256 for ${file.filename}: ${sha256Hash}`);  | 
 | 71 | +		yield {  | 
 | 72 | +			content: blob,  | 
 | 73 | +			path: file.filename,  | 
 | 74 | +			sha256: sha256Hash,  | 
 | 75 | +		};  | 
 | 76 | +	}  | 
 | 77 | +}  | 
 | 78 | + | 
 | 79 | +function getBodySize(body: RequestInit["body"]): string {  | 
 | 80 | +	if (!body) {  | 
 | 81 | +		return "no body";  | 
 | 82 | +	}  | 
 | 83 | +	if (body instanceof ArrayBuffer) {  | 
 | 84 | +		return body.byteLength.toString();  | 
 | 85 | +	}  | 
 | 86 | +	if (body instanceof Blob) {  | 
 | 87 | +		return "blob";  | 
 | 88 | +	}  | 
 | 89 | +	if (body instanceof Uint8Array) {  | 
 | 90 | +		return body.byteLength.toString();  | 
 | 91 | +	}  | 
 | 92 | +	return "unknown size";  | 
 | 93 | +}  | 
 | 94 | + | 
 | 95 | +function createMockFetch(): {  | 
 | 96 | +	fetch: typeof fetch;  | 
 | 97 | +	getStats: () => { xorbCount: number; shardCount: number; xorbBytes: number; shardBytes: number };  | 
 | 98 | +} {  | 
 | 99 | +	let xorbCount = 0;  | 
 | 100 | +	let shardCount = 0;  | 
 | 101 | +	let xorbBytes = 0;  | 
 | 102 | +	let shardBytes = 0;  | 
 | 103 | + | 
 | 104 | +	const mockFetch = async function (input: string | URL | Request, init?: RequestInit): Promise<Response> {  | 
 | 105 | +		const url = typeof input === "string" ? input : input.toString();  | 
 | 106 | + | 
 | 107 | +		// Mock successful responses for xorb and shard uploads  | 
 | 108 | +		if (url.includes("/xorb/")) {  | 
 | 109 | +			xorbCount++;  | 
 | 110 | +			const bodySize = getBodySize(init?.body);  | 
 | 111 | +			xorbBytes += parseInt(bodySize);  | 
 | 112 | +			console.log(`[MOCK] Xorb upload ${xorbCount}: ${init?.method || "GET"} ${url} (${bodySize})`);  | 
 | 113 | + | 
 | 114 | +			return new Response(null, {  | 
 | 115 | +				status: 200,  | 
 | 116 | +				statusText: "OK",  | 
 | 117 | +			});  | 
 | 118 | +		}  | 
 | 119 | + | 
 | 120 | +		if (url.endsWith("/shard")) {  | 
 | 121 | +			shardCount++;  | 
 | 122 | +			const bodySize = getBodySize(init?.body);  | 
 | 123 | +			shardBytes += parseInt(bodySize);  | 
 | 124 | +			console.log(`[MOCK] Shard upload ${shardCount}: ${init?.method || "GET"} ${url} (${bodySize})`);  | 
 | 125 | + | 
 | 126 | +			return new Response(null, {  | 
 | 127 | +				status: 200,  | 
 | 128 | +				statusText: "OK",  | 
 | 129 | +			});  | 
 | 130 | +		}  | 
 | 131 | + | 
 | 132 | +		// For other requests, use real fetch  | 
 | 133 | +		return fetch(input, init).then((res) => {  | 
 | 134 | +			console.log(`[real] ${res.status} ${res.statusText} ${url} ${res.headers.get("content-length")}`);  | 
 | 135 | +			return res;  | 
 | 136 | +		});  | 
 | 137 | +	};  | 
 | 138 | + | 
 | 139 | +	return {  | 
 | 140 | +		fetch: mockFetch,  | 
 | 141 | +		getStats: () => ({ xorbCount, shardCount, xorbBytes, shardBytes }),  | 
 | 142 | +	};  | 
 | 143 | +}  | 
 | 144 | + | 
 | 145 | +async function main() {  | 
 | 146 | +	const { values: args } = parseArgs({  | 
 | 147 | +		options: {  | 
 | 148 | +			token: {  | 
 | 149 | +				type: "string",  | 
 | 150 | +				short: "t",  | 
 | 151 | +			},  | 
 | 152 | +			repo: {  | 
 | 153 | +				type: "string",  | 
 | 154 | +				short: "r",  | 
 | 155 | +			},  | 
 | 156 | +			commit: {  | 
 | 157 | +				type: "boolean",  | 
 | 158 | +				short: "c",  | 
 | 159 | +				default: false,  | 
 | 160 | +			},  | 
 | 161 | +		},  | 
 | 162 | +	});  | 
 | 163 | + | 
 | 164 | +	if (!args.token || !args.repo) {  | 
 | 165 | +		console.error("Usage: pnpm --filter hub bench -t <write token> -r <xet repo>");  | 
 | 166 | +		console.error("Example: pnpm --filter hub bench -t hf_... -r myuser/myrepo");  | 
 | 167 | +		process.exit(1);  | 
 | 168 | +	}  | 
 | 169 | + | 
 | 170 | +	// Setup temp directory  | 
 | 171 | +	const tempDir = tmpdir();  | 
 | 172 | +	const downloadDir = join(tempDir, "hf-bench-downloads");  | 
 | 173 | + | 
 | 174 | +	// Ensure download directory exists  | 
 | 175 | +	await mkdir(downloadDir, { recursive: true });  | 
 | 176 | + | 
 | 177 | +	// Download files  | 
 | 178 | +	const files: Array<{ filepath: string; filename: string }> = [];  | 
 | 179 | + | 
 | 180 | +	for (const fileInfo of FILES_TO_DOWNLOAD) {  | 
 | 181 | +		const filepath = join(downloadDir, fileInfo.filename);  | 
 | 182 | +		await downloadFileIfNotExists(fileInfo.url, filepath);  | 
 | 183 | +		files.push({ filepath, filename: fileInfo.filename });  | 
 | 184 | +	}  | 
 | 185 | + | 
 | 186 | +	// Parse repo  | 
 | 187 | +	const repoName = args.repo;  | 
 | 188 | + | 
 | 189 | +	const repo: RepoId = toRepoId(repoName);  | 
 | 190 | + | 
 | 191 | +	// Create mock fetch  | 
 | 192 | +	const mockFetchObj = createMockFetch();  | 
 | 193 | + | 
 | 194 | +	// Setup upload parameters  | 
 | 195 | +	const uploadParams = {  | 
 | 196 | +		accessToken: args.token,  | 
 | 197 | +		hubUrl: "https://huggingface.co",  | 
 | 198 | +		customFetch: mockFetchObj.fetch,  | 
 | 199 | +		repo,  | 
 | 200 | +		rev: "main",  | 
 | 201 | +	};  | 
 | 202 | + | 
 | 203 | +	// Track statistics  | 
 | 204 | +	const stats: Array<{  | 
 | 205 | +		filename: string;  | 
 | 206 | +		size: number;  | 
 | 207 | +		dedupRatio: number;  | 
 | 208 | +	}> = [];  | 
 | 209 | + | 
 | 210 | +	console.log("\n=== Starting upload simulation ===");  | 
 | 211 | + | 
 | 212 | +	// Process files through uploadShards  | 
 | 213 | +	const fileSource = createFileSource(files);  | 
 | 214 | + | 
 | 215 | +	for await (const event of uploadShards(fileSource, uploadParams)) {  | 
 | 216 | +		switch (event.event) {  | 
 | 217 | +			case "file": {  | 
 | 218 | +				console.log(`\n📁 Processed file: ${event.path}`);  | 
 | 219 | +				console.log(`   SHA256: ${event.sha256}`);  | 
 | 220 | +				console.log(`   Dedup ratio: ${(event.dedupRatio * 100).toFixed(2)}%`);  | 
 | 221 | + | 
 | 222 | +				// Find the file size  | 
 | 223 | +				const file = files.find((f) => f.filename === event.path);  | 
 | 224 | +				if (file) {  | 
 | 225 | +					const fileStats = await stat(file.filepath);  | 
 | 226 | + | 
 | 227 | +					stats.push({  | 
 | 228 | +						filename: event.path,  | 
 | 229 | +						size: fileStats.size,  | 
 | 230 | +						dedupRatio: event.dedupRatio,  | 
 | 231 | +					});  | 
 | 232 | +				}  | 
 | 233 | +				break;  | 
 | 234 | +			}  | 
 | 235 | + | 
 | 236 | +			case "fileProgress": {  | 
 | 237 | +				const progress = (event.progress * 100).toFixed(1);  | 
 | 238 | +				console.log(`   📈 Progress for ${event.path}: ${progress}%`);  | 
 | 239 | +				break;  | 
 | 240 | +			}  | 
 | 241 | +		}  | 
 | 242 | +	}  | 
 | 243 | + | 
 | 244 | +	// Get actual upload counts from the mock fetch  | 
 | 245 | +	const uploadStats = mockFetchObj.getStats();  | 
 | 246 | +	console.log(`\n📊 Actual upload counts: ${uploadStats.xorbCount} xorbs, ${uploadStats.shardCount} shards`);  | 
 | 247 | + | 
 | 248 | +	// Output final statistics  | 
 | 249 | +	console.log("\n=== BENCHMARK RESULTS ===");  | 
 | 250 | +	console.log("File Statistics:");  | 
 | 251 | +	console.log("================");  | 
 | 252 | + | 
 | 253 | +	for (const stat of stats) {  | 
 | 254 | +		console.log(`\n📄 ${stat.filename}:`);  | 
 | 255 | +		console.log(`   Size: ${(stat.size / 1024 / 1024).toFixed(2)} MB`);  | 
 | 256 | +		console.log(`   Deduplication: ${(stat.dedupRatio * 100).toFixed(2)}%`);  | 
 | 257 | +	}  | 
 | 258 | + | 
 | 259 | +	console.log("\n=== SUMMARY ===");  | 
 | 260 | +	const totalSize = stats.reduce((sum, s) => sum + s.size, 0);  | 
 | 261 | +	const avgDedup = stats.reduce((sum, s) => sum + s.dedupRatio, 0) / stats.length;  | 
 | 262 | + | 
 | 263 | +	console.log(`Total files: ${stats.length}`);  | 
 | 264 | +	console.log(`Total size: ${(totalSize / 1024 / 1024).toFixed(2)} MB`);  | 
 | 265 | +	console.log(`Total xorbs: ${uploadStats.xorbCount}`);  | 
 | 266 | +	console.log(`Total shards: ${uploadStats.shardCount}`);  | 
 | 267 | +	console.log(`Total xorb bytes: ${uploadStats.xorbBytes.toLocaleString("fr")} bytes`);  | 
 | 268 | +	console.log(`Total shard bytes: ${uploadStats.shardBytes.toLocaleString("fr")} bytes`);  | 
 | 269 | +	console.log(`Average deduplication: ${(avgDedup * 100).toFixed(2)}%`);  | 
 | 270 | + | 
 | 271 | +	if (args.commit) {  | 
 | 272 | +		console.log("\n=== Committing files ===");  | 
 | 273 | +		const iterator = commitIter({  | 
 | 274 | +			repo,  | 
 | 275 | +			operations: files.map((file) => ({  | 
 | 276 | +				operation: "addOrUpdate",  | 
 | 277 | +				content: pathToFileURL(file.filepath),  | 
 | 278 | +				path: file.filename,  | 
 | 279 | +			})),  | 
 | 280 | +			accessToken: args.token,  | 
 | 281 | +			title: "Upload xet files with JS lib",  | 
 | 282 | +			xet: true,  | 
 | 283 | +		});  | 
 | 284 | +		for await (const event of iterator) {  | 
 | 285 | +			if (event.event === "fileProgress" && event.state === "hashing") {  | 
 | 286 | +				// We don't care about the hashing progress  | 
 | 287 | +			} else {  | 
 | 288 | +				console.log(event);  | 
 | 289 | +			}  | 
 | 290 | +		}  | 
 | 291 | + | 
 | 292 | +		console.log("Done committing");  | 
 | 293 | +	}  | 
 | 294 | +}  | 
 | 295 | + | 
 | 296 | +main().catch((error) => {  | 
 | 297 | +	console.error("Error:", error);  | 
 | 298 | +	process.exit(1);  | 
 | 299 | +});  | 