@@ -10,17 +10,22 @@ import { existsSync } from "node:fs";
1010
1111/**
1212 * This script debugs xet uploads by capturing all network data locally
13- * It takes a local file , repo, and token, then uploads while saving:
13+ * It takes one or more local files, repo, and token, then uploads while saving:
1414 * - Dedup shards as dedup_[chunk_hash]_shard.bin
1515 * - Uploaded xorbs as uploaded_xorb_1.bin, uploaded_xorb_2.bin, etc.
1616 * - Uploaded shards as uploaded_shard_1.bin, uploaded_shard_2.bin, etc.
1717 *
18- * Normal mode: Captures all upload data to upload_[filename]/ directory
18+ * Normal mode: Captures all upload data to upload_[filename]/ directory (single file) or multiple-files/ directory (multiple files)
1919 * Replay mode: Validates upload data matches previously captured local files
2020 *
2121 * Usage:
22+ * Single file:
2223 * pnpm --filter hub debug-xet -f <local_file> -t <write_token> -r <xet_repo>
2324 * pnpm --filter hub debug-xet -f <local_file> -t <write_token> -r <xet_repo> --replay
25+ *
26+ * Multiple files (comma-separated):
27+ * pnpm --filter hub debug-xet -f <file1,file2,file3> -t <write_token> -r <xet_repo>
28+ * pnpm --filter hub debug-xet -f <file1,file2,file3> -t <write_token> -r <xet_repo> --replay
2429 */
2530
2631interface DebugFetchStats {
@@ -182,32 +187,34 @@ function createDebugFetch(args: { debugDir: string; replay?: boolean }): {
182187 } ;
183188}
184189
/**
 * Lazily turns a list of local file paths into upload entries, one file at a time.
 *
 * For each path, in order: wraps the file with `FileBlob.create`, drives the
 * `sha256` iterator until it reports completion to obtain the digest, logs
 * progress to the console, and yields the entry. Being a generator, work on a
 * file only starts when the consumer requests the next entry.
 *
 * NOTE(review): `path` is the basename of the local path, so two inputs that
 * share a basename (e.g. `a/model.bin` and `b/model.bin`) yield entries with
 * the same `path` — confirm whether callers need to reject that case.
 *
 * @param filepaths - Local file paths to process, in order. Not modified.
 * @returns An async generator yielding `{ content, path, sha256 }` for each file.
 */
async function* createMultiFileSource(filepaths: readonly string[]): AsyncGenerator<{
  content: Blob;
  path: string;
  sha256: string;
}> {
  for (const filepath of filepaths) {
    const filename = basename(filepath);
    console.log(`Processing ${filename} ...`);

    const blob = await FileBlob.create(filepath);

    // Calculate sha256: intermediate results are discarded; only the value
    // delivered together with `done` is kept as the digest.
    console.log(`Calculating SHA256 for ${filename} ...`);
    const hasher = sha256(blob, { useWebWorker: false });
    let step: IteratorResult<number, string> = await hasher.next();
    while (!step.done) {
      step = await hasher.next();
    }
    const sha256Hash = step.value;

    console.log(`SHA256 for ${filename} : ${sha256Hash} `);

    yield {
      content: blob,
      path: filename,
      sha256: sha256Hash,
    };
  }
}
212219
213220async function main ( ) {
@@ -233,20 +240,27 @@ async function main() {
233240 } ) ;
234241
235242 if ( ! args . token || ! args . repo || ! args . file ) {
236- console . error ( "Usage: pnpm --filter hub debug-xet -f <local_file > -t <write_token> -r <xet_repo>" ) ;
243+ console . error ( "Usage: pnpm --filter hub debug-xet -f <file1,file2,file3 > -t <write_token> -r <xet_repo>" ) ;
237244 console . error ( "Example: pnpm --filter hub debug-xet -f ./model.bin -t hf_... -r myuser/myrepo" ) ;
245+ console . error ( "Example: pnpm --filter hub debug-xet -f ./model1.bin,./model2.bin -t hf_... -r myuser/myrepo" ) ;
238246 console . error ( "Options:" ) ;
239247 console . error ( " --replay Use local dedup info instead of remote" ) ;
240248 process . exit ( 1 ) ;
241249 }
242250
243- if ( ! existsSync ( args . file ) ) {
244- console . error ( `❌ File ${ args . file } does not exist` ) ;
245- process . exit ( 1 ) ;
251+ // Parse comma-separated file paths
252+ const filePaths = args . file . split ( "," ) . map ( ( f ) => f . trim ( ) ) ;
253+
254+ // Validate all files exist
255+ for ( const filePath of filePaths ) {
256+ if ( ! existsSync ( filePath ) ) {
257+ console . error ( `❌ File ${ filePath } does not exist` ) ;
258+ process . exit ( 1 ) ;
259+ }
246260 }
247261
248- const filename = basename ( args . file ) ;
249- const debugDir = `upload_${ filename } ` ;
262+ // Determine debug directory name
263+ const debugDir = filePaths . length > 1 ? "multiple-files" : `upload_${ basename ( filePaths [ 0 ] ) } ` ;
250264
251265 // Handle debug directory based on mode
252266 if ( args . replay ) {
@@ -288,20 +302,30 @@ async function main() {
288302 rev : "main" ,
289303 } ;
290304
291- console . log ( `\n=== Starting debug upload for ${ filename } ===` ) ;
305+ console . log (
306+ `\n=== Starting debug upload for ${ filePaths . length > 1 ? `${ filePaths . length } files` : basename ( filePaths [ 0 ] ) } ===`
307+ ) ;
292308 if ( args . replay ) {
293309 console . log ( "🔄 Replay mode: Using local dedup info when available" ) ;
294310 }
295311
296- // Get file stats
297- const fileStats = await stat ( args . file ) ;
298- console . log ( `📄 File size: ${ ( fileStats . size / 1024 / 1024 ) . toFixed ( 2 ) } MB` ) ;
312+ // Get total file stats
313+ let totalSize = 0 ;
314+ for ( const filePath of filePaths ) {
315+ const fileStats = await stat ( filePath ) ;
316+ totalSize += fileStats . size ;
317+ console . log ( `📄 ${ basename ( filePath ) } : ${ ( fileStats . size / 1_000_000 ) . toFixed ( 2 ) } MB` ) ;
318+ }
319+ console . log ( `📊 Total size: ${ ( totalSize / 1_000_000 ) . toFixed ( 2 ) } MB` ) ;
299320
300- // Process file through uploadShards
301- const fileSource = createFileSource ( args . file ) ;
321+ // Process files through uploadShards
322+ const fileSource = createMultiFileSource ( filePaths ) ;
302323
303- let dedupRatio = 0 ;
304- let fileSha256 = "" ;
324+ const processedFiles : Array < {
325+ path : string ;
326+ sha256 : string ;
327+ dedupRatio : number ;
328+ } > = [ ] ;
305329
306330 for await ( const event of uploadShards ( fileSource , uploadParams ) ) {
307331 switch ( event . event ) {
@@ -310,8 +334,11 @@ async function main() {
310334 console . log ( ` SHA256: ${ event . sha256 } ` ) ;
311335 console . log ( ` Dedup ratio: ${ ( event . dedupRatio * 100 ) . toFixed ( 2 ) } %` ) ;
312336
313- dedupRatio = event . dedupRatio ;
314- fileSha256 = event . sha256 ;
337+ processedFiles . push ( {
338+ path : event . path ,
339+ sha256 : event . sha256 ,
340+ dedupRatio : event . dedupRatio ,
341+ } ) ;
315342 break ;
316343 }
317344
@@ -327,9 +354,21 @@ async function main() {
327354
328355 console . log ( "\n=== DEBUG UPLOAD RESULTS ===" ) ;
329356 console . log ( `📁 Debug directory: ${ debugDir } ` ) ;
330- console . log ( `📄 Original file: ${ filename } (${ ( fileStats . size / 1024 / 1024 ) . toFixed ( 2 ) } MB)` ) ;
331- console . log ( `🔒 SHA256: ${ fileSha256 } ` ) ;
332- console . log ( `📊 Deduplication: ${ ( dedupRatio * 100 ) . toFixed ( 2 ) } %` ) ;
357+ console . log ( `📄 Processed files: ${ processedFiles . length } ` ) ;
358+ console . log ( `📊 Total size: ${ ( totalSize / 1024 / 1024 ) . toFixed ( 2 ) } MB` ) ;
359+
360+ // Show details for each file
361+ for ( const file of processedFiles ) {
362+ console . log ( `\n🔒 ${ file . path } :` ) ;
363+ console . log ( ` SHA256: ${ file . sha256 } ` ) ;
364+ console . log ( ` Deduplication: ${ ( file . dedupRatio * 100 ) . toFixed ( 2 ) } %` ) ;
365+ }
366+
367+ // Calculate average dedup ratio
368+ const avgDedupRatio =
369+ processedFiles . length > 0 ? processedFiles . reduce ( ( sum , f ) => sum + f . dedupRatio , 0 ) / processedFiles . length : 0 ;
370+
371+ console . log ( `\n📊 Average deduplication: ${ ( avgDedupRatio * 100 ) . toFixed ( 2 ) } %` ) ;
333372 console . log ( `📤 Network calls:` ) ;
334373 console . log ( ` - ${ stats . xorbCount } xorb uploads` ) ;
335374 console . log ( ` - ${ stats . shardCount } shard uploads` ) ;
0 commit comments