@@ -6,9 +6,10 @@ import { join } from "node:path";
66import { writeFile , readFile , stat , mkdir } from "node:fs/promises" ;
77import type { RepoId } from "../src/types/public.js" ;
88import { toRepoId } from "../src/utils/toRepoId.js" ;
9- import { commitIter } from "../src/index.js" ;
9+ import type { CommitOperation } from "../src/index.js" ;
10+ import { commitIter , downloadFile } from "../src/index.js" ;
11+ import { SplicedBlob } from "../src/utils/SplicedBlob.js" ;
1012import { pathToFileURL } from "node:url" ;
11- import { WebBlob } from "../src/utils/WebBlob.js" ;
1213
1314/**
1415 * This script downloads the files from openai-community/gpt2 and simulates an upload to a xet repo.
@@ -38,6 +39,23 @@ const FILES_TO_DOWNLOAD = [
3839 } ,
3940] ;
4041
42+ const FILES_TO_EDIT = [
43+ {
44+ url : "https://huggingface.co/openai-community/gpt2/resolve/main/64-8bits.tflite?download=true" ,
45+ filename : "64-8bits.tflite.edited" ,
46+ sha256 : "c2b116ccc9e5362d55dd60b344a4b93156594feeef312b5b8833151f0732aa0a" ,
47+ edits : [
48+ {
49+ start : 0 ,
50+ end : 1000 ,
51+ content : new Blob ( [
52+ "Adding a new prefix to this TFLite file. Will xet still be efficient in deduplicating the file?" ,
53+ ] ) ,
54+ } ,
55+ ] ,
56+ } ,
57+ ] ;
58+
4159async function downloadFileIfNotExists ( url : string , filepath : string ) : Promise < void > {
4260 try {
4361 await stat ( filepath ) ;
@@ -58,13 +76,25 @@ async function downloadFileIfNotExists(url: string, filepath: string): Promise<v
5876 console . log ( `Downloaded ${ filepath } (${ buffer . byteLength } bytes)` ) ;
5977}
6078
61- async function * createFileSource (
62- files : Array < { filepath : string ; filename : string } >
63- ) : AsyncGenerator < { content : Blob ; path : string ; sha256 : string } > {
79+ async function * createFileSource ( files : Array < { filepath : string ; filename : string } > ) : AsyncGenerator < {
80+ content : Blob ;
81+ path : string ;
82+ sha256 : string ;
83+ edits ?: Array < { start : number ; end : number ; content : Blob } > ;
84+ } > {
6485 for ( const file of files ) {
6586 console . log ( `Processing ${ file . filename } ...` ) ;
6687 const buffer = await readFile ( file . filepath ) ;
67- const blob = new Blob ( [ buffer ] ) ;
88+ let blob = new Blob ( [ buffer ] ) ;
89+
90+ if ( file . filename . endsWith ( ".edited" ) ) {
91+ const edits = FILES_TO_EDIT . find ( ( f ) => f . filename === file . filename ) ?. edits ;
92+ if ( edits !== undefined ) {
93+ for ( const edit of edits ) {
94+ blob = SplicedBlob . create ( blob , [ { insert : edit . content , start : edit . start , end : edit . end } ] ) ;
95+ }
96+ }
97+ }
6898
6999 // Calculate sha256
70100 console . log ( `Calculating SHA256 for ${ file . filename } ...` ) ;
@@ -77,12 +107,11 @@ async function* createFileSource(
77107
78108 console . log ( `SHA256 for ${ file . filename } : ${ sha256Hash } ` ) ;
79109
80- if ( sha256Hash !== FILES_TO_DOWNLOAD . find ( ( f ) => f . filename === file . filename ) ?. sha256 ) {
81- throw new Error (
82- `SHA256 mismatch for ${ file . filename } : ${ sha256Hash } !== ${ FILES_TO_DOWNLOAD . find (
83- ( f ) => f . filename === file . filename
84- ) ?. sha256 } `
85- ) ;
110+ const sha256ToCheck =
111+ FILES_TO_DOWNLOAD . find ( ( f ) => f . filename === file . filename ) ?. sha256 ||
112+ FILES_TO_EDIT . find ( ( f ) => f . filename === file . filename ) ?. sha256 ;
113+ if ( sha256Hash !== sha256ToCheck ) {
114+ throw new Error ( `SHA256 mismatch for ${ file . filename } : ${ sha256Hash } !== ${ sha256ToCheck } ` ) ;
86115 }
87116
88117 yield {
@@ -215,6 +244,12 @@ async function main() {
215244 files . push ( { filepath, filename : fileInfo . filename } ) ;
216245 }
217246
247+ for ( const fileInfo of FILES_TO_EDIT ) {
248+ const filepath = join ( downloadDir , fileInfo . filename ) ;
249+ await downloadFileIfNotExists ( fileInfo . url , filepath ) ;
250+ files . push ( { filepath, filename : fileInfo . filename } ) ;
251+ }
252+
218253 // Parse repo
219254 const repoName = args . repo ;
220255
@@ -302,13 +337,25 @@ async function main() {
302337
303338 if ( args . commit ) {
304339 console . log ( "\n=== Committing files ===" ) ;
340+ const operations : CommitOperation [ ] = [ ] ;
341+ for ( const fileInfo of FILES_TO_DOWNLOAD ) {
342+ operations . push ( {
343+ operation : "addOrUpdate" ,
344+ content : pathToFileURL ( join ( downloadDir , fileInfo . filename ) ) ,
345+ path : fileInfo . filename ,
346+ } ) ;
347+ }
348+ for ( const fileInfo of FILES_TO_EDIT ) {
349+ operations . push ( {
350+ operation : "edit" ,
351+ originalContent : new Blob ( [ await readFile ( join ( downloadDir , fileInfo . filename ) ) ] ) ,
352+ edits : fileInfo . edits ,
353+ path : fileInfo . filename ,
354+ } ) ;
355+ }
305356 const iterator = commitIter ( {
306357 repo,
307- operations : files . map ( ( file ) => ( {
308- operation : "addOrUpdate" ,
309- content : pathToFileURL ( file . filepath ) ,
310- path : file . filename ,
311- } ) ) ,
358+ operations,
312359 accessToken : args . token ,
313360 title : "Upload xet files with JS lib" ,
314361 useXet : true ,
@@ -325,7 +372,16 @@ async function main() {
325372
326373 console . log ( "Redownloading files and verifying SHA256 integrity" ) ;
327374 for ( const file of FILES_TO_DOWNLOAD ) {
328- const fileBlob = await WebBlob . create ( new URL ( file . url ) ) ;
375+ const fileBlob = await downloadFile ( {
376+ repo,
377+ path : file . filename ,
378+ accessToken : args . token ,
379+ } ) ;
380+
381+ if ( ! fileBlob ) {
382+ throw new Error ( `Failed to download ${ file . filename } ` ) ;
383+ }
384+
329385 const sha256Hash = sha256 ( fileBlob , { useWebWorker : false } ) ;
330386 let res : IteratorResult < number , string > ;
331387 do {
@@ -335,6 +391,26 @@ async function main() {
335391
336392 console . log ( `${ file . filename } : ${ finalHash } === ${ file . sha256 } ${ finalHash === file . sha256 ? "✅" : "❌" } ` ) ;
337393 }
394+
395+ for ( const file of FILES_TO_EDIT ) {
396+ const fileBlob = await downloadFile ( {
397+ repo,
398+ path : file . filename ,
399+ accessToken : args . token ,
400+ } ) ;
401+
402+ if ( ! fileBlob ) {
403+ throw new Error ( `Failed to download ${ file . filename } ` ) ;
404+ }
405+
406+ const sha256Hash = sha256 ( fileBlob , { useWebWorker : false } ) ;
407+ let res : IteratorResult < number , string > ;
408+ do {
409+ res = await sha256Hash . next ( ) ;
410+ } while ( ! res . done ) ;
411+ const finalHash = res . value ;
412+ console . log ( `${ file . filename } : ${ finalHash } === ${ file . sha256 } ${ finalHash === file . sha256 ? "✅" : "❌" } ` ) ;
413+ }
338414 }
339415}
340416
0 commit comments