@@ -62,7 +62,7 @@ def repack_h5(input_filename: str, output_filename: str):
 def upload_folder(folder: str, repo_id: str):
     api = HfApi()
     api.upload_large_folder(
-        repo_id=repo_id, folder_path=folder, repo_type="dataset", private=True
+        repo_id=repo_id, folder_path=folder, repo_type="dataset", private=False
     )
 
 
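The hunk header above references `repack_h5(input_filename, output_filename)`, whose body is outside this diff. As background, here is a minimal sketch of what a cloud-oriented repack step can look like, assuming the `h5repack` utility shipped with HDF5 is on the PATH and following the paged-aggregation strategy described in the HDF Group article linked in the docstring below; the helper name and the 8 MiB page size are illustrative choices, not the repository's actual implementation.

```python
import pathlib
import subprocess


def repack_h5_paged(input_filename: str, output_filename: str, page_size: int = 8 * 1024 * 1024) -> None:
    """Rewrite an HDF5 file with paged file-space aggregation for efficient remote reads.

    Delegates to the `h5repack` CLI: `-S PAGE` selects the paged file-space
    strategy and `-G` sets the page size in bytes.
    """
    pathlib.Path(output_filename).parent.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        [
            "h5repack",
            "-S", "PAGE",
            "-G", str(page_size),
            str(input_filename),
            str(output_filename),
        ],
        check=True,
    )
```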
@@ -82,7 +82,21 @@ def process_file(
     output_directory: pathlib.Path,
     dataset_tag: str,
     dataset_name: str,
+    hdf5_repack: bool = True,
 ):
+    """Copy or process original files into a directory prior to uploading to the HF hub.
+    Args:
+        root_directory: The directory containing the original files to process or copy.
+            All existing files will be considered.
+        file_path: File to be processed.
+        output_directory: Directory where the copied or processed files are written.
+        dataset_tag: HF dataset tag to add to the ReadMe header.
+        dataset_name: Dataset name to add to the HF dataset card.
+        hdf5_repack: Whether to repack HDF5 files for cloud-optimized access.
+            See https://www.hdfgroup.org/2024/01/08/strategies-and-software-to-optimize-hdf5-netcdf-4-files-for-the-cloud/
+            for more details.
+
+    """
     in_dir_file_path = file_path.relative_to(root_directory)
     # Skip irrelevant files
     if not is_file_valid(file_path):
@@ -97,10 +111,10 @@ def process_file(
         logger.debug(f"Convert ReadMe {file_path}")
         edit_readme(file_path, target_filename, dataset_tag, dataset_name)
     # Process HDF5
-    elif file_path.suffix in [".hdf", ".h5", ".hdf5"]:
+    elif file_path.suffix in [".hdf", ".h5", ".hdf5"] and hdf5_repack:
         logger.debug(f"Repack HDF5 {file_path}")
         repack_h5(file_path, target_filename)
-    # Simply copy remaining files
+    # Simply link remaining files as symbolic links
     else:
         logger.debug(f"Link file {file_path}")
         target_filename.symlink_to(file_path)
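The new `hdf5_repack` keyword lets callers bypass repacking per invocation: HDF5 files then fall through to the symlink branch like any other file. A hypothetical direct call (placeholder paths, assuming `process_file` is imported from this script):

```python
import pathlib

# Placeholder paths for illustration; with hdf5_repack=False the .h5 file is
# symlinked into the staging directory instead of being rewritten by repack_h5.
process_file(
    root_directory=pathlib.Path("raw_dataset"),
    file_path=pathlib.Path("raw_dataset/run_001.h5"),
    output_directory=pathlib.Path("/tmp/staging"),
    dataset_tag="my-tag",
    dataset_name="my-dataset",
    hdf5_repack=False,
)
```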
@@ -118,11 +132,17 @@ def process_file(
         default=1,
         help="Number of workers for the file processing.",
     )
+    parser.add_argument(
+        "--no-repack",
+        action="store_false",
+        help="Disable repacking HDF5 files for cloud optimization.",
+    )
     args = parser.parse_args()
     directory = pathlib.Path(args.directory)
     repo_id = args.repo_id
     n_proc = args.n_proc
     dataset_tag = args.tag
+    hdf5_repack = args.no_repack
     dataset_name = pathlib.Path(repo_id).name
 
     files = list(directory.rglob("*"))
@@ -135,6 +155,7 @@ def process_file(
             output_directory=tmp_dirname,
             dataset_tag=dataset_tag,
             dataset_name=dataset_name,
+            hdf5_repack=hdf5_repack,
         )
         with multiprocessing.Pool() as pool:
             pool.map(process_fn, files, chunksize=chunk_size)
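The CLI wiring above is worth spelling out: with `action="store_false"`, argparse derives the destination `no_repack` and defaults it to `True`, so `hdf5_repack = args.no_repack` keeps repacking enabled unless the flag is passed. A standalone snippet reproducing just that behavior:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--no-repack",
    action="store_false",
    help="Disable repacking HDF5 files for cloud optimization.",
)

# Default: flag absent -> no_repack is True -> HDF5 files get repacked.
print(parser.parse_args([]).no_repack)               # True
# Flag present -> no_repack is False -> HDF5 files are symlinked instead.
print(parser.parse_args(["--no-repack"]).no_repack)  # False
```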