
Commit 7c98aa1

Upload more datasets to HF (#31)
* Add the Well dataset collection mention to HF card
* Ignore streamlit local runs
* Make uploaded dataset public by default
* Add option to skip repacking HDF5 file
* Increase CPU resources in the uploading script
1 parent c730c77 commit 7c98aa1

4 files changed: +32 -6 lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
```diff
@@ -99,3 +99,6 @@ outputs/
 wandb/
 datasets/rt_experimental
 check_well_data_4059043.out
+
+# Ignore streamlit local runs
+.streamlit/
```

scripts/huggingface/DATASET_README_HEADER_TEMPLATE.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -14,6 +14,8 @@ task_ids:
 - multivariate-time-series-forecasting
 ---
 
+This Dataset is part of [The Well Collection](https://huggingface.co/papers/2412.00568).
+
 # How To Load from HuggingFace Hub
 
 1. Be sure to have `the_well` installed (`pip install the_well`)
```
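As context for that header, the raw files of a dataset repo can also be fetched directly with `huggingface_hub`. A minimal sketch, assuming a hypothetical repo id; the README's intended loading path is through `the_well` itself:

```python
from huggingface_hub import snapshot_download

# Download every file of a Well dataset repo into the local HF cache.
# The repo id below is illustrative, not one confirmed by this commit.
local_dir = snapshot_download(
    repo_id="polymathic-ai/example_well_dataset",
    repo_type="dataset",
)
print(local_dir)
```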

scripts/huggingface/upload.py

Lines changed: 24 additions & 3 deletions
```diff
@@ -62,7 +62,7 @@ def repack_h5(input_filename: str, output_filename: str):
 def upload_folder(folder: str, repo_id: str):
     api = HfApi()
     api.upload_large_folder(
-        repo_id=repo_id, folder_path=folder, repo_type="dataset", private=True
+        repo_id=repo_id, folder_path=folder, repo_type="dataset", private=False
     )
 
 
@@ -82,7 +82,21 @@ def process_file(
     output_directory: pathlib.Path,
     dataset_tag: str,
     dataset_name: str,
+    hdf5_repack: bool = True,
 ):
+    """Copy or process original files into a directory prior to uploading to HF hub.
+    Args:
+        root_directory: The directory containing the original file to process or copy.
+            All existing files will be considered.
+        file_path: File to be processed.
+        output_directory: Where the files will be copied or processed.
+        dataset_tag: HF dataset tags to add to the ReadMe header.
+        dataset_name: Dataset name to add to the HF dataset card.
+        hdf5_repack: Option to repack HDF5 files for cloud purposes.
+            See https://www.hdfgroup.org/2024/01/08/strategies-and-software-to-optimize-hdf5-netcdf-4-files-for-the-cloud/
+            for more details.
+
+    """
     in_dir_file_path = file_path.relative_to(root_directory)
     # Skip irrelevant files
     if not is_file_valid(file_path):
@@ -97,10 +111,10 @@ def process_file(
         logger.debug(f"Convert ReadMe {file_path}")
         edit_readme(file_path, target_filename, dataset_tag, dataset_name)
     # Process HDF5
-    elif file_path.suffix in [".hdf", ".h5", ".hdf5"]:
+    elif file_path.suffix in [".hdf", ".h5", ".hdf5"] and hdf5_repack:
         logger.debug(f"Repack HDF5 {file_path}")
         repack_h5(file_path, target_filename)
-    # Simply copy remaining files
+    # Simply copy remaining files as symbolic link
     else:
         logger.debug(f"Link file {file_path}")
         target_filename.symlink_to(file_path)
```
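On the repacking side, the linked HDF Group post recommends rewriting files with a paged file-space strategy so that cloud reads fetch fewer, larger blocks. Below is a minimal sketch of that idea using the standard `h5repack` tool; it is an assumption about the general approach, not necessarily what this script's `repack_h5` does, and the 8 MiB page size is an arbitrary choice:

```python
import subprocess

def repack_for_cloud(input_filename: str, output_filename: str,
                     page_size: int = 8 * 1024 * 1024) -> None:
    """Rewrite an HDF5 file with paged aggregation (illustrative sketch only)."""
    subprocess.run(
        [
            "h5repack",
            "-S", "PAGE",          # file-space strategy: paged aggregation
            "-G", str(page_size),  # file-space page size in bytes
            str(input_filename),
            str(output_filename),
        ],
        check=True,
    )
```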
```diff
@@ -118,11 +132,17 @@ def process_file(
         default=1,
         help="Number of workers for the file processing.",
     )
+    parser.add_argument(
+        "--no-repack",
+        action="store_false",
+        help="Disable repacking HDF5 files for cloud optimization.",
+    )
     args = parser.parse_args()
     directory = pathlib.Path(args.directory)
     repo_id = args.repo_id
     n_proc = args.n_proc
     dataset_tag = args.tag
+    hdf5_repack = args.no_repack
     dataset_name = pathlib.Path(repo_id).name
 
     files = list(directory.rglob("*"))
@@ -135,6 +155,7 @@ def process_file(
         output_directory=tmp_dirname,
         dataset_tag=dataset_tag,
         dataset_name=dataset_name,
+        hdf5_repack=hdf5_repack,
     )
     with multiprocessing.Pool() as pool:
         pool.map(process_fn, files, chunksize=chunk_size)
```
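One subtlety in the new flag: with `action="store_false"`, argparse derives the destination `no_repack` from the flag name and defaults it to True, so `args.no_repack` effectively answers "repack?" and passing `--no-repack` turns repacking off. A minimal standalone sketch of that behavior:

```python
import argparse

parser = argparse.ArgumentParser()
# Destination is derived from the flag name: args.no_repack.
# store_false means the default is True; passing the flag sets it to False.
parser.add_argument(
    "--no-repack",
    action="store_false",
    help="Disable repacking HDF5 files for cloud optimization.",
)

print(parser.parse_args([]).no_repack)               # True  -> repack by default
print(parser.parse_args(["--no-repack"]).no_repack)   # False -> skip repacking
```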

scripts/huggingface/upload.sh

Lines changed: 3 additions & 3 deletions
```diff
@@ -1,19 +1,19 @@
 #!/usr/bin/bash -l
 
-
 #SBATCH --partition=polymathic
 #SBATCH -C genoa
 #SBATCH --time=20:00:00
 #SBATCH -N 1
 #SBATCH -n 1
-#SBATCH --cpus-per-task=12
+#SBATCH --cpus-per-task=96
 #SBATCH --output=upload_well_data_%j.out
 
-set -x
 
 module load python
 module load hdf5
 source ~/venvs/well_venv/bin/activate
 
+set -x
+
 huggingface-cli login --token $HF_TOKEN --add-to-git-credential
 srun python -u upload.py $@
```
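The jump to 96 CPUs lines up with how upload.py parallelizes: `multiprocessing.Pool()` created without an explicit process count starts `os.cpu_count()` workers, so giving the job more CPUs is what lets the per-file repacking and copying actually run in parallel. A minimal sketch of that default:

```python
import multiprocessing
import os

def work(x: int) -> int:
    return x * x

if __name__ == "__main__":
    # Pool() with no argument starts os.cpu_count() worker processes.
    print(f"cpu_count: {os.cpu_count()}")
    with multiprocessing.Pool() as pool:
        print(pool.map(work, range(8), chunksize=2))
```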
