
Commit 7c98aa1

Upload more datasets to HF (#31)
* Add the Well dataset collection mention to HF card
* Ignore streamlit local runs
* Make uploaded dataset public by default
* Add option to skip repacking HDF5 file
* Increase CPU resources in the uploading script
1 parent c730c77 commit 7c98aa1

4 files changed: +32 -6 lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
```diff
@@ -99,3 +99,6 @@ outputs/
 wandb/
 datasets/rt_experimental
 check_well_data_4059043.out
+
+# Ignore streamlit local runs
+.streamlit/
```

scripts/huggingface/DATASET_README_HEADER_TEMPLATE.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -14,6 +14,8 @@ task_ids:
 - multivariate-time-series-forecasting
 ---
 
+This Dataset is part of [The Well Collection](https://huggingface.co/papers/2412.00568).
+
 # How To Load from HuggingFace Hub
 
 1. Be sure to have `the_well` installed (`pip install the_well`)
```
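As context for that header, the raw files of a dataset repo can also be fetched directly with `huggingface_hub`. A minimal sketch, assuming a hypothetical repo id; the README's intended loading path is through `the_well` itself:

```python
from huggingface_hub import snapshot_download

# Download every file of a Well dataset repo into the local HF cache.
# The repo id below is illustrative, not one confirmed by this commit.
local_dir = snapshot_download(
    repo_id="polymathic-ai/example_well_dataset",
    repo_type="dataset",
)
print(local_dir)
```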

scripts/huggingface/upload.py

Lines changed: 24 additions & 3 deletions
```diff
@@ -62,7 +62,7 @@ def repack_h5(input_filename: str, output_filename: str):
 def upload_folder(folder: str, repo_id: str):
     api = HfApi()
     api.upload_large_folder(
-        repo_id=repo_id, folder_path=folder, repo_type="dataset", private=True
+        repo_id=repo_id, folder_path=folder, repo_type="dataset", private=False
     )
 
 
@@ -82,7 +82,21 @@ def process_file(
     output_directory: pathlib.Path,
     dataset_tag: str,
     dataset_name: str,
+    hdf5_repack: bool = True,
 ):
+    """Copy or process original files into a directory prior to uploading to HF hub.
+    Args:
+        root_directory: The directory containing the original file to process or copy.
+            All existing files will be considered.
+        file_path: File to be processed.
+        output_directory: Where the files will be copied or processed.
+        dataset_tag: HF dataset tags to add to the ReadMe header.
+        dataset_name: Dataset name to add to the HF dataset card.
+        hdf5_repack: Option to repack HDF5 files for cloud purposes.
+            See https://www.hdfgroup.org/2024/01/08/strategies-and-software-to-optimize-hdf5-netcdf-4-files-for-the-cloud/
+            for more details.
+
+    """
     in_dir_file_path = file_path.relative_to(root_directory)
     # Skip irrelevant files
     if not is_file_valid(file_path):
@@ -97,10 +111,10 @@ def process_file(
         logger.debug(f"Convert ReadMe {file_path}")
         edit_readme(file_path, target_filename, dataset_tag, dataset_name)
     # Process HDF5
-    elif file_path.suffix in [".hdf", ".h5", ".hdf5"]:
+    elif file_path.suffix in [".hdf", ".h5", ".hdf5"] and hdf5_repack:
         logger.debug(f"Repack HDF5 {file_path}")
         repack_h5(file_path, target_filename)
-    # Simply copy remaining files
+    # Simply copy remaining files as symbolic link
     else:
         logger.debug(f"Link file {file_path}")
         target_filename.symlink_to(file_path)
```
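On the repacking side, the linked HDF Group post recommends rewriting files with a paged file-space strategy so that cloud reads fetch fewer, larger blocks. Below is a minimal sketch of that idea using the standard `h5repack` tool; it is an assumption about the general approach, not necessarily what this script's `repack_h5` does, and the 8 MiB page size is an arbitrary choice:

```python
import subprocess

def repack_for_cloud(input_filename: str, output_filename: str,
                     page_size: int = 8 * 1024 * 1024) -> None:
    """Rewrite an HDF5 file with paged aggregation (illustrative sketch only)."""
    subprocess.run(
        [
            "h5repack",
            "-S", "PAGE",          # file-space strategy: paged aggregation
            "-G", str(page_size),  # file-space page size in bytes
            str(input_filename),
            str(output_filename),
        ],
        check=True,
    )
```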
```diff
@@ -118,11 +132,17 @@ def process_file(
         default=1,
         help="Number of workers for the file processing.",
     )
+    parser.add_argument(
+        "--no-repack",
+        action="store_false",
+        help="Disable repacking HDF5 files for cloud optimization.",
+    )
     args = parser.parse_args()
     directory = pathlib.Path(args.directory)
     repo_id = args.repo_id
     n_proc = args.n_proc
     dataset_tag = args.tag
+    hdf5_repack = args.no_repack
     dataset_name = pathlib.Path(repo_id).name
 
     files = list(directory.rglob("*"))
@@ -135,6 +155,7 @@ def process_file(
         output_directory=tmp_dirname,
         dataset_tag=dataset_tag,
         dataset_name=dataset_name,
+        hdf5_repack=hdf5_repack,
     )
     with multiprocessing.Pool() as pool:
         pool.map(process_fn, files, chunksize=chunk_size)
```
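One subtlety in the new flag: with `action="store_false"`, argparse derives the destination `no_repack` from the flag name and defaults it to True, so `args.no_repack` effectively answers "repack?" and passing `--no-repack` turns repacking off. A minimal standalone sketch of that behavior:

```python
import argparse

parser = argparse.ArgumentParser()
# Destination is derived from the flag name: args.no_repack.
# store_false means the default is True; passing the flag sets it to False.
parser.add_argument(
    "--no-repack",
    action="store_false",
    help="Disable repacking HDF5 files for cloud optimization.",
)

print(parser.parse_args([]).no_repack)               # True  -> repack by default
print(parser.parse_args(["--no-repack"]).no_repack)   # False -> skip repacking
```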

scripts/huggingface/upload.sh

Lines changed: 3 additions & 3 deletions
```diff
@@ -1,19 +1,19 @@
 #!/usr/bin/bash -l
 
-
 #SBATCH --partition=polymathic
 #SBATCH -C genoa
 #SBATCH --time=20:00:00
 #SBATCH -N 1
 #SBATCH -n 1
-#SBATCH --cpus-per-task=12
+#SBATCH --cpus-per-task=96
 #SBATCH --output=upload_well_data_%j.out
 
-set -x
 
 module load python
 module load hdf5
 source ~/venvs/well_venv/bin/activate
 
+set -x
+
 huggingface-cli login --token $HF_TOKEN --add-to-git-credential
 srun python -u upload.py $@
```
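The jump to 96 CPUs lines up with how upload.py parallelizes: `multiprocessing.Pool()` created without an explicit process count starts `os.cpu_count()` workers, so giving the job more CPUs is what lets the per-file repacking and copying actually run in parallel. A minimal sketch of that default:

```python
import multiprocessing
import os

def work(x: int) -> int:
    return x * x

if __name__ == "__main__":
    # Pool() with no argument starts os.cpu_count() worker processes.
    print(f"cpu_count: {os.cpu_count()}")
    with multiprocessing.Pool() as pool:
        print(pool.map(work, range(8), chunksize=2))
```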
