
Commit 8f1084e

Add support for dataset download from huggingface
1 parent: c9b053c

File tree: 3 files changed (+125, -25 lines)

README.md

Lines changed: 14 additions & 0 deletions

@@ -72,6 +72,20 @@ python benchmark_scripts/download_libero_datasets.py --datasets DATASET
 ```
 where ```DATASET``` is chosen from `[libero_spatial, libero_object, libero_100, libero_goal]`.
 
+**NEW!!!**
+
+Alternatively, you can download the datasets from Hugging Face by running:
+```
+python benchmark_scripts/download_libero_datasets.py --use-huggingface
+```
+
+This option can also be combined with a specific dataset selection:
+```
+python benchmark_scripts/download_libero_datasets.py --datasets DATASET --use-huggingface
+```
+
+The datasets hosted on Hugging Face are available [here](https://huggingface.co/datasets/yifengzhu-hf/LIBERO-datasets).
+
 
 # Getting Started
 
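Worth noting: the `--use-huggingface` option depends on the optional `huggingface_hub` package; the downloader in this commit raises an ImportError suggesting `pip install huggingface_hub` when it is missing. A minimal pre-flight check, as a sketch:

```python
# Sketch: check that the optional dependency for --use-huggingface is installed.
# The error message in download_utils.py suggests `pip install huggingface_hub`.
try:
    import huggingface_hub  # noqa: F401
    print("huggingface_hub found; --use-huggingface should work")
except ImportError:
    print("huggingface_hub missing; install it with: pip install huggingface_hub")
```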

benchmark_scripts/download_libero_datasets.py

Lines changed: 22 additions & 2 deletions

@@ -1,6 +1,7 @@
 import init_path
 import argparse
 import os
+import time
 
 import libero.libero.utils.download_utils as download_utils
 from libero.libero import get_libero_path
@@ -19,6 +20,11 @@ def parse_args():
         choices=["all", "libero_goal", "libero_spatial", "libero_object", "libero_100"],
         default="all",
     )
+    parser.add_argument(
+        "--use-huggingface",
+        action="store_true",
+        help="Use Hugging Face instead of original download links"
+    )
     return parser.parse_args()
 
 
@@ -31,12 +37,26 @@ def main():
     print(f"Datasets downloaded to {args.download_dir}")
     print(f"Downloading {args.datasets} datasets")
 
+    if args.use_huggingface:
+        print("Using Hugging Face as the download source")
+    else:
+        print("Using original download links (note: these may expire soon)")
+        input_str = input("Download from original links may lead to failures. Do you want to continue? (y/n): ")
+        if input_str.lower() != 'y':
+            print("Switching to Hugging Face as the download source...")
+            args.use_huggingface = True
+
     # If not, download
     download_utils.libero_dataset_download(
-        download_dir=args.download_dir, datasets=args.datasets
+        download_dir=args.download_dir,
+        datasets=args.datasets,
+        use_huggingface=args.use_huggingface
     )
 
-    # (TODO) If datasets exist, check if datasets are the same as benchmark
+
+    # wait for 1 second
+    time.sleep(1)
+    print("\n\n\n")
 
     # Check if datasets exist first
     download_utils.check_libero_dataset(download_dir=args.download_dir)
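
For reference, the same download can be triggered directly from Python; a minimal sketch assuming the updated `libero_dataset_download` signature shown above (the dataset choice here is illustrative):

```python
# Sketch: programmatic equivalent of
#   python benchmark_scripts/download_libero_datasets.py --datasets libero_goal --use-huggingface
import libero.libero.utils.download_utils as download_utils
from libero.libero import get_libero_path

datasets_dir = get_libero_path("datasets")  # default dataset location
download_utils.libero_dataset_download(
    download_dir=datasets_dir,
    datasets="libero_goal",   # or "all"
    use_huggingface=True,     # pull from the Hugging Face mirror
)
download_utils.check_libero_dataset(download_dir=datasets_dir)
```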

libero/libero/utils/download_utils.py

Lines changed: 89 additions & 23 deletions

@@ -13,14 +13,15 @@
 
 from libero.libero import get_libero_path
 
-DIR = os.path.dirname(__file__)
-
-DATASET_LINKS = {
-    "libero_object": "https://utexas.box.com/shared/static/avkklgeq0e1dgzxz52x488whpu8mgspk.zip",
-    "libero_goal": "https://utexas.box.com/shared/static/iv5e4dos8yy2b212pkzkpxu9wbdgjfeg.zip",
-    "libero_spatial": "https://utexas.box.com/shared/static/04k94hyizn4huhbv5sz4ev9p2h1p6s7f.zip",
-    "libero_100": "https://utexas.box.com/shared/static/cv73j8zschq8auh9npzt876fdc1akvmk.zip",
-}
+try:
+    from huggingface_hub import snapshot_download
+    import shutil
+    HUGGINGFACE_AVAILABLE = True
+except ImportError:
+    HUGGINGFACE_AVAILABLE = False
+
+import libero.libero.utils.download_utils as download_utils
+from libero.libero import get_libero_path
 
 
 class DownloadProgressBar(tqdm):
@@ -97,44 +98,109 @@ def download_url(url, download_dir, check_overwrite=True, is_zipfile=True):
         os.remove(file_to_write)
 
 
-def libero_dataset_download(datasets="all", download_dir=None, check_overwrite=True):
+DATASET_LINKS = {
+    "libero_object": "https://utexas.box.com/shared/static/avkklgeq0e1dgzxz52x488whpu8mgspk.zip",
+    "libero_goal": "https://utexas.box.com/shared/static/iv5e4dos8yy2b212pkzkpxu9wbdgjfeg.zip",
+    "libero_spatial": "https://utexas.box.com/shared/static/04k94hyizn4huhbv5sz4ev9p2h1p6s7f.zip",
+    "libero_100": "https://utexas.box.com/shared/static/cv73j8zschq8auh9npzt876fdc1akvmk.zip",
+}
+
+HF_REPO_ID = "yifengzhu-hf/LIBERO-datasets"
+
+
+def download_from_huggingface(dataset_name, download_dir, check_overwrite=True):
+    """
+    Download a specific LIBERO dataset from Hugging Face.
+
+    Args:
+        dataset_name (str): Name of the dataset to download (e.g., 'libero_spatial')
+        download_dir (str): Directory where the dataset should be downloaded
+        check_overwrite (bool): If True, will check if dataset already exists
+    """
+    if not HUGGINGFACE_AVAILABLE:
+        raise ImportError(
+            "Hugging Face Hub is not available. Install it with 'pip install huggingface_hub'"
+        )
+
+    # Create the destination folder
+    os.makedirs(download_dir, exist_ok=True)
+
+    # Check if dataset already exists
+    dataset_dir = os.path.join(download_dir, dataset_name)
+    if check_overwrite and os.path.exists(dataset_dir):
+        user_response = input(
+            f"Warning: dataset {dataset_name} already exists at {dataset_dir}. Overwrite? y/n\n"
+        )
+        if user_response.lower() not in {"yes", "y"}:
+            print(f"Skipping download of {dataset_name}")
+            return
+
+        # Remove existing directory
+        print(f"Removing existing folder: {dataset_dir}")
+        shutil.rmtree(dataset_dir)
+
+    # Download the dataset
+    print(f"Downloading {dataset_name} from Hugging Face...")
+    folder_path = snapshot_download(
+        repo_id=HF_REPO_ID,
+        repo_type="dataset",
+        local_dir=download_dir,
+        allow_patterns=f"{dataset_name}/*",
+        local_dir_use_symlinks=False,  # Prevents using symlinks to cached files
+        force_download=True  # Forces re-downloading files
+    )
+
+    # Verify downloaded files
+    file_count = sum([len(files) for _, _, files in os.walk(os.path.join(download_dir, dataset_name))])
+    print(f"Downloaded {file_count} files for {dataset_name}")
+
+
+def libero_dataset_download(datasets="all", download_dir=None, check_overwrite=True, use_huggingface=False):
     """Download libero datasets
 
     Args:
         datasets (str, optional): Specify which datasets to save. Defaults to "all", downloading all the datasets.
         download_dir (str, optional): Target location for storing datasets. Defaults to None, using the default path.
         check_overwrite (bool, optional): Check if overwriting datasets. Defaults to True.
+        use_huggingface (bool, optional): Use Hugging Face instead of the original download links. Defaults to False.
     """
-
     if download_dir is None:
         download_dir = get_libero_path("datasets")
     if not os.path.exists(download_dir):
         os.makedirs(download_dir)
 
-    assert datasets in [
-        "all",
-        "libero_object",
-        "libero_goal",
-        "libero_spatial",
-        "libero_100",
-    ]
+    assert datasets in [
+        "all",
+        "libero_object",
+        "libero_goal",
+        "libero_spatial",
+        "libero_100",
+    ]
 
-    for dataset_name in [
+    datasets_to_download = [
         "libero_object",
         "libero_goal",
         "libero_spatial",
         "libero_100",
-    ]:
-        if datasets == dataset_name or datasets == "all":
-            print(f"Downloading {dataset_name}")
+    ] if datasets == "all" else [datasets]
+
+    for dataset_name in datasets_to_download:
+        print(f"Downloading {dataset_name}")
+
+        if use_huggingface:
+            download_from_huggingface(
                dataset_name=dataset_name,
                download_dir=download_dir,
                check_overwrite=check_overwrite
            )
+        else:
+            print("Using original download links (these may expire soon)")
             download_url(
                 DATASET_LINKS[dataset_name],
                 download_dir=download_dir,
                 check_overwrite=check_overwrite,
             )
 
-    # (TODO): unzip the files
-
 
 def check_libero_dataset(download_dir=None):
     """Check the integrity of the downloaded datasets.

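Under the hood, `download_from_huggingface` restricts the snapshot to one split via `allow_patterns`; a stripped-down sketch of that call, with an illustrative local directory:

```python
# Sketch: fetch a single LIBERO split from the Hugging Face dataset repo,
# mirroring the snapshot_download call inside download_from_huggingface above.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="yifengzhu-hf/LIBERO-datasets",  # HF_REPO_ID from this commit
    repo_type="dataset",
    local_dir="./datasets",                  # illustrative target directory
    allow_patterns="libero_spatial/*",       # only the requested split
)
```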