Commit 70d3af7

Merge pull request #12 from FlagOpen/rgs
update download script so the console prints nearly only essential output.
2 parents 734b46c + 44449c0 commit 70d3af7

File tree

1 file changed: 138 additions, 95 deletions

src/lerobot/scripts/download.py

Lines changed: 138 additions & 95 deletions
@@ -21,21 +21,32 @@
 DEFAULT_SLEEP_SECONDS = 5
 MAX_SLEEP_SECONDS = 120
 DEFAULT_OUTPUT_DIR = "~/.cache/huggingface/lerobot/"
+GATE_DATASET_NAME = "gate"

 # Setup logging with file output
 _log_dir = Path("logs/download")
 _log_dir.mkdir(parents=True, exist_ok=True)
-_file_handler = logging.FileHandler(_log_dir / "download.log")
+_log_file = _log_dir / "download.log"
+_file_handler = logging.FileHandler(_log_file)
 _file_handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s", datefmt="%H:%M:%S"))
+_file_handler.setLevel(logging.DEBUG)  # File handler captures all levels
+
+# Console handler only shows WARNING and above
+_console_handler = logging.StreamHandler()
+_console_handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s", datefmt="%H:%M:%S"))
+_console_handler.setLevel(logging.WARNING)  # Console only shows warnings and errors

 logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s | %(levelname)s | %(message)s",
-    datefmt="%H:%M:%S",
-    handlers=[logging.StreamHandler(), _file_handler],
+    level=logging.DEBUG,  # Root logger accepts all levels
+    handlers=[_console_handler, _file_handler],
 )
 LOGGER = logging.getLogger("hub-download")

+# Suppress verbose logging from huggingface_hub and other libraries
+logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+logging.getLogger("requests").setLevel(logging.WARNING)
+

 # --------------------------------------------------------------------------- #
 # CLI helpers
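
A note on the handler split above: the root logger is opened wide (DEBUG) and each handler filters on its own, so every record reaches the log file while the console stays quiet below WARNING. A minimal, self-contained sketch of the same wiring (the file name and logger name here are illustrative, not the script's):

import logging

file_handler = logging.FileHandler("demo.log")
file_handler.setLevel(logging.DEBUG)       # file captures all levels

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.WARNING)  # console shows WARNING and above

logging.basicConfig(level=logging.DEBUG, handlers=[console_handler, file_handler])
logger = logging.getLogger("demo")

logger.debug("progress detail")    # lands only in demo.log
logger.warning("needs attention")  # lands in demo.log and on the console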
@@ -47,7 +58,6 @@ def build_parser() -> argparse.ArgumentParser:
     )
     parser.add_argument("--hub", required=True, choices=["huggingface", "modelscope"])
     parser.add_argument("--ds_lists", nargs="+", help="Dataset names provided on the CLI.")
-    parser.add_argument("--ds_file", help="Optional text file with one dataset per line.")
     parser.add_argument("--namespace", help="Hub namespace/owner.", default=None)
     parser.add_argument(
         "--output_dir",
@@ -86,21 +96,12 @@ def _resolve_namespace(namespace: str | None) -> str:
 # --------------------------------------------------------------------------- #
 # Dataset helper implementations
 # --------------------------------------------------------------------------- #
-def _read_dataset_names(cli_values: Iterable[str] | None, file_path: str | None) -> list[str]:
+def _read_dataset_names(cli_values: Iterable[str] | None) -> list[str]:
     names: list[str] = []

     if cli_values:
         names.extend(cli_values)

-    if file_path:
-        parsed_path = Path(file_path).expanduser().resolve()
-        if not parsed_path.exists():
-            raise FileNotFoundError(f"Dataset list not found: {parsed_path}")
-        for line in parsed_path.read_text(encoding="utf-8").splitlines():
-            item = line.strip()
-            if item and not item.startswith("#"):
-                names.append(item)
-
     ordered_unique: list[str] = []
     seen: set[str] = set()
     for name in names:
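
For reference, the order-preserving de-duplication that _read_dataset_names ends with looks like the sketch below; the loop body is inferred, since the hunk cuts off at the for line, and the dataset names are hypothetical:

names = ["pick_cube", "stack_blocks", "pick_cube"]

ordered_unique: list[str] = []
seen: set[str] = set()
for name in names:
    if name not in seen:  # assumed body: keep the first occurrence only
        seen.add(name)
        ordered_unique.append(name)

assert ordered_unique == ["pick_cube", "stack_blocks"]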
@@ -116,7 +117,7 @@ def _retry_loop(label: str, max_retries: int, fn: Callable[[], Path]) -> Path:

     for attempt in range(1, max(1, max_retries) + 1):
         try:
-            LOGGER.info(f"{label}: attempt {attempt}")
+            LOGGER.debug(f"{label}: attempt {attempt}")
             return fn()
         except Exception as exc:  # noqa: PERF203
             last_exc = exc
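
Only the attempt loop of _retry_loop is visible in this hunk. A self-contained sketch of the shape it implies follows; the backoff between attempts and the final re-raise are assumptions, with the sleep bounds borrowed from DEFAULT_SLEEP_SECONDS and MAX_SLEEP_SECONDS at the top of the file:

import logging
import time
from pathlib import Path
from typing import Callable

LOGGER = logging.getLogger("hub-download")
DEFAULT_SLEEP_SECONDS = 5
MAX_SLEEP_SECONDS = 120

def retry_loop(label: str, max_retries: int, fn: Callable[[], Path]) -> Path:
    last_exc: Exception | None = None
    for attempt in range(1, max(1, max_retries) + 1):
        try:
            LOGGER.debug(f"{label}: attempt {attempt}")
            return fn()
        except Exception as exc:
            last_exc = exc
            # Assumed backoff: grow linearly with the attempt count, capped
            time.sleep(min(DEFAULT_SLEEP_SECONDS * attempt, MAX_SLEEP_SECONDS))
    raise RuntimeError(f"{label}: gave up after {max_retries} attempt(s)") from last_exc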
@@ -160,12 +161,12 @@ def _log_download_plan(
     max_retries: int,
     token_provided: bool,
 ) -> None:
-    LOGGER.info("Hub: %s", hub)
-    LOGGER.info("Namespace: %s", namespace)
-    LOGGER.info("Output: %s", out_dir)
-    LOGGER.info("Datasets: %s", ", ".join(datasets))
-    LOGGER.info("Retry budget: %d attempt(s) per dataset", int(max_retries))
-    LOGGER.info("Token: %s", "provided" if token_provided else "not provided")
+    LOGGER.debug("Hub: %s", hub)
+    LOGGER.debug("Namespace: %s", namespace)
+    LOGGER.debug("Output: %s", out_dir)
+    LOGGER.debug("Datasets: %s", ", ".join(datasets))
+    LOGGER.debug("Retry budget: %d attempt(s) per dataset", int(max_retries))
+    LOGGER.debug("Token: %s", "provided" if token_provided else "not provided")


 def _download_requested_datasets(
@@ -180,7 +181,7 @@ def _download_requested_datasets(
 ) -> list[str]:
     failures: list[str] = []
     for idx, name in enumerate(datasets, 1):
-        LOGGER.info("[%d/%d] %s", idx, len(datasets), name)
+        LOGGER.debug("[%d/%d] %s", idx, len(datasets), name)
         try:
             path = download_dataset(
                 hub=hub,
@@ -191,7 +192,7 @@
                 max_workers=max_workers,
                 max_retries=max_retries,
             )
-            LOGGER.info("Completed: %s --> %s", name, path)
+            LOGGER.debug("Completed: %s --> %s", name, path)
         except Exception as exc:  # noqa: PERF203
             LOGGER.error("Failed: %s (%s)", name, exc)
             failures.append(name)
@@ -205,19 +206,21 @@ def _ensure_gate_dataset(
     out_dir: Path,
     token: str | None,
     max_workers: int,
-    max_retries: int,
 ) -> None:
-    gate_name = "gate"
+    # Only perform gate check for HuggingFace hub
+    if hub != "huggingface":
+        return
+
+    gate_name = GATE_DATASET_NAME
     gate_repo_id = f"{namespace}/{gate_name}"
     gate_path = out_dir / namespace / gate_name

-
     # Check if gate dataset already exists
     if gate_path.exists() and any(gate_path.rglob("*")):
-        LOGGER.info("Gate dataset already exists at: %s", gate_path)
-        LOGGER.info("Verifying gate dataset access...")
+        LOGGER.debug("Gate dataset already exists at: %s", gate_path)
+        LOGGER.debug("Verifying gate dataset access...")
     else:
-        LOGGER.info("Gate dataset not found. Attempting to download mandatory dataset %s from %s", gate_repo_id, hub)
+        LOGGER.debug("Gate dataset not found. Attempting to download mandatory dataset %s from %s", gate_repo_id, hub)

     try:
         gate_path = download_dataset(
@@ -230,39 +233,81 @@
             max_retries=1,
             enable_retry=False,
         )
-        LOGGER.error("============================================================")
-        LOGGER.error(" GATE CHECK PASSED — THANK YOU FOR SUPPORTING ROBOCOIN")
-        LOGGER.error("============================================================")
-        LOGGER.error("Gate dataset is ready at: %s", gate_path)
-        LOGGER.error("Your consent keeps RoboCOIN sustainable and region-aware.")
-        LOGGER.error("Proceeding with the remaining dataset downloads...")
-        LOGGER.error("------------------------------------------------------------")
+        _log_gate_success(gate_path)
     except Exception as exc:  # noqa: PERF203
-        if hub == "huggingface":
-            gate_url = f"https://huggingface.co/datasets/{gate_repo_id}"
-        else:
-            gate_url = f"https://modelscope.cn/datasets/{gate_repo_id}"
-
-        LOGGER.error("============================================================")
-        LOGGER.error(" GATE DATASET ACCESS REQUIRED — PLEASE COMPLETE STATISTICS FORM")
-        LOGGER.error("============================================================")
-        LOGGER.error("To improve RoboCOIN’s regional coverage and understand how the data")
-        LOGGER.error("is used, we need a one-time, lightweight consent submission before")
-        LOGGER.error("any other datasets can be downloaded. Please visit the following link")
-        LOGGER.error("and fill out the brief form, then re-run this command:")
-        LOGGER.error("  %s", gate_url)
-        LOGGER.error("The information is collected solely via the official Hugging Face flow")
-        LOGGER.error("and will never be used for unrelated purposes. Your response helps us")
-        LOGGER.error("prioritize support and keep the project sustainable. Thank you!")
-        LOGGER.error("------------------------------------------------------------")
-        LOGGER.error("Technical tips:")
-        LOGGER.error("  - Ensure you have granted access at the URL above")
-        LOGGER.error("  - If the dataset is private, confirm your token and permissions")
-        LOGGER.error("  - Verify network connectivity and try again")
-        LOGGER.error("Original error: %s: %s", type(exc).__name__, exc)
+        gate_url = f"https://huggingface.co/datasets/{gate_repo_id}"
+        _log_gate_failure(gate_repo_id, gate_url, exc)
         raise RuntimeError(f"Gate dataset '{gate_repo_id}' download failed") from exc


+# --------------------------------------------------------------------------- #
+# Gate dataset logging helpers
+# --------------------------------------------------------------------------- #
+def _log_gate_success(gate_path: Path) -> None:
+    """Log successful gate dataset access."""
+    # Calculate box width based on longest line, with minimum width
+    longest_line = len(" THANK YOU FOR SUPPORTING ROBOCOIN!")
+    box_width = max(62, longest_line + 20)  # Ensure enough space for content + padding
+
+    # Create borders
+    top_border = "╔" + "═" * (box_width - 2) + "╗"
+    header_border = "╠" + "═" * (box_width - 2) + "╣"
+    bottom_border = "╚" + "═" * (box_width - 2) + "╝"
+
+    def _print_line(text: str) -> None:
+        """Print a line with left and right borders."""
+        padding = max(0, box_width - len(text) - 2)
+        print(f"║{text}{' ' * padding}║")
+
+    print()  # Add a blank line before the box for better formatting
+    print(top_border)
+    _print_line(" THANK YOU FOR SUPPORTING ROBOCOIN!")
+    print(header_border)
+    _print_line(" Your consent keeps RoboCOIN sustainable and region-aware.")
+    _print_line(" Proceeding with the remaining dataset downloads...")
+    print(bottom_border)
+    print()  # Add a blank line after the box for better formatting
+
+
+def _log_gate_failure(gate_repo_id: str, gate_url: str, exc: Exception) -> None:
+    """Log gate dataset access failure."""
+    # Calculate box width based on URL length, with minimum width
+    url_len = len(gate_url)
+    box_width = max(62, url_len + 20)  # Ensure enough space for URL + padding
+
+    # Create borders
+    top_border = "╔" + "═" * (box_width - 2) + "╗"
+    header_border = "╠" + "═" * (box_width - 2) + "╣"
+    bottom_border = "╚" + "═" * (box_width - 2) + "╝"
+
+    def _print_line(text: str) -> None:
+        """Print a line with left and right borders."""
+        padding = max(0, box_width - len(text) - 2)
+        print(f"║{text}{' ' * padding}║")
+
+    print(top_border)
+    _print_line(" ACCESS REQUIRED — PLEASE COMPLETE STATISTICS FORM...")
+    print(header_border)
+    _print_line(" To improve RoboCOIN's regional coverage and understand")
+    _print_line(" how the data is used, we need a one-time, lightweight")
+    _print_line(" consent submission before any other datasets can be")
+    _print_line(" downloaded. Please visit the following link and fill out")
+    _print_line(" the brief form, then re-run this command:")
+    _print_line("")
+    _print_line(f" >>> {gate_url} <<<")
+    _print_line("")
+    _print_line(" The information is collected solely via the official")
+    _print_line(" Hugging Face flow and will never be used for unrelated")
+    _print_line(" purposes. Your response helps us prioritize support and")
+    _print_line(" keep the project sustainable. Thank you!")
+    print(bottom_border)
+    print("")
+    print("Technical tips:")
+    print("  - Ensure you have granted access at the URL above")
+    print("  - Verify network connectivity and try again")
+    print(f"Original error: {type(exc).__name__}: {exc}")
+
+
 # --------------------------------------------------------------------------- #
 # Hub specific downloaders
 # --------------------------------------------------------------------------- #
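
The _print_line helpers above keep the right-hand border aligned by padding every row out to box_width. A quick standalone check of that arithmetic:

box_width = 62
text = " THANK YOU FOR SUPPORTING ROBOCOIN!"
padding = max(0, box_width - len(text) - 2)  # minus 2 for the ║ on each side
row = f"║{text}{' ' * padding}║"
assert len(row) == box_width

Note that len() counts characters, so the alignment holds for the ASCII strings used here; double-width glyphs in a message would push the border out of line.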
@@ -280,7 +325,6 @@ def _run() -> Path:
             "repo_id": repo_id,
             "repo_type": "dataset",
             "token": token,
-            "resume_download": True,
             "max_workers": max_workers,
             "local_dir": str(target_dir),
         }
@@ -331,17 +375,17 @@ def _download_from_ms(repo_id: str, target_dir: Path, token: str | None, max_wor
     )

     def _run() -> Path:
-        LOGGER.info("ModelScope: attempting to download dataset_id=%s", repo_id)
+        LOGGER.debug("ModelScope: attempting to download dataset_id=%s", repo_id)
         LOGGER.debug("  local_dir=%s", target_dir)

         try:
             if token:
-                LOGGER.info("Logging in to ModelScope with provided token")
+                LOGGER.debug("Logging in to ModelScope with provided token")
                 HubApi().login(token)

             # Use dataset_snapshot_download for downloading dataset files
             # This downloads all raw files from the dataset repository
-            LOGGER.info("Downloading dataset using dataset_snapshot_download...")
+            LOGGER.debug("Downloading dataset using dataset_snapshot_download...")
             download_kwargs = {
                 "dataset_id": repo_id,
                 "local_dir": str(target_dir),
@@ -354,7 +398,7 @@ def _run() -> Path:
             path = dataset_snapshot_download(**download_kwargs)

             # The dataset files are now downloaded to target_dir (or default cache)
-            LOGGER.info("Dataset downloaded successfully to %s", path)
+            LOGGER.debug("Dataset downloaded successfully to %s", path)
             return Path(path)

         except Exception as exc:
@@ -429,7 +473,7 @@ def download_dataset(
     # will use the same consistent path: output_dir/namespace/dataset_name/
     dataset_path: Path = output_dir / namespace / dataset_name

-    LOGGER.info("Downloading repo_id: %s from %s", repo_id, hub)
+    LOGGER.debug("Downloading repo_id: %s from %s", repo_id, hub)
     LOGGER.debug("Target path: %s", dataset_path)
     LOGGER.debug("Token provided: %s", bool(token))

@@ -493,9 +537,9 @@ def download_datasets(
     )

     if failures:
-        LOGGER.error("Failed datasets: %s", ", ".join(failures))
+        print(f"Failed datasets: {', '.join(failures)}")
     else:
-        LOGGER.info("All datasets downloaded successfully.")
+        print("All datasets downloaded successfully.")

     return failures

@@ -507,13 +551,14 @@ def main(argv: Sequence[str] | None = None) -> int:
     parser = build_parser()
     args = parser.parse_args(argv)

-    try:
-        dataset_names = _read_dataset_names(args.ds_lists, args.ds_file)
-    except FileNotFoundError as exc:
-        parser.error(str(exc))
+    # Print log file location at the start
+    print(f"Detailed logs are being written to: {_log_file.absolute()}")
+    print()
+
+    dataset_names = _read_dataset_names(args.ds_lists)

     if not dataset_names:
-        parser.error("No datasets supplied. Use --ds_lists and/or --ds_file.")
+        parser.error("No datasets supplied. Use --ds_lists.")

     # Use default output directory if not provided
     if args.output_dir is None:
@@ -522,33 +567,31 @@
     output_dir = _resolve_output_dir(args.output_dir)

     if args.dry_run:
-        LOGGER.info("Dry run")
-        LOGGER.info("  Hub: %s", args.hub)
-        LOGGER.info("  Namespace: %s", args.namespace or DEFAULT_NAMESPACE)
-        LOGGER.info("  Output: %s", output_dir)
-        LOGGER.info("  Datasets (%d): %s", len(dataset_names), ", ".join(dataset_names))
-        LOGGER.info("  Max retries: %d", args.max_retry_time)
-        LOGGER.info("  Token: %s", "provided" if args.token else "not provided")
+        print("Dry run")
+        print(f"  Hub: {args.hub}")
+        print(f"  Namespace: {args.namespace or DEFAULT_NAMESPACE}")
+        print(f"  Output: {output_dir}")
+        print(f"  Datasets ({len(dataset_names)}): {', '.join(dataset_names)}")
+        print(f"  Max retries: {args.max_retry_time}")
+        print(f"  Token: {'provided' if args.token else 'not provided'}")
         return 0

     # Perform gate check before actual download (HuggingFace only)
     resolved_namespace = _resolve_namespace(args.namespace)
     resolved_token = _resolve_token(args.hub, args.token)
-    if args.hub == "huggingface":
-        try:
-            _ensure_gate_dataset(
-                hub=args.hub,
-                namespace=resolved_namespace,
-                out_dir=output_dir,
-                token=resolved_token,
-                max_workers=max(1, args.max_workers),
-                max_retries=int(args.max_retry_time),
-            )
-            LOGGER.error("Gate check completed successfully. Proceeding with dataset downloads...")
-        except RuntimeError as exc:
-            # Gate dataset failure – abort cleanly before downloading other datasets
-            LOGGER.error("Download aborted due to gate check failure: %s", exc)
-            return 1
+    try:
+        _ensure_gate_dataset(
+            hub=args.hub,
+            namespace=resolved_namespace,
+            out_dir=output_dir,
+            token=resolved_token,
+            max_workers=max(1, args.max_workers),
+        )
+        LOGGER.debug("Gate check completed successfully. Proceeding with dataset downloads...")
+    except RuntimeError as exc:
+        # Gate dataset failure – abort cleanly before downloading other datasets
+        LOGGER.error("Download aborted due to gate check failure: %s", exc)
+        return 1

     try:
         failures = download_datasets(
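
With --ds_file removed, dataset names travel only through argv. A hedged invocation sketch (the import path and dataset names are illustrative; main() accepts an argv sequence per its signature above):

from lerobot.scripts.download import main  # assumed import path

exit_code = main([
    "--hub", "huggingface",
    "--ds_lists", "dataset_a", "dataset_b",  # hypothetical dataset names
])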
