Skip to content

Commit 44449c0

Browse files
committed
upgrade
1 parent d83999b commit 44449c0

File tree

1 file changed

+98
-52
lines changed

1 file changed

+98
-52
lines changed

src/lerobot/scripts/download.py

Lines changed: 98 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,27 @@
2626
# Setup logging with file output
2727
_log_dir = Path("logs/download")
2828
_log_dir.mkdir(parents=True, exist_ok=True)
29-
_file_handler = logging.FileHandler(_log_dir / "download.log")
29+
_log_file = _log_dir / "download.log"
30+
_file_handler = logging.FileHandler(_log_file)
3031
_file_handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s", datefmt="%H:%M:%S"))
32+
_file_handler.setLevel(logging.DEBUG) # File handler captures all levels
33+
34+
# Console handler only shows WARNING and above
35+
_console_handler = logging.StreamHandler()
36+
_console_handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s", datefmt="%H:%M:%S"))
37+
_console_handler.setLevel(logging.WARNING) # Console only shows warnings and errors
3138

3239
logging.basicConfig(
33-
level=logging.INFO,
34-
format="%(asctime)s | %(levelname)s | %(message)s",
35-
datefmt="%H:%M:%S",
36-
handlers=[logging.StreamHandler(), _file_handler],
40+
level=logging.DEBUG, # Root logger accepts all levels
41+
handlers=[_console_handler, _file_handler],
3742
)
3843
LOGGER = logging.getLogger("hub-download")
3944

45+
# Suppress verbose logging from huggingface_hub and other libraries
46+
logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
47+
logging.getLogger("urllib3").setLevel(logging.WARNING)
48+
logging.getLogger("requests").setLevel(logging.WARNING)
49+
4050

4151
# --------------------------------------------------------------------------- #
4252
# CLI helpers
@@ -107,7 +117,7 @@ def _retry_loop(label: str, max_retries: int, fn: Callable[[], Path]) -> Path:
107117

108118
for attempt in range(1, max(1, max_retries) + 1):
109119
try:
110-
LOGGER.info(f"{label}: attempt {attempt}")
120+
LOGGER.debug(f"{label}: attempt {attempt}")
111121
return fn()
112122
except Exception as exc: # noqa: PERF203
113123
last_exc = exc
@@ -151,12 +161,12 @@ def _log_download_plan(
151161
max_retries: int,
152162
token_provided: bool,
153163
) -> None:
154-
LOGGER.info("Hub: %s", hub)
155-
LOGGER.info("Namespace: %s", namespace)
156-
LOGGER.info("Output: %s", out_dir)
157-
LOGGER.info("Datasets: %s", ", ".join(datasets))
158-
LOGGER.info("Retry budget: %d attempt(s) per dataset", int(max_retries))
159-
LOGGER.info("Token: %s", "provided" if token_provided else "not provided")
164+
LOGGER.debug("Hub: %s", hub)
165+
LOGGER.debug("Namespace: %s", namespace)
166+
LOGGER.debug("Output: %s", out_dir)
167+
LOGGER.debug("Datasets: %s", ", ".join(datasets))
168+
LOGGER.debug("Retry budget: %d attempt(s) per dataset", int(max_retries))
169+
LOGGER.debug("Token: %s", "provided" if token_provided else "not provided")
160170

161171

162172
def _download_requested_datasets(
@@ -171,7 +181,7 @@ def _download_requested_datasets(
171181
) -> list[str]:
172182
failures: list[str] = []
173183
for idx, name in enumerate(datasets, 1):
174-
LOGGER.info("[%d/%d] %s", idx, len(datasets), name)
184+
LOGGER.debug("[%d/%d] %s", idx, len(datasets), name)
175185
try:
176186
path = download_dataset(
177187
hub=hub,
@@ -182,7 +192,7 @@ def _download_requested_datasets(
182192
max_workers=max_workers,
183193
max_retries=max_retries,
184194
)
185-
LOGGER.info("Completed: %s --> %s", name, path)
195+
LOGGER.debug("Completed: %s --> %s", name, path)
186196
except Exception as exc: # noqa: PERF203
187197
LOGGER.error("Failed: %s (%s)", name, exc)
188198
failures.append(name)
@@ -207,10 +217,10 @@ def _ensure_gate_dataset(
207217

208218
# Check if gate dataset already exists
209219
if gate_path.exists() and any(gate_path.rglob("*")):
210-
LOGGER.info("Gate dataset already exists at: %s", gate_path)
211-
LOGGER.info("Verifying gate dataset access...")
220+
LOGGER.debug("Gate dataset already exists at: %s", gate_path)
221+
LOGGER.debug("Verifying gate dataset access...")
212222
else:
213-
LOGGER.info("Gate dataset not found. Attempting to download mandatory dataset %s from %s", gate_repo_id, hub)
223+
LOGGER.debug("Gate dataset not found. Attempting to download mandatory dataset %s from %s", gate_repo_id, hub)
214224

215225
try:
216226
gate_path = download_dataset(
@@ -235,30 +245,63 @@ def _ensure_gate_dataset(
235245
# --------------------------------------------------------------------------- #
236246
def _log_gate_success(gate_path: Path) -> None:
237247
"""Log successful gate dataset access."""
238-
print("============================================================")
239-
print(" THANK YOU FOR SUPPORTING ROBOCOIN!")
240-
print("============================================================")
241-
print("Your consent keeps RoboCOIN sustainable and region-aware.")
242-
print("Proceeding with the remaining dataset downloads...")
243-
print("------------------------------------------------------------")
248+
# Calculate box width based on longest line, with minimum width
249+
longest_line = len(" THANK YOU FOR SUPPORTING ROBOCOIN!")
250+
box_width = max(62, longest_line + 20) # Ensure enough space for content + padding
251+
252+
# Create borders
253+
top_border = "╔" + "═" * (box_width - 2) + "╗"
254+
header_border = "╠" + "═" * (box_width - 2) + "╣"
255+
bottom_border = "╚" + "═" * (box_width - 2) + "╝"
256+
257+
def _print_line(text: str) -> None:
258+
"""Print a line with left and right borders."""
259+
padding = max(0, box_width - len(text) - 2)
260+
print(f"║{text}{' ' * padding}║")
261+
262+
print() # Add a blank line before the box for better formatting
263+
print(top_border)
264+
_print_line(" THANK YOU FOR SUPPORTING ROBOCOIN!")
265+
print(header_border)
266+
_print_line(" Your consent keeps RoboCOIN sustainable and region-aware.")
267+
_print_line(" Proceeding with the remaining dataset downloads...")
268+
print(bottom_border)
269+
print() # Add a blank line after the box for better formatting
244270

245271

246272
def _log_gate_failure(gate_repo_id: str, gate_url: str, exc: Exception) -> None:
247273
"""Log gate dataset access failure."""
248-
print("============================================================")
249-
print(" ACCESS REQUIRED — PLEASE COMPLETE STATISTICS FORM...")
250-
print("============================================================")
251-
print("To improve RoboCOIN's regional coverage and understand how the data")
252-
print("is used, we need a one-time, lightweight consent submission before")
253-
print("any other datasets can be downloaded. Please visit the following link")
254-
print("and fill out the brief form, then re-run this command:")
255-
print("")
256-
print(f" >>> {gate_url} <<<")
274+
# Calculate box width based on URL length, with minimum width
275+
url_len = len(gate_url)
276+
box_width = max(62, url_len + 20) # Ensure enough space for URL + padding
277+
278+
# Create top border
279+
top_border = "╔" + "═" * (box_width - 2) + "╗"
280+
header_border = "╠" + "═" * (box_width - 2) + "╣"
281+
bottom_border = "╚" + "═" * (box_width - 2) + "╝"
282+
283+
def _print_line(text: str) -> None:
284+
"""Print a line with left and right borders."""
285+
padding = max(0, box_width - len(text) - 2)
286+
print(f"║{text}{' ' * padding}║")
287+
288+
print(top_border)
289+
_print_line(" ACCESS REQUIRED — PLEASE COMPLETE STATISTICS FORM...")
290+
print(header_border)
291+
_print_line(" To improve RoboCOIN's regional coverage and understand")
292+
_print_line(" how the data is used, we need a one-time, lightweight")
293+
_print_line(" consent submission before any other datasets can be")
294+
_print_line(" downloaded. Please visit the following link and fill out")
295+
_print_line(" the brief form, then re-run this command:")
296+
_print_line("")
297+
_print_line(f" >>> {gate_url} <<<")
298+
_print_line("")
299+
_print_line(" The information is collected solely via the official")
300+
_print_line(" Hugging Face flow and will never be used for unrelated")
301+
_print_line(" purposes. Your response helps us prioritize support and")
302+
_print_line(" keep the project sustainable. Thank you!")
303+
print(bottom_border)
257304
print("")
258-
print("The information is collected solely via the official Hugging Face flow")
259-
print("and will never be used for unrelated purposes. Your response helps us")
260-
print("prioritize support and keep the project sustainable. Thank you!")
261-
print("------------------------------------------------------------")
262305
print("Technical tips:")
263306
print(" - Ensure you have granted access at the URL above")
264307
print(" - Verify network connectivity and try again")
@@ -282,7 +325,6 @@ def _run() -> Path:
282325
"repo_id": repo_id,
283326
"repo_type": "dataset",
284327
"token": token,
285-
"resume_download": True,
286328
"max_workers": max_workers,
287329
"local_dir": str(target_dir),
288330
}
@@ -333,17 +375,17 @@ def _download_from_ms(repo_id: str, target_dir: Path, token: str | None, max_wor
333375
)
334376

335377
def _run() -> Path:
336-
LOGGER.info("ModelScope: attempting to download dataset_id=%s", repo_id)
378+
LOGGER.debug("ModelScope: attempting to download dataset_id=%s", repo_id)
337379
LOGGER.debug(" local_dir=%s", target_dir)
338380

339381
try:
340382
if token:
341-
LOGGER.info("Logging in to ModelScope with provided token")
383+
LOGGER.debug("Logging in to ModelScope with provided token")
342384
HubApi().login(token)
343385

344386
# Use dataset_snapshot_download for downloading dataset files
345387
# This downloads all raw files from the dataset repository
346-
LOGGER.info("Downloading dataset using dataset_snapshot_download...")
388+
LOGGER.debug("Downloading dataset using dataset_snapshot_download...")
347389
download_kwargs = {
348390
"dataset_id": repo_id,
349391
"local_dir": str(target_dir),
@@ -356,7 +398,7 @@ def _run() -> Path:
356398
path = dataset_snapshot_download(**download_kwargs)
357399

358400
# The dataset files are now downloaded to target_dir (or default cache)
359-
LOGGER.info("Dataset downloaded successfully to %s", path)
401+
LOGGER.debug("Dataset downloaded successfully to %s", path)
360402
return Path(path)
361403

362404
except Exception as exc:
@@ -431,7 +473,7 @@ def download_dataset(
431473
# will use the same consistent path: output_dir/namespace/dataset_name/
432474
dataset_path: Path = output_dir / namespace / dataset_name
433475

434-
LOGGER.info("Downloading repo_id: %s from %s", repo_id, hub)
476+
LOGGER.debug("Downloading repo_id: %s from %s", repo_id, hub)
435477
LOGGER.debug("Target path: %s", dataset_path)
436478
LOGGER.debug("Token provided: %s", bool(token))
437479

@@ -495,9 +537,9 @@ def download_datasets(
495537
)
496538

497539
if failures:
498-
LOGGER.error("Failed datasets: %s", ", ".join(failures))
540+
print(f"Failed datasets: {', '.join(failures)}")
499541
else:
500-
LOGGER.info("All datasets downloaded successfully.")
542+
print("All datasets downloaded successfully.")
501543

502544
return failures
503545

@@ -509,6 +551,10 @@ def main(argv: Sequence[str] | None = None) -> int:
509551
parser = build_parser()
510552
args = parser.parse_args(argv)
511553

554+
# Print log file location at the start
555+
print(f"Detailed logs are being written to: {_log_file.absolute()}")
556+
print()
557+
512558
dataset_names = _read_dataset_names(args.ds_lists)
513559

514560
if not dataset_names:
@@ -521,13 +567,13 @@ def main(argv: Sequence[str] | None = None) -> int:
521567
output_dir = _resolve_output_dir(args.output_dir)
522568

523569
if args.dry_run:
524-
LOGGER.info("Dry run")
525-
LOGGER.info(" Hub: %s", args.hub)
526-
LOGGER.info(" Namespace: %s", args.namespace or DEFAULT_NAMESPACE)
527-
LOGGER.info(" Output: %s", output_dir)
528-
LOGGER.info(" Datasets (%d): %s", len(dataset_names), ", ".join(dataset_names))
529-
LOGGER.info(" Max retries: %d", args.max_retry_time)
530-
LOGGER.info(" Token: %s", "provided" if args.token else "not provided")
570+
print("Dry run")
571+
print(f" Hub: {args.hub}")
572+
print(f" Namespace: {args.namespace or DEFAULT_NAMESPACE}")
573+
print(f" Output: {output_dir}")
574+
print(f" Datasets ({len(dataset_names)}): {', '.join(dataset_names)}")
575+
print(f" Max retries: {args.max_retry_time}")
576+
print(f" Token: {'provided' if args.token else 'not provided'}")
531577
return 0
532578

533579
# Perform gate check before actual download (HuggingFace only)
@@ -541,7 +587,7 @@ def main(argv: Sequence[str] | None = None) -> int:
541587
token=resolved_token,
542588
max_workers=max(1, args.max_workers),
543589
)
544-
LOGGER.error("Gate check completed successfully. Proceeding with dataset downloads...")
590+
LOGGER.debug("Gate check completed successfully. Proceeding with dataset downloads...")
545591
except RuntimeError as exc:
546592
# Gate dataset failure – abort cleanly before downloading other datasets
547593
LOGGER.error("Download aborted due to gate check failure: %s", exc)

0 commit comments

Comments
 (0)