 DEFAULT_SLEEP_SECONDS = 5
 MAX_SLEEP_SECONDS = 120
 DEFAULT_OUTPUT_DIR = "~/.cache/huggingface/lerobot/"
+GATE_DATASET_NAME = "gate"
 
 # Setup logging with file output
 _log_dir = Path("logs/download")
 _log_dir.mkdir(parents=True, exist_ok=True)
-_file_handler = logging.FileHandler(_log_dir / "download.log")
+_log_file = _log_dir / "download.log"
+_file_handler = logging.FileHandler(_log_file)
 _file_handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s", datefmt="%H:%M:%S"))
+_file_handler.setLevel(logging.DEBUG)  # File handler captures all levels
+
+# Console handler only shows WARNING and above
+_console_handler = logging.StreamHandler()
+_console_handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s", datefmt="%H:%M:%S"))
+_console_handler.setLevel(logging.WARNING)
 
 logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s | %(levelname)s | %(message)s",
-    datefmt="%H:%M:%S",
-    handlers=[logging.StreamHandler(), _file_handler],
+    level=logging.DEBUG,  # Root logger accepts all levels
+    handlers=[_console_handler, _file_handler],
 )
 LOGGER = logging.getLogger("hub-download")
 
+# Suppress verbose logging from huggingface_hub and other libraries
+logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+logging.getLogger("requests").setLevel(logging.WARNING)
+
 
 # --------------------------------------------------------------------------- #
 # CLI helpers
@@ -47,7 +58,6 @@ def build_parser() -> argparse.ArgumentParser:
     )
     parser.add_argument("--hub", required=True, choices=["huggingface", "modelscope"])
     parser.add_argument("--ds_lists", nargs="+", help="Dataset names provided on the CLI.")
-    parser.add_argument("--ds_file", help="Optional text file with one dataset per line.")
     parser.add_argument("--namespace", help="Hub namespace/owner.", default=None)
     parser.add_argument(
         "--output_dir",
@@ -86,21 +96,12 @@ def _resolve_namespace(namespace: str | None) -> str:
 # --------------------------------------------------------------------------- #
 # Dataset helper implementations
 # --------------------------------------------------------------------------- #
-def _read_dataset_names(cli_values: Iterable[str] | None, file_path: str | None) -> list[str]:
+def _read_dataset_names(cli_values: Iterable[str] | None) -> list[str]:
     names: list[str] = []
 
     if cli_values:
         names.extend(cli_values)
 
-    if file_path:
-        parsed_path = Path(file_path).expanduser().resolve()
-        if not parsed_path.exists():
-            raise FileNotFoundError(f"Dataset list not found: {parsed_path}")
-        for line in parsed_path.read_text(encoding="utf-8").splitlines():
-            item = line.strip()
-            if item and not item.startswith("#"):
-                names.append(item)
-
     ordered_unique: list[str] = []
     seen: set[str] = set()
     for name in names:
@@ -116,7 +117,7 @@ def _retry_loop(label: str, max_retries: int, fn: Callable[[], Path]) -> Path:
 
     for attempt in range(1, max(1, max_retries) + 1):
         try:
-            LOGGER.info(f"{label}: attempt {attempt}")
+            LOGGER.debug(f"{label}: attempt {attempt}")
             return fn()
         except Exception as exc:  # noqa: PERF203
             last_exc = exc
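+            # Keep the most recent failure so the retry loop can surface it after the final attempt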
@@ -160,12 +161,12 @@ def _log_download_plan(
     max_retries: int,
     token_provided: bool,
 ) -> None:
-    LOGGER.info("Hub: %s", hub)
-    LOGGER.info("Namespace: %s", namespace)
-    LOGGER.info("Output: %s", out_dir)
-    LOGGER.info("Datasets: %s", ", ".join(datasets))
-    LOGGER.info("Retry budget: %d attempt(s) per dataset", int(max_retries))
-    LOGGER.info("Token: %s", "provided" if token_provided else "not provided")
+    LOGGER.debug("Hub: %s", hub)
+    LOGGER.debug("Namespace: %s", namespace)
+    LOGGER.debug("Output: %s", out_dir)
+    LOGGER.debug("Datasets: %s", ", ".join(datasets))
+    LOGGER.debug("Retry budget: %d attempt(s) per dataset", int(max_retries))
+    LOGGER.debug("Token: %s", "provided" if token_provided else "not provided")
 
 
 def _download_requested_datasets(
@@ -180,7 +181,7 @@ def _download_requested_datasets(
 ) -> list[str]:
     failures: list[str] = []
     for idx, name in enumerate(datasets, 1):
-        LOGGER.info("[%d/%d] %s", idx, len(datasets), name)
+        LOGGER.debug("[%d/%d] %s", idx, len(datasets), name)
         try:
             path = download_dataset(
                 hub=hub,
@@ -191,7 +192,7 @@ def _download_requested_datasets(
                 max_workers=max_workers,
                 max_retries=max_retries,
             )
-            LOGGER.info("Completed: %s --> %s", name, path)
+            LOGGER.debug("Completed: %s --> %s", name, path)
         except Exception as exc:  # noqa: PERF203
             LOGGER.error("Failed: %s (%s)", name, exc)
             failures.append(name)
@@ -205,19 +206,21 @@ def _ensure_gate_dataset(
     out_dir: Path,
     token: str | None,
     max_workers: int,
-    max_retries: int,
 ) -> None:
-    gate_name = "gate"
+    # Only perform gate check for HuggingFace hub
+    if hub != "huggingface":
+        return
+
+    gate_name = GATE_DATASET_NAME
     gate_repo_id = f"{namespace}/{gate_name}"
     gate_path = out_dir / namespace / gate_name
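+    # Gate data uses the same <output_dir>/<namespace>/<dataset_name> layout as every other download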
 
-
     # Check if gate dataset already exists
     if gate_path.exists() and any(gate_path.rglob("*")):
-        LOGGER.info("Gate dataset already exists at: %s", gate_path)
-        LOGGER.info("Verifying gate dataset access...")
+        LOGGER.debug("Gate dataset already exists at: %s", gate_path)
+        LOGGER.debug("Verifying gate dataset access...")
     else:
-        LOGGER.info("Gate dataset not found. Attempting to download mandatory dataset %s from %s", gate_repo_id, hub)
+        LOGGER.debug("Gate dataset not found. Attempting to download mandatory dataset %s from %s", gate_repo_id, hub)
 
     try:
         gate_path = download_dataset(
@@ -230,39 +233,81 @@ def _ensure_gate_dataset(
             max_retries=1,
             enable_retry=False,
         )
-        LOGGER.error("============================================================")
-        LOGGER.error(" GATE CHECK PASSED — THANK YOU FOR SUPPORTING ROBOCOIN")
-        LOGGER.error("============================================================")
-        LOGGER.error("Gate dataset is ready at: %s", gate_path)
-        LOGGER.error("Your consent keeps RoboCOIN sustainable and region-aware.")
-        LOGGER.error("Proceeding with the remaining dataset downloads...")
-        LOGGER.error("------------------------------------------------------------")
+        _log_gate_success(gate_path)
     except Exception as exc:  # noqa: PERF203
-        if hub == "huggingface":
-            gate_url = f"https://huggingface.co/datasets/{gate_repo_id}"
-        else:
-            gate_url = f"https://modelscope.cn/datasets/{gate_repo_id}"
-
-        LOGGER.error("============================================================")
-        LOGGER.error(" GATE DATASET ACCESS REQUIRED — PLEASE COMPLETE STATISTICS FORM")
-        LOGGER.error("============================================================")
-        LOGGER.error("To improve RoboCOIN’s regional coverage and understand how the data")
-        LOGGER.error("is used, we need a one-time, lightweight consent submission before")
-        LOGGER.error("any other datasets can be downloaded. Please visit the following link")
-        LOGGER.error("and fill out the brief form, then re-run this command:")
-        LOGGER.error("    %s", gate_url)
-        LOGGER.error("The information is collected solely via the official Hugging Face flow")
-        LOGGER.error("and will never be used for unrelated purposes. Your response helps us")
-        LOGGER.error("prioritize support and keep the project sustainable. Thank you!")
-        LOGGER.error("------------------------------------------------------------")
-        LOGGER.error("Technical tips:")
-        LOGGER.error("  - Ensure you have granted access at the URL above")
-        LOGGER.error("  - If the dataset is private, confirm your token and permissions")
-        LOGGER.error("  - Verify network connectivity and try again")
-        LOGGER.error("Original error: %s: %s", type(exc).__name__, exc)
+        gate_url = f"https://huggingface.co/datasets/{gate_repo_id}"
+        _log_gate_failure(gate_repo_id, gate_url, exc)
         raise RuntimeError(f"Gate dataset '{gate_repo_id}' download failed") from exc
 
 
+# --------------------------------------------------------------------------- #
+# Gate dataset logging helpers
+# --------------------------------------------------------------------------- #
+def _log_gate_success(gate_path: Path) -> None:
+    """Log successful gate dataset access."""
+    path_line = f" Gate dataset is ready at: {gate_path}"
+    # Calculate box width based on the longest line, with a minimum width
+    longest_line = max(len(" THANK YOU FOR SUPPORTING ROBOCOIN!"), len(path_line))
+    box_width = max(62, longest_line + 20)  # Ensure enough space for content + padding
+
+    # Create borders
+    top_border = "╔" + "═" * (box_width - 2) + "╗"
+    header_border = "╠" + "═" * (box_width - 2) + "╣"
+    bottom_border = "╚" + "═" * (box_width - 2) + "╝"
+
+    def _print_line(text: str) -> None:
+        """Print a line with left and right borders."""
+        padding = max(0, box_width - len(text) - 2)
+        print(f"║{text}{' ' * padding}║")
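+        # Padding is computed with len(), so alignment assumes single-width characters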
+
+    print()  # Blank line before the box for better formatting
+    print(top_border)
+    _print_line(" THANK YOU FOR SUPPORTING ROBOCOIN!")
+    print(header_border)
+    _print_line(path_line)
+    _print_line(" Your consent keeps RoboCOIN sustainable and region-aware.")
+    _print_line(" Proceeding with the remaining dataset downloads...")
+    print(bottom_border)
+    print()  # Blank line after the box for better formatting
+
+
+def _log_gate_failure(gate_repo_id: str, gate_url: str, exc: Exception) -> None:
+    """Log gate dataset access failure."""
+    # Calculate box width based on URL length, with a minimum width
+    url_len = len(gate_url)
+    box_width = max(62, url_len + 20)  # Ensure enough space for URL + padding
+
+    # Create borders
+    top_border = "╔" + "═" * (box_width - 2) + "╗"
+    header_border = "╠" + "═" * (box_width - 2) + "╣"
+    bottom_border = "╚" + "═" * (box_width - 2) + "╝"
+
+    def _print_line(text: str) -> None:
+        """Print a line with left and right borders."""
+        padding = max(0, box_width - len(text) - 2)
+        print(f"║{text}{' ' * padding}║")
+
+    print(top_border)
+    _print_line(" ACCESS REQUIRED — PLEASE COMPLETE STATISTICS FORM")
+    print(header_border)
+    _print_line(" To improve RoboCOIN's regional coverage and understand")
+    _print_line(" how the data is used, we need a one-time, lightweight")
+    _print_line(" consent submission before any other datasets can be")
+    _print_line(" downloaded. Please visit the following link and fill out")
+    _print_line(" the brief form, then re-run this command:")
+    _print_line("")
+    _print_line(f"    >>> {gate_url} <<<")
+    _print_line("")
+    _print_line(" The information is collected solely via the official")
+    _print_line(" Hugging Face flow and will never be used for unrelated")
+    _print_line(" purposes. Your response helps us prioritize support and")
+    _print_line(" keep the project sustainable. Thank you!")
+    print(bottom_border)
+    print("")
+    print("Technical tips:")
+    print("  - Ensure you have granted access at the URL above")
+    print("  - If the dataset is private, confirm your token and permissions")
+    print("  - Verify network connectivity and try again")
+    print(f"Original error for '{gate_repo_id}': {type(exc).__name__}: {exc}")
+
+
 # --------------------------------------------------------------------------- #
 # Hub specific downloaders
 # --------------------------------------------------------------------------- #
@@ -280,7 +325,6 @@ def _run() -> Path:
         "repo_id": repo_id,
         "repo_type": "dataset",
         "token": token,
-        "resume_download": True,
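+        # "resume_download" dropped: recent huggingface_hub releases deprecate the flag and resume interrupted downloads by default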
         "max_workers": max_workers,
         "local_dir": str(target_dir),
     }
@@ -331,17 +375,17 @@ def _download_from_ms(repo_id: str, target_dir: Path, token: str | None, max_wor
     )
 
     def _run() -> Path:
-        LOGGER.info("ModelScope: attempting to download dataset_id=%s", repo_id)
+        LOGGER.debug("ModelScope: attempting to download dataset_id=%s", repo_id)
         LOGGER.debug("  local_dir=%s", target_dir)
 
         try:
             if token:
-                LOGGER.info("Logging in to ModelScope with provided token")
+                LOGGER.debug("Logging in to ModelScope with provided token")
                 HubApi().login(token)
 
             # Use dataset_snapshot_download for downloading dataset files
             # This downloads all raw files from the dataset repository
-            LOGGER.info("Downloading dataset using dataset_snapshot_download...")
+            LOGGER.debug("Downloading dataset using dataset_snapshot_download...")
             download_kwargs = {
                 "dataset_id": repo_id,
                 "local_dir": str(target_dir),
@@ -354,7 +398,7 @@ def _run() -> Path:
             path = dataset_snapshot_download(**download_kwargs)
 
             # The dataset files are now downloaded to target_dir (or default cache)
-            LOGGER.info("Dataset downloaded successfully to %s", path)
+            LOGGER.debug("Dataset downloaded successfully to %s", path)
             return Path(path)
 
         except Exception as exc:
@@ -429,7 +473,7 @@ def download_dataset(
     # will use the same consistent path: output_dir/namespace/dataset_name/
     dataset_path: Path = output_dir / namespace / dataset_name
 
-    LOGGER.info("Downloading repo_id: %s from %s", repo_id, hub)
+    LOGGER.debug("Downloading repo_id: %s from %s", repo_id, hub)
     LOGGER.debug("Target path: %s", dataset_path)
     LOGGER.debug("Token provided: %s", bool(token))
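+    # Log only whether a token was supplied, never the token value itself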
 
@@ -493,9 +537,9 @@ def download_datasets(
     )
 
     if failures:
-        LOGGER.error("Failed datasets: %s", ", ".join(failures))
+        print(f"Failed datasets: {', '.join(failures)}")
     else:
-        LOGGER.info("All datasets downloaded successfully.")
+        print("All datasets downloaded successfully.")
 
     return failures
 
@@ -507,13 +551,14 @@ def main(argv: Sequence[str] | None = None) -> int:
     parser = build_parser()
     args = parser.parse_args(argv)
 
-    try:
-        dataset_names = _read_dataset_names(args.ds_lists, args.ds_file)
-    except FileNotFoundError as exc:
-        parser.error(str(exc))
+    # Print log file location at the start
+    print(f"Detailed logs are being written to: {_log_file.absolute()}")
+    print()
+
+    dataset_names = _read_dataset_names(args.ds_lists)
 
     if not dataset_names:
-        parser.error("No datasets supplied. Use --ds_lists and/or --ds_file.")
+        parser.error("No datasets supplied. Use --ds_lists.")
 
     # Use default output directory if not provided
     if args.output_dir is None:
@@ -522,33 +567,31 @@ def main(argv: Sequence[str] | None = None) -> int:
     output_dir = _resolve_output_dir(args.output_dir)
 
     if args.dry_run:
-        LOGGER.info("Dry run")
-        LOGGER.info("  Hub: %s", args.hub)
-        LOGGER.info("  Namespace: %s", args.namespace or DEFAULT_NAMESPACE)
-        LOGGER.info("  Output: %s", output_dir)
-        LOGGER.info("  Datasets (%d): %s", len(dataset_names), ", ".join(dataset_names))
-        LOGGER.info("  Max retries: %d", args.max_retry_time)
-        LOGGER.info("  Token: %s", "provided" if args.token else "not provided")
+        print("Dry run")
+        print(f"  Hub: {args.hub}")
+        print(f"  Namespace: {args.namespace or DEFAULT_NAMESPACE}")
+        print(f"  Output: {output_dir}")
+        print(f"  Datasets ({len(dataset_names)}): {', '.join(dataset_names)}")
+        print(f"  Max retries: {args.max_retry_time}")
+        print(f"  Token: {'provided' if args.token else 'not provided'}")
         return 0
 
     # Perform gate check before actual download (HuggingFace only)
     resolved_namespace = _resolve_namespace(args.namespace)
     resolved_token = _resolve_token(args.hub, args.token)
-    if args.hub == "huggingface":
-        try:
-            _ensure_gate_dataset(
-                hub=args.hub,
-                namespace=resolved_namespace,
-                out_dir=output_dir,
-                token=resolved_token,
-                max_workers=max(1, args.max_workers),
-                max_retries=int(args.max_retry_time),
-            )
-            LOGGER.error("Gate check completed successfully. Proceeding with dataset downloads...")
-        except RuntimeError as exc:
-            # Gate dataset failure – abort cleanly before downloading other datasets
-            LOGGER.error("Download aborted due to gate check failure: %s", exc)
-            return 1
+    try:
+        _ensure_gate_dataset(
+            hub=args.hub,
+            namespace=resolved_namespace,
+            out_dir=output_dir,
+            token=resolved_token,
+            max_workers=max(1, args.max_workers),
+        )
+        LOGGER.debug("Gate check completed successfully. Proceeding with dataset downloads...")
+    except RuntimeError as exc:
+        # Gate dataset failure – abort cleanly before downloading other datasets
+        LOGGER.error("Download aborted due to gate check failure: %s", exc)
+        return 1
 
     try:
         failures = download_datasets(