 # Setup logging with file output
 _log_dir = Path("logs/download")
 _log_dir.mkdir(parents=True, exist_ok=True)
-_file_handler = logging.FileHandler(_log_dir / "download.log")
+_log_file = _log_dir / "download.log"
+_file_handler = logging.FileHandler(_log_file)
 _file_handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s", datefmt="%H:%M:%S"))
+_file_handler.setLevel(logging.DEBUG)  # File handler captures all levels
+
+# Console handler only shows WARNING and above
+_console_handler = logging.StreamHandler()
+_console_handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s", datefmt="%H:%M:%S"))
+_console_handler.setLevel(logging.WARNING)  # Console only shows warnings and errors
 
 logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s | %(levelname)s | %(message)s",
-    datefmt="%H:%M:%S",
-    handlers=[logging.StreamHandler(), _file_handler],
+    level=logging.DEBUG,  # Root logger accepts all levels
+    handlers=[_console_handler, _file_handler],
 )
 LOGGER = logging.getLogger("hub-download")
 
+# Suppress verbose logging from huggingface_hub and other libraries
+logging.getLogger("huggingface_hub").setLevel(logging.WARNING)
+logging.getLogger("urllib3").setLevel(logging.WARNING)
+logging.getLogger("requests").setLevel(logging.WARNING)
+
 
 # --------------------------------------------------------------------------- #
 # CLI helpers
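
The hunk above switches to the split-level pattern: the root logger accepts everything (DEBUG), and each handler filters on its own. The root level must be the most permissive of the two, because records are filtered at the logger before any handler sees them. A minimal self-contained sketch of the same idea (the paths and logger names here are illustrative, not the module's):

```python
import logging
from pathlib import Path

log_dir = Path("logs/demo")  # hypothetical directory for this sketch
log_dir.mkdir(parents=True, exist_ok=True)

file_handler = logging.FileHandler(log_dir / "demo.log")
file_handler.setLevel(logging.DEBUG)        # file captures every record

console_handler = logging.StreamHandler()
console_handler.setLevel(logging.WARNING)   # console shows warnings and up

# Root must be as permissive as the most verbose handler, or DEBUG
# records are dropped before the file handler ever sees them.
logging.basicConfig(level=logging.DEBUG, handlers=[file_handler, console_handler])

log = logging.getLogger("demo")
log.debug("file only")       # lands in demo.log, silent on the console
log.warning("both sinks")    # lands in demo.log and on the console
```

The same mechanism explains the `setLevel(logging.WARNING)` calls on the `huggingface_hub`, `urllib3`, and `requests` loggers: they drop chatty third-party records before either handler sees them.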
@@ -107,7 +117,7 @@ def _retry_loop(label: str, max_retries: int, fn: Callable[[], Path]) -> Path:
 
     for attempt in range(1, max(1, max_retries) + 1):
         try:
-            LOGGER.info(f"{label}: attempt {attempt}")
+            LOGGER.debug(f"{label}: attempt {attempt}")
             return fn()
         except Exception as exc:  # noqa: PERF203
             last_exc = exc
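
Only two lines of `_retry_loop` change here; for context, a sketch of the retry pattern the visible fragment implies (the pause between attempts and the final re-raise are assumptions, not the commit's code):

```python
import logging
import time
from collections.abc import Callable
from pathlib import Path

LOGGER = logging.getLogger("hub-download")


def retry_loop(label: str, max_retries: int, fn: Callable[[], Path]) -> Path:
    """Call fn() up to max_retries times, re-raising the last failure."""
    last_exc: Exception | None = None
    for attempt in range(1, max(1, max_retries) + 1):
        try:
            LOGGER.debug(f"{label}: attempt {attempt}")
            return fn()
        except Exception as exc:
            last_exc = exc
            LOGGER.warning("%s: attempt %d failed (%s)", label, attempt, exc)
            time.sleep(1.0)  # assumed fixed pause; the real helper may differ
    raise RuntimeError(f"{label}: all attempts failed") from last_exc
```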
@@ -151,12 +161,12 @@ def _log_download_plan(
     max_retries: int,
     token_provided: bool,
 ) -> None:
-    LOGGER.info("Hub: %s", hub)
-    LOGGER.info("Namespace: %s", namespace)
-    LOGGER.info("Output: %s", out_dir)
-    LOGGER.info("Datasets: %s", ", ".join(datasets))
-    LOGGER.info("Retry budget: %d attempt(s) per dataset", int(max_retries))
-    LOGGER.info("Token: %s", "provided" if token_provided else "not provided")
+    LOGGER.debug("Hub: %s", hub)
+    LOGGER.debug("Namespace: %s", namespace)
+    LOGGER.debug("Output: %s", out_dir)
+    LOGGER.debug("Datasets: %s", ", ".join(datasets))
+    LOGGER.debug("Retry budget: %d attempt(s) per dataset", int(max_retries))
+    LOGGER.debug("Token: %s", "provided" if token_provided else "not provided")
 
 
 def _download_requested_datasets(
@@ -171,7 +181,7 @@ def _download_requested_datasets(
 ) -> list[str]:
     failures: list[str] = []
     for idx, name in enumerate(datasets, 1):
-        LOGGER.info("[%d/%d] %s", idx, len(datasets), name)
+        LOGGER.debug("[%d/%d] %s", idx, len(datasets), name)
         try:
             path = download_dataset(
                 hub=hub,
@@ -182,7 +192,7 @@ def _download_requested_datasets(
                 max_workers=max_workers,
                 max_retries=max_retries,
             )
-            LOGGER.info("Completed: %s --> %s", name, path)
+            LOGGER.debug("Completed: %s --> %s", name, path)
         except Exception as exc:  # noqa: PERF203
             LOGGER.error("Failed: %s (%s)", name, exc)
             failures.append(name)
@@ -207,10 +217,10 @@ def _ensure_gate_dataset(
 
     # Check if gate dataset already exists
     if gate_path.exists() and any(gate_path.rglob("*")):
-        LOGGER.info("Gate dataset already exists at: %s", gate_path)
-        LOGGER.info("Verifying gate dataset access...")
+        LOGGER.debug("Gate dataset already exists at: %s", gate_path)
+        LOGGER.debug("Verifying gate dataset access...")
     else:
-        LOGGER.info("Gate dataset not found. Attempting to download mandatory dataset %s from %s", gate_repo_id, hub)
+        LOGGER.debug("Gate dataset not found. Attempting to download mandatory dataset %s from %s", gate_repo_id, hub)
 
     try:
         gate_path = download_dataset(
@@ -235,30 +245,63 @@ def _ensure_gate_dataset(
 # --------------------------------------------------------------------------- #
 def _log_gate_success(gate_path: Path) -> None:
     """Log successful gate dataset access."""
-    print("============================================================")
-    print(" THANK YOU FOR SUPPORTING ROBOCOIN!")
-    print("============================================================")
-    print("Your consent keeps RoboCOIN sustainable and region-aware.")
-    print("Proceeding with the remaining dataset downloads...")
-    print("------------------------------------------------------------")
+    # Calculate box width based on longest line, with minimum width
+    longest_line = len(" THANK YOU FOR SUPPORTING ROBOCOIN!")
+    box_width = max(62, longest_line + 20)  # Ensure enough space for content + padding
+
+    # Create borders
+    top_border = "╔" + "═" * (box_width - 2) + "╗"
+    header_border = "╠" + "═" * (box_width - 2) + "╣"
+    bottom_border = "╚" + "═" * (box_width - 2) + "╝"
+
+    def _print_line(text: str) -> None:
+        """Print a line with left and right borders."""
+        padding = max(0, box_width - len(text) - 2)
+        print(f"║{text}{' ' * padding}║")
+
+    print()  # Add a blank line before the box for better formatting
+    print(top_border)
+    _print_line(" THANK YOU FOR SUPPORTING ROBOCOIN!")
+    print(header_border)
+    _print_line(" Your consent keeps RoboCOIN sustainable and region-aware.")
+    _print_line(" Proceeding with the remaining dataset downloads...")
+    print(bottom_border)
+    print()  # Add a blank line after the box for better formatting
 
 
 def _log_gate_failure(gate_repo_id: str, gate_url: str, exc: Exception) -> None:
     """Log gate dataset access failure."""
-    print("============================================================")
-    print(" ACCESS REQUIRED — PLEASE COMPLETE STATISTICS FORM...")
-    print("============================================================")
-    print("To improve RoboCOIN's regional coverage and understand how the data")
-    print("is used, we need a one-time, lightweight consent submission before")
-    print("any other datasets can be downloaded. Please visit the following link")
-    print("and fill out the brief form, then re-run this command:")
-    print("")
-    print(f" >>> {gate_url} <<<")
+    # Calculate box width based on URL length, with minimum width
+    url_len = len(gate_url)
+    box_width = max(62, url_len + 20)  # Ensure enough space for URL + padding
+
+    # Create borders
+    top_border = "╔" + "═" * (box_width - 2) + "╗"
+    header_border = "╠" + "═" * (box_width - 2) + "╣"
+    bottom_border = "╚" + "═" * (box_width - 2) + "╝"
+
+    def _print_line(text: str) -> None:
+        """Print a line with left and right borders."""
+        padding = max(0, box_width - len(text) - 2)
+        print(f"║{text}{' ' * padding}║")
+
+    print(top_border)
+    _print_line(" ACCESS REQUIRED — PLEASE COMPLETE STATISTICS FORM...")
+    print(header_border)
+    _print_line(" To improve RoboCOIN's regional coverage and understand")
+    _print_line(" how the data is used, we need a one-time, lightweight")
+    _print_line(" consent submission before any other datasets can be")
+    _print_line(" downloaded. Please visit the following link and fill out")
+    _print_line(" the brief form, then re-run this command:")
+    _print_line("")
+    _print_line(f" >>> {gate_url} <<<")
+    _print_line("")
+    _print_line(" The information is collected solely via the official")
+    _print_line(" Hugging Face flow and will never be used for unrelated")
+    _print_line(" purposes. Your response helps us prioritize support and")
+    _print_line(" keep the project sustainable. Thank you!")
+    print(bottom_border)
     print("")
-    print("The information is collected solely via the official Hugging Face flow")
-    print("and will never be used for unrelated purposes. Your response helps us")
-    print("prioritize support and keep the project sustainable. Thank you!")
-    print("------------------------------------------------------------")
     print("Technical tips:")
     print(" - Ensure you have granted access at the URL above")
     print(" - Verify network connectivity and try again")
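
Both box printers repeat the same width and padding arithmetic: each row is exactly `box_width` characters, with `box_width - len(text) - 2` spaces filling the gap between the text and the right border. A hypothetical refactor of that shared logic into one helper (not part of this commit; it assumes single-width characters, so wide glyphs like CJK or emoji would misalign the frame):

```python
def print_boxed(title: str, body_lines: list[str], min_width: int = 62) -> None:
    """Print title and body inside a dynamic-width box (illustrative sketch)."""
    content = [title, *body_lines]
    box_width = max(min_width, max(len(line) for line in content) + 4)

    def row(text: str) -> str:
        # 1 border char + text + padding + 1 border char == box_width
        return f"║{text}{' ' * (box_width - len(text) - 2)}║"

    print("╔" + "═" * (box_width - 2) + "╗")
    print(row(f" {title}"))
    print("╠" + "═" * (box_width - 2) + "╣")
    for text in body_lines:
        print(row(f" {text}"))
    print("╚" + "═" * (box_width - 2) + "╝")


print_boxed("THANK YOU FOR SUPPORTING ROBOCOIN!",
            ["Your consent keeps RoboCOIN sustainable and region-aware."])
```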
@@ -282,7 +325,6 @@ def _run() -> Path:
282325 "repo_id" : repo_id ,
283326 "repo_type" : "dataset" ,
284327 "token" : token ,
285- "resume_download" : True ,
286328 "max_workers" : max_workers ,
287329 "local_dir" : str (target_dir ),
288330 }
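
Dropping `"resume_download": True` presumably tracks the deprecation of that argument in recent `huggingface_hub` releases, where downloads resume automatically and passing the flag only emits a warning. The remaining kwargs feed a call roughly like this (repo and paths are hypothetical placeholders):

```python
from huggingface_hub import snapshot_download

path = snapshot_download(
    repo_id="namespace/dataset-name",  # hypothetical repo
    repo_type="dataset",
    token=None,
    max_workers=4,
    local_dir="downloads/namespace/dataset-name",
)
```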
@@ -333,17 +375,17 @@ def _download_from_ms(repo_id: str, target_dir: Path, token: str | None, max_wor
     )
 
     def _run() -> Path:
-        LOGGER.info("ModelScope: attempting to download dataset_id=%s", repo_id)
+        LOGGER.debug("ModelScope: attempting to download dataset_id=%s", repo_id)
         LOGGER.debug(" local_dir=%s", target_dir)
 
         try:
             if token:
-                LOGGER.info("Logging in to ModelScope with provided token")
+                LOGGER.debug("Logging in to ModelScope with provided token")
                 HubApi().login(token)
 
             # Use dataset_snapshot_download for downloading dataset files
             # This downloads all raw files from the dataset repository
-            LOGGER.info("Downloading dataset using dataset_snapshot_download...")
+            LOGGER.debug("Downloading dataset using dataset_snapshot_download...")
             download_kwargs = {
                 "dataset_id": repo_id,
                 "local_dir": str(target_dir),
@@ -356,7 +398,7 @@ def _run() -> Path:
             path = dataset_snapshot_download(**download_kwargs)
 
             # The dataset files are now downloaded to target_dir (or default cache)
-            LOGGER.info("Dataset downloaded successfully to %s", path)
+            LOGGER.debug("Dataset downloaded successfully to %s", path)
             return Path(path)
 
         except Exception as exc:
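
Stripped of logging and retries, the ModelScope path reduces to the call sequence below. Argument names (`dataset_id`, `local_dir`) are as they appear in the hunk; the import paths are an assumption, so verify them against your installed modelscope version:

```python
from modelscope.hub.api import HubApi                        # import path assumed
from modelscope.hub.snapshot_download import dataset_snapshot_download  # assumed

HubApi().login("<modelscope-token>")  # only needed for gated/private datasets
local_path = dataset_snapshot_download(
    dataset_id="namespace/dataset-name",        # hypothetical dataset id
    local_dir="downloads/namespace/dataset-name",
)
```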
@@ -431,7 +473,7 @@ def download_dataset(
     # will use the same consistent path: output_dir/namespace/dataset_name/
     dataset_path: Path = output_dir / namespace / dataset_name
 
-    LOGGER.info("Downloading repo_id: %s from %s", repo_id, hub)
+    LOGGER.debug("Downloading repo_id: %s from %s", repo_id, hub)
     LOGGER.debug("Target path: %s", dataset_path)
     LOGGER.debug("Token provided: %s", bool(token))
 
@@ -495,9 +537,9 @@ def download_datasets(
     )
 
     if failures:
-        LOGGER.error("Failed datasets: %s", ", ".join(failures))
+        print(f"Failed datasets: {', '.join(failures)}")
     else:
-        LOGGER.info("All datasets downloaded successfully.")
+        print("All datasets downloaded successfully.")
 
     return failures
 
@@ -509,6 +551,10 @@ def main(argv: Sequence[str] | None = None) -> int:
     parser = build_parser()
     args = parser.parse_args(argv)
 
+    # Print log file location at the start
+    print(f"Detailed logs are being written to: {_log_file.absolute()}")
+    print()
+
     dataset_names = _read_dataset_names(args.ds_lists)
 
     if not dataset_names:
@@ -521,13 +567,13 @@ def main(argv: Sequence[str] | None = None) -> int:
     output_dir = _resolve_output_dir(args.output_dir)
 
     if args.dry_run:
-        LOGGER.info("Dry run")
-        LOGGER.info(" Hub: %s", args.hub)
-        LOGGER.info(" Namespace: %s", args.namespace or DEFAULT_NAMESPACE)
-        LOGGER.info(" Output: %s", output_dir)
-        LOGGER.info(" Datasets (%d): %s", len(dataset_names), ", ".join(dataset_names))
-        LOGGER.info(" Max retries: %d", args.max_retry_time)
-        LOGGER.info(" Token: %s", " provided" if args.token else " not provided")
+        print("Dry run")
+        print(f" Hub: {args.hub}")
+        print(f" Namespace: {args.namespace or DEFAULT_NAMESPACE}")
+        print(f" Output: {output_dir}")
+        print(f" Datasets ({len(dataset_names)}): {', '.join(dataset_names)}")
+        print(f" Max retries: {args.max_retry_time}")
+        print(f" Token: {' provided' if args.token else ' not provided'}")
         return 0
 
     # Perform gate check before actual download (HuggingFace only)
@@ -541,7 +587,7 @@ def main(argv: Sequence[str] | None = None) -> int:
             token=resolved_token,
             max_workers=max(1, args.max_workers),
         )
-        LOGGER.error("Gate check completed successfully. Proceeding with dataset downloads...")
+        LOGGER.debug("Gate check completed successfully. Proceeding with dataset downloads...")
     except RuntimeError as exc:
         # Gate dataset failure – abort cleanly before downloading other datasets
         LOGGER.error("Download aborted due to gate check failure: %s", exc)