11"""Trainingdata command for downloading training datasets."""
22
3- import logging
43import os
54import subprocess
65import urllib .request
@@ -27,17 +26,17 @@ def download_file(url: str, output_path: str, description: str = "Downloading"):
2726 console = console ,
2827 ) as progress :
2928 task = progress .add_task (f"{ description } { os .path .basename (output_path )} " )
30-
29+
3130 def reporthook (block_num , block_size , total_size ):
3231 if total_size > 0 :
3332 percent = min (100 , (block_num * block_size * 100 ) / total_size )
3433 progress .update (task , completed = percent )
35-
34+
3635 urllib .request .urlretrieve (url , output_path , reporthook )
3736 progress .update (task , completed = 100 )
38-
37+
3938 logger .info (f"Downloaded: { output_path } " )
40-
39+
4140 except Exception as e :
4241 raise VotuDerepError (f"Failed to download { url } : { e } " )
4342
@@ -52,19 +51,14 @@ def run_curl(url: str, output_path: str, description: str = "Downloading"):
5251 console = console ,
5352 ) as progress :
5453 task = progress .add_task (f"{ description } { os .path .basename (output_path )} " )
55-
54+
5655 cmd = ["curl" , "-L" , url , "-o" , output_path ]
57- process = subprocess .run (
58- cmd ,
59- capture_output = True ,
60- text = True ,
61- check = True
62- )
63-
56+ subprocess .run (cmd , capture_output = True , text = True , check = True )
57+
6458 progress .update (task , completed = 100 )
65-
59+
6660 logger .info (f"Downloaded: { output_path } " )
67-
61+
6862 except subprocess .CalledProcessError as e :
6963 raise VotuDerepError (f"Failed to download { url } : { e .stderr } " )
7064 except FileNotFoundError :
@@ -83,61 +77,61 @@ def run_curl(url: str, output_path: str, description: str = "Downloading"):
def trainingdata(ctx, outdir: str):
    """
    Download training dataset from the internet.

    Downloads viral assembly and sequencing reads for training purposes.
    The assembly is fetched with ``download_file`` and saved as
    ``<outdir>/human_gut_assembly.fa.gz``; the paired read files of three
    ENA runs are fetched with ``run_curl`` and saved under ``<outdir>/reads/``
    as ``<run>_R1.fastq.gz`` / ``<run>_R2.fastq.gz``.

    Args:
        ctx: Context object; ``ctx.obj`` is read as a mapping that may hold
            a ``"verbose"`` flag (defaults to ``False`` when absent).
        outdir: Directory that receives the downloaded files. It is created
            (including the ``reads`` subdirectory) if it does not exist.

    Raises:
        VotuDerepError: If directory creation or any download fails. Errors
            already of this type propagate unchanged; any other exception is
            wrapped and chained as the cause.
    """
    verbose = ctx.obj.get("verbose", False)

    if verbose:
        console.print(f"[blue]Output directory:[/blue] {outdir}")

    # Create output directory structure
    outdir_path = Path(outdir)
    reads_dir = outdir_path / "reads"

    try:
        reads_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Created directory structure: {reads_dir}")

        console.print("[bold green]Downloading training dataset...[/bold green]")

        # Download assembly
        assembly_url = "https://zenodo.org/api/records/10650983/files/illumina_sample_pool_megahit.fa.gz/content"
        assembly_path = outdir_path / "human_gut_assembly.fa.gz"

        console.print("\n[blue]Downloading assembly...[/blue]")
        download_file(assembly_url, str(assembly_path), "Downloading assembly")

        # Download reads
        console.print("\n[blue]Downloading sequencing reads...[/blue]")
        ebi_base = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq"

        # (run accession, path of the run directory below ebi_base)
        reads_to_download = [
            ("ERR6797445", "ERR679/005/ERR6797445"),
            ("ERR6797444", "ERR679/004/ERR6797444"),
            ("ERR6797443", "ERR679/003/ERR6797443"),
        ]

        for sample_id, path_suffix in reads_to_download:
            # Remote files are named <run>_1 / <run>_2; local copies use _R1 / _R2.
            for read_num in ("1", "2"):
                url = f"{ebi_base}/{path_suffix}/{sample_id}_{read_num}.fastq.gz"
                output_file = reads_dir / f"{sample_id}_R{read_num}.fastq.gz"

                run_curl(url, str(output_file), f"Downloading {sample_id}_R{read_num}")

        console.print("\n[bold green]✓ Training dataset downloaded successfully![/bold green]")
        console.print(f"[blue]Files saved to:[/blue] {outdir_path.absolute()}")

        # Summary of downloaded files
        if verbose:
            console.print("\n[bold]Downloaded files:[/bold]")
            for file_path in sorted(outdir_path.rglob("*")):
                if file_path.is_file():
                    size = file_path.stat().st_size / (1024 * 1024)  # MB
                    console.print(f" • {file_path.relative_to(outdir_path)} ({size:.1f} MB)")

    except VotuDerepError:
        # Download helpers already raise the project error; pass it through as-is.
        raise
    except Exception as e:
        # Chain the original exception so its traceback is kept as the cause.
        raise VotuDerepError(f"Failed to download training dataset: {e}") from e
0 commit comments