@@ -68,7 +68,7 @@ async def main():
6868 loop .run_until_complete (main ())
6969 loop .close ()
7070
71- def downloadGTDBGenomes (taxa_name , gtdb_release , outdir , genome_listing_file , logObject , sanity_check = False , automated_download = False ):
71+ def downloadGTDBGenomes (taxa_name , gtdb_release , outdir , genome_listing_file , logObject , sanity_check = False , automated_download = False , gunzip = False , threads = 1 ):
7272 """
7373 Download GTDB genomes from NCBI Genbank using ncbi-genome-download.
7474 **********************************************************
@@ -157,6 +157,7 @@ def downloadGTDBGenomes(taxa_name, gtdb_release, outdir, genome_listing_file, lo
157157
158158 gf_listing_handle = open (genome_listing_file , 'a+' )
159159 final_genome_count = 0
160+ gunzip_cmds = []
160161 for f in os .listdir (genomes_directory ):
161162 genome_file = genomes_directory + f
162163 if not os .path .isfile (genome_file ): continue
@@ -178,9 +179,27 @@ def downloadGTDBGenomes(taxa_name, gtdb_release, outdir, genome_listing_file, lo
178179 polished_filename = url_file_to_polished_name [f ]
179180 renamed_gfile = genomes_directory + polished_filename
180181 os .rename (genome_file , renamed_gfile )
182+ if gunzip :
183+ gunzip_cmds .append (['gunzip' , renamed_gfile ])
184+ renamed_gfile = renamed_gfile [:- 3 ]
181185 gf_listing_handle .write (renamed_gfile + '\n ' )
182186 gf_listing_handle .close ()
183-
187+
188+ if len (gunzip_cmds ) > 0 :
189+ try :
190+ msg = "Uncompressing %d genomic assemblies for gene calling!" % genome_count
191+ sys .stdout .write (msg + '\n ' )
192+ logObject .info (msg )
193+ p = multiprocessing .Pool (threads )
194+ for _ in tqdm .tqdm (p .imap_unordered (multiProcess , gunzip_cmds ), total = len (gunzip_cmds )):
195+ pass
196+ p .close ()
197+ except Exception as e :
198+ msg = "An error occurred during multiprocessing: %s" % str (e )
199+ sys .stderr .write (msg + '\n ' )
200+ logObject .info (msg )
201+
202+
184203 msg = 'Was able to download %d of %d genomes belonging to taxa "%s" in GTDB %s.' % (final_genome_count , genome_count , taxa_name , gtdb_release )
185204 sys .stdout .write (msg + '\n ' )
186205 logObject .info (msg )
@@ -288,7 +307,7 @@ def processInputProteomes(proteomes, combined_proteome_faa, genome_to_path, logO
288307 combined_proteome_handle .close ()
289308 return proteome_name_to_path
290309
291- def processInputGenomes (genomes , genome_listing_file , logObject , sanity_check = False ):
310+ def processInputGenomes (genomes , genome_listing_file , logObject , sanity_check = False , allow_gzipped = True ):
292311 gf_listing_handle = open (genome_listing_file , 'a+' )
293312 for g in genomes :
294313 g_path = os .path .abspath (g )
@@ -298,7 +317,11 @@ def processInputGenomes(genomes, genome_listing_file, logObject, sanity_check=Fa
298317 suffix = genome_file .split ('/' )[- 1 ].split ('.' )[- 1 ].lower ()
299318 if genome_file .endswith ('.gz' ):
300319 suffix = genome_file .split ('/' )[- 1 ][:- 3 ].split ('.' )[- 1 ].lower ()
301-
320+ if not allow_gzipped :
321+ msg = 'Warning: genome %s is gzipped. Skipping ...' % genome_file
322+ sys .stderr .write (msg + '\n ' )
323+ logObject .warning (msg )
324+ continue
302325 try :
303326 assert (suffix in ACCEPTED_FASTA_SUFFICES )
304327 except :
@@ -323,7 +346,11 @@ def processInputGenomes(genomes, genome_listing_file, logObject, sanity_check=Fa
323346 suffix = genome_file .split ('/' )[- 1 ].split ('.' )[- 1 ].lower ()
324347 if genome_file .endswith ('.gz' ):
325348 suffix = genome_file .split ('/' )[- 1 ][:- 3 ].split ('.' )[- 1 ].lower ()
326-
349+ if not allow_gzipped :
350+ msg = 'Warning: genome %s is gzipped. Skipping ...' % genome_file
351+ sys .stderr .write (msg + '\n ' )
352+ logObject .warning (msg )
353+ continue
327354 try :
328355 assert (suffix in ACCEPTED_FASTA_SUFFICES )
329356 except :
@@ -393,7 +420,6 @@ def determineN50(genome_listing_file, outdir, logObject, threads=1):
393420 p = multiprocessing .Pool (threads )
394421 for _ in tqdm .tqdm (p .imap_unordered (compute_n50 , n50_inputs ), total = len (n50_inputs )):
395422 pass
396- p .map (compute_n50 , n50_inputs )
397423 p .close ()
398424 except Exception as e :
399425 msg = "An error occurred during multiprocessing: %s" % str (e )
0 commit comments